go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/server/server.go (about) 1 // Copyright 2019 The LUCI Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 // Package server implements an environment for running LUCI servers. 16 // 17 // It interprets command line flags and initializes the serving environment with 18 // the following core services: 19 // 20 // - go.chromium.org/luci/common/logging: logging via Google Cloud Logging. 21 // - go.opentelemetry.io/otel/trace: OpenTelemetry tracing with export to 22 // Google Cloud Trace. 23 // - go.chromium.org/luci/server/tsmon: monitoring metrics via ProdX. 24 // - go.chromium.org/luci/server/auth: sending and receiving RPCs 25 // authenticated with Google OAuth2 or OpenID tokens. Support for 26 // authorization via LUCI groups and LUCI realms. 27 // - go.chromium.org/luci/server/caching: in-process caching. 28 // - go.chromium.org/luci/server/warmup: allows other server components to 29 // register warmup callbacks that run before the server starts handling 30 // requests. 31 // - go.chromium.org/luci/server/experiments: simple feature flags support. 32 // - go.chromium.org/luci/grpc/prpc: pRPC server and RPC Explorer UI. 33 // - Error reporting via Google Cloud Error Reporting. 34 // - Continuous profiling via Google Cloud Profiler. 35 // 36 // Other functionality is optional and provided by modules (objects implementing 37 // module.Module interface). They should be passed to the server when it starts 38 // (see the example below). Modules usually expose their configuration via 39 // command line flags, and provide functionality by injecting state into 40 // the server's global context.Context or by exposing gRPC endpoints. 41 // 42 // Usage example: 43 // 44 // import ( 45 // ... 46 // 47 // "go.chromium.org/luci/server" 48 // "go.chromium.org/luci/server/gaeemulation" 49 // "go.chromium.org/luci/server/module" 50 // "go.chromium.org/luci/server/redisconn" 51 // ) 52 // 53 // func main() { 54 // modules := []module.Module{ 55 // gaeemulation.NewModuleFromFlags(), 56 // redisconn.NewModuleFromFlags(), 57 // } 58 // server.Main(nil, modules, func(srv *server.Server) error { 59 // // Initialize global state, change root context (if necessary). 60 // if err := initializeGlobalStuff(srv.Context); err != nil { 61 // return err 62 // } 63 // srv.Context = injectGlobalStuff(srv.Context) 64 // 65 // // Install regular HTTP routes. 66 // srv.Routes.GET("/", nil, func(c *router.Context) { 67 // // ... 68 // }) 69 // 70 // // Install gRPC services. 
//			servicepb.RegisterSomeServer(srv, &SomeServer{})
//			return nil
//		})
//	}
//
// More examples can be found in the code search: https://source.chromium.org/search?q=%22server.Main%28nil%2C%20modules%2C%22
//
// # Known modules
//
// The following modules (in alphabetical order) are part of the LUCI
// repository and can be used in any server binary:
//
//   - go.chromium.org/luci/config/server/cfgmodule: provides LUCI Config
//     client, exposes config validation endpoints used by LUCI Config service.
//   - go.chromium.org/luci/server/analytics: generates Google Analytics js
//     snippets for inclusion in a service's web pages.
//   - go.chromium.org/luci/server/bqlog: implements best effort low-overhead
//     structured logging to BigQuery suitable for debug data like access logs.
//   - go.chromium.org/luci/server/cron: allows registering Cloud Scheduler (aka
//     Appengine cron.yaml) handlers, with proper authentication and monitoring
//     metrics.
//   - go.chromium.org/luci/server/encryptedcookies: implements an
//     authentication scheme for HTTP routes based on encrypted cookies and user
//     sessions in some session store.
//   - go.chromium.org/luci/server/dsmapper: provides a way to apply some
//     function to all datastore entities of some particular kind, in parallel,
//     distributing work via Cloud Tasks.
//   - go.chromium.org/luci/server/gaeemulation: implements
//     go.chromium.org/luci/gae Datastore interface via Google Cloud Datastore
//     API. Named so because it enables migration of GAEv1 apps to GAEv2
//     without touching datastore-related code.
//   - go.chromium.org/luci/server/gerritauth: implements authentication using
//     Gerrit JWTs. Useful if a service is used by a Gerrit frontend plugin.
//   - go.chromium.org/luci/server/limiter: a simple load shedding mechanism
//     that puts a limit on the number of concurrent gRPC requests the server
//     is handling.
//   - go.chromium.org/luci/server/mailer: sending simple emails.
//   - go.chromium.org/luci/server/redisconn: a Redis client. Also enables Redis
//     as a caching backend for go.chromium.org/luci/server/caching and for
//     go.chromium.org/luci/gae/filter/dscache.
//   - go.chromium.org/luci/server/secrets: enables generation and validation of
//     HMAC-tagged tokens via go.chromium.org/luci/server/tokens.
//   - go.chromium.org/luci/server/span: a Cloud Spanner client. Wraps Spanner
//     API a bit to improve interoperability with other modules (in particular
//     the TQ module).
//   - go.chromium.org/luci/server/tq: implements a task queue mechanism on top
//     of Cloud Tasks and Cloud PubSub. Also implements transactional task
//     enqueuing when submitting tasks in a Cloud Datastore or a Cloud Spanner
//     transaction.
//
// Most of them need to be configured via corresponding CLI flags to be useful.
// See implementation of individual modules for details.
//
// An up-to-date list of all known module implementations can be found here:
// https://source.chromium.org/search?q=%22NewModuleFromFlags()%20module.Module%22
//
// # gRPC services
//
// The server implements the grpc.ServiceRegistrar interface, which means gRPC
// service implementations can be registered directly in it.
The registered services 131 // will be exposed via gRPC protocol over the gRPC port (if the gRPC serving 132 // port is configured in options) and via pRPC protocol over the main HTTP port 133 // (if the main HTTP serving port is configured in options). The server is also 134 // pre-configured with a set of gRPC interceptors that collect performance 135 // metrics, catch panics and authenticate requests. More interceptors can be 136 // added via RegisterUnaryServerInterceptors. 137 // 138 // # Security considerations 139 // 140 // The expected deployment environments are Kubernetes, Google App Engine and 141 // Google Cloud Run. In all cases the server is expected to be behind a load 142 // balancer or proxy (or a series of load balancers and proxies) that terminate 143 // TLS and set `X-Forwarded-For` and `X-Forwarded-Proto` headers. In particular 144 // `X-Forwarded-For` header should look like: 145 // 146 // [<untrusted part>,]<IP that connected to the LB>,<unimportant>[,<more>]. 147 // 148 // Where `<untrusted part>` may be present if the original request from the 149 // Internet comes with `X-Forwarded-For` header. The IP specified there is not 150 // trusted, but the server assumes the load balancer at least sanitizes the 151 // format of this field. 152 // 153 // `<IP that connected to the LB>` is the end-client IP that can be used by the 154 // server for logs and for IP-allowlist checks. 155 // 156 // `<unimportant>` is a "global forwarding rule external IP" for GKE or 157 // the constant "169.254.1.1" for GAE and Cloud Run. It is unused. See 158 // https://cloud.google.com/load-balancing/docs/https for more info. 159 // 160 // `<more>` may be present if the request was proxied through more layers of 161 // load balancers while already inside the cluster. The server currently assumes 162 // this is not happening (i.e. `<more>` is absent, or, in other words, the 163 // client IP is the second to last in the `X-Forwarded-For` list). If you need 164 // to recognize more layers of load balancing, please file a feature request to 165 // add a CLI flag specifying how many layers of load balancers to skip to get to 166 // the original IP. 
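//
// For illustration (the addresses below are made-up documentation IPs), with a
// header like
//
//	X-Forwarded-For: 203.0.113.7, 198.51.100.2, 169.254.1.1
//
// the server takes 198.51.100.2 (the second to last entry) as the end-client
// IP: 203.0.113.7 is the untrusted client-supplied part and 169.254.1.1 is the
// unimportant trailing entry.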
167 package server 168 169 import ( 170 "context" 171 cryptorand "crypto/rand" 172 "crypto/sha256" 173 "encoding/binary" 174 "encoding/hex" 175 "flag" 176 "fmt" 177 "math/rand" 178 "net" 179 "net/http" 180 "net/http/pprof" 181 "os" 182 "runtime" 183 "strings" 184 "sync" 185 "sync/atomic" 186 "time" 187 188 gcemetadata "cloud.google.com/go/compute/metadata" 189 "cloud.google.com/go/errorreporting" 190 credentials "cloud.google.com/go/iam/credentials/apiv1" 191 "cloud.google.com/go/iam/credentials/apiv1/credentialspb" 192 "cloud.google.com/go/profiler" 193 texporter "github.com/GoogleCloudPlatform/opentelemetry-operations-go/exporter/trace" 194 gcppropagator "github.com/GoogleCloudPlatform/opentelemetry-operations-go/propagator" 195 "go.opentelemetry.io/contrib/detectors/gcp" 196 "go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc" 197 "go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp" 198 "go.opentelemetry.io/otel" 199 "go.opentelemetry.io/otel/propagation" 200 "go.opentelemetry.io/otel/sdk/resource" 201 "go.opentelemetry.io/otel/sdk/trace" 202 semconv "go.opentelemetry.io/otel/semconv/v1.17.0" 203 oteltrace "go.opentelemetry.io/otel/trace" 204 "golang.org/x/oauth2" 205 "google.golang.org/api/option" 206 codepb "google.golang.org/genproto/googleapis/rpc/code" 207 "google.golang.org/grpc" 208 "google.golang.org/grpc/status" 209 210 clientauth "go.chromium.org/luci/auth" 211 "go.chromium.org/luci/common/clock" 212 "go.chromium.org/luci/common/errors" 213 luciflag "go.chromium.org/luci/common/flag" 214 "go.chromium.org/luci/common/flag/stringlistflag" 215 "go.chromium.org/luci/common/iotools" 216 "go.chromium.org/luci/common/logging" 217 "go.chromium.org/luci/common/logging/gologger" 218 "go.chromium.org/luci/common/logging/sdlogger" 219 "go.chromium.org/luci/common/system/signals" 220 tsmoncommon "go.chromium.org/luci/common/tsmon" 221 "go.chromium.org/luci/common/tsmon/metric" 222 "go.chromium.org/luci/common/tsmon/monitor" 223 "go.chromium.org/luci/common/tsmon/target" 224 "go.chromium.org/luci/grpc/discovery" 225 "go.chromium.org/luci/grpc/grpcmon" 226 "go.chromium.org/luci/grpc/grpcutil" 227 "go.chromium.org/luci/grpc/prpc" 228 "go.chromium.org/luci/hardcoded/chromeinfra" // should be used ONLY in Main() 229 "go.chromium.org/luci/web/rpcexplorer" 230 231 "go.chromium.org/luci/server/auth" 232 "go.chromium.org/luci/server/auth/authdb" 233 "go.chromium.org/luci/server/auth/authdb/dump" 234 "go.chromium.org/luci/server/auth/openid" 235 "go.chromium.org/luci/server/auth/signing" 236 "go.chromium.org/luci/server/caching" 237 "go.chromium.org/luci/server/experiments" 238 "go.chromium.org/luci/server/internal" 239 "go.chromium.org/luci/server/internal/gae" 240 "go.chromium.org/luci/server/middleware" 241 "go.chromium.org/luci/server/module" 242 "go.chromium.org/luci/server/portal" 243 "go.chromium.org/luci/server/router" 244 "go.chromium.org/luci/server/secrets" 245 "go.chromium.org/luci/server/tsmon" 246 "go.chromium.org/luci/server/warmup" 247 ) 248 249 const ( 250 // Path of the health check endpoint. 251 healthEndpoint = "/healthz" 252 253 // Log a warning if health check is slower than this. 
	healthTimeLogThreshold    = 50 * time.Millisecond
	defaultTsMonFlushInterval = 60 * time.Second
	defaultTsMonFlushTimeout  = 15 * time.Second
)

var (
	versionMetric = metric.NewString(
		"server/version",
		"Version of the running container image (taken from -container-image-id).",
		nil)
)

// cloudRegionFromGAERegion maps GAE region codes (e.g. `s`) to corresponding
// cloud regions (e.g. `us-central1`), which may be defined as regions where GAE
// creates resources associated with the app, such as Task Queues or Flex VMs.
//
// Sadly this mapping is not documented, thus the below map is incomplete. Feel
// free to modify it if you deployed to some new GAE region.
//
// This mapping is unused if the `-cloud-region` flag is passed explicitly.
var cloudRegionFromGAERegion = map[string]string{
	"e": "europe-west1",
	"g": "europe-west2",
	"h": "europe-west3",
	"m": "us-west2",
	"p": "us-east1",
	"s": "us-central1",
}

// Context key of *incomingRequest{...}, see httpRoot(...) and grpcRoot(...).
var incomingRequestKey = "go.chromium.org/luci/server.incomingRequest"

// Main initializes the server and runs its serving loop until SIGTERM.
//
// Registers all options in the default flag set and uses `flag.Parse` to parse
// them. If 'opts' is nil, the default options will be used. Only flags are
// allowed in the command line (no positional arguments).
//
// Additionally recognizes GAE_* and K_* env vars as an indicator that the
// server is running in the corresponding serverless runtime. This slightly
// tweaks its behavior to match what these runtimes expect from servers.
//
// On errors, logs them and aborts the process with a non-zero exit code.
func Main(opts *Options, mods []module.Module, init func(srv *Server) error) {
	// Prepopulate defaults for flags based on the runtime environment.
	opts, err := OptionsFromEnv(opts)
	if err != nil {
		fmt.Fprintf(os.Stderr, "When constructing options: %s\n", err)
		os.Exit(3)
	}

	// Register and parse server flags.
	opts.Register(flag.CommandLine)
	flag.Parse()
	if args := flag.Args(); len(args) > 0 {
		fmt.Fprintf(os.Stderr, "got unexpected positional command line arguments: %v\n", args)
		os.Exit(3)
	}

	// Construct the server and run its serving loop.
	srv, err := New(context.Background(), *opts, mods)
	if err != nil {
		srv.Fatal(err)
	}
	if init != nil {
		if err = init(srv); err != nil {
			srv.Fatal(err)
		}
	}
	if err = srv.Serve(); err != nil {
		srv.Fatal(err)
	}
}

// Options are used to configure the server.
//
// Most of them are exposed as command line flags (see Register implementation).
// Some (specific to serverless runtimes) are only settable through code or are
// derived from the environment.
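//
// As an illustrative sketch (the field values here are made up), options can
// also be populated in code rather than via flags when constructing the server
// directly:
//
//	opts := server.Options{
//		HTTPAddr:     "localhost:8800",
//		CloudProject: "example-project",
//	}
//	srv, err := server.New(ctx, opts, nil)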
type Options struct {
	Prod       bool              // set when running in production (not on a dev workstation)
	Serverless module.Serverless // set when running in a serverless environment, implies Prod
	Hostname   string            // used for logging and metric fields, default is os.Hostname

	HTTPAddr  string // address to bind the main listening socket to
	GRPCAddr  string // address to bind the gRPC listening socket to
	AdminAddr string // address to bind the admin socket to, ignored on GAE and Cloud Run
	AllowH2C  bool   // if true, allow HTTP/2 Cleartext traffic on non-gRPC HTTP ports

	DefaultRequestTimeout  time.Duration // how long non-internal HTTP handlers are allowed to run, 1 min by default
	InternalRequestTimeout time.Duration // how long "/internal/*" HTTP handlers are allowed to run, 10 min by default
	ShutdownDelay          time.Duration // how long to wait after SIGTERM before shutting down

	ClientAuth       clientauth.Options // base settings for client auth options
	TokenCacheDir    string             // where to cache auth tokens (optional)
	AuthDBProvider   auth.DBProvider    // source of the AuthDB: if set all Auth* options below are ignored
	AuthDBPath       string             // if set, load AuthDB from a file
	AuthServiceHost  string             // hostname of an Auth Service to use
	AuthDBDump       string             // Google Storage path to fetch AuthDB dumps from
	AuthDBSigner     string             // service account that signs AuthDB dumps
	FrontendClientID string             // OAuth2 ClientID for frontend (e.g. user sign in)

	OpenIDRPCAuthEnable   bool                // if true, use OIDC identity tokens for RPC authentication
	OpenIDRPCAuthAudience stringlistflag.Flag // additional allowed OIDC token audiences

	CloudProject string // name of the hosting Google Cloud Project
	CloudRegion  string // name of the hosting Google Cloud region

	TraceSampling string // what portion of traces to upload to Cloud Trace (ignored on GAE and Cloud Run)

	TsMonAccount       string        // service account to flush metrics as
	TsMonServiceName   string        // service name of tsmon target
	TsMonJobName       string        // job name of tsmon target
	TsMonFlushInterval time.Duration // how often to flush metrics
	TsMonFlushTimeout  time.Duration // timeout for flushing

	ProfilingProbability float64 // a [0; 1.0] float with the probability of enabling Cloud Profiler in the process
	ProfilingServiceID   string  // service name to associate with profiles in Cloud Profiler

	ContainerImageID string // ID of the container image with this binary, for logs (optional)

	EnableExperiments []string // names of go.chromium.org/luci/server/experiments to enable

	CloudErrorReporting bool // set to true to enable Cloud Error Reporting

	testSeed           int64                   // used to seed rng in tests
	testStdout         sdlogger.LogEntryWriter // mocks stdout in tests
	testStderr         sdlogger.LogEntryWriter // mocks stderr in tests
	testListeners      map[string]net.Listener // addr => net.Listener, for tests
	testDisableTracing bool                    // don't install a tracing backend
}

// OptionsFromEnv prepopulates options based on the runtime environment.
//
// It detects if the process is running on GAE or Cloud Run and adjusts options
// accordingly. See FromGAEEnv and FromCloudRunEnv for exact details of how it
// happens.
//
// Either mutates the given `opts`, returning it in the end, or (if `opts` is
// nil) creates new Options.
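//
// A minimal sketch of how Main uses it (error handling omitted):
//
//	opts, err := server.OptionsFromEnv(&server.Options{})
//	opts.Register(flag.CommandLine)
//	flag.Parse()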
394 func OptionsFromEnv(opts *Options) (*Options, error) { 395 if opts == nil { 396 opts = &Options{} 397 } 398 399 // Populate unset ClientAuth fields with hardcoded defaults. 400 authDefaults := chromeinfra.DefaultAuthOptions() 401 if opts.ClientAuth.ClientID == "" { 402 opts.ClientAuth.ClientID = authDefaults.ClientID 403 opts.ClientAuth.ClientSecret = authDefaults.ClientSecret 404 } 405 if opts.ClientAuth.TokenServerHost == "" { 406 opts.ClientAuth.TokenServerHost = authDefaults.TokenServerHost 407 } 408 if opts.ClientAuth.SecretsDir == "" { 409 opts.ClientAuth.SecretsDir = authDefaults.SecretsDir 410 } 411 412 // Use CloudOAuthScopes by default when using UserCredentialsMethod auth mode. 413 // This is ignored when running in the cloud (the server uses the ambient 414 // credentials provided by the environment). 415 if len(opts.ClientAuth.Scopes) == 0 { 416 opts.ClientAuth.Scopes = auth.CloudOAuthScopes 417 } 418 419 // Prepopulate defaults for flags based on the runtime environment. 420 opts.FromGAEEnv() 421 if err := opts.FromCloudRunEnv(); err != nil { 422 return nil, errors.Annotate(err, "failed to probe Cloud Run environment").Err() 423 } 424 return opts, nil 425 } 426 427 // Register registers the command line flags. 428 func (o *Options) Register(f *flag.FlagSet) { 429 if o.HTTPAddr == "" { 430 o.HTTPAddr = "localhost:8800" 431 } 432 if o.GRPCAddr == "" { 433 o.GRPCAddr = "-" // disabled by default 434 } 435 if o.AdminAddr == "" { 436 o.AdminAddr = "localhost:8900" 437 } 438 if o.DefaultRequestTimeout == 0 { 439 o.DefaultRequestTimeout = time.Minute 440 } 441 if o.InternalRequestTimeout == 0 { 442 o.InternalRequestTimeout = 10 * time.Minute 443 } 444 if o.ShutdownDelay == 0 { 445 o.ShutdownDelay = 15 * time.Second 446 } 447 if o.TsMonFlushInterval == 0 { 448 o.TsMonFlushInterval = defaultTsMonFlushInterval 449 } 450 if o.TsMonFlushTimeout == 0 { 451 o.TsMonFlushTimeout = defaultTsMonFlushTimeout 452 } 453 if o.ProfilingProbability == 0 { 454 o.ProfilingProbability = 1.0 455 } else if o.ProfilingProbability < 0 { 456 o.ProfilingProbability = 0 457 } 458 f.BoolVar(&o.Prod, "prod", o.Prod, "Switch the server into production mode") 459 f.StringVar(&o.HTTPAddr, "http-addr", o.HTTPAddr, "Address to bind the main listening socket to or '-' to disable") 460 f.StringVar(&o.GRPCAddr, "grpc-addr", o.GRPCAddr, "Address to bind the gRPC listening socket to or '-' to disable") 461 f.StringVar(&o.AdminAddr, "admin-addr", o.AdminAddr, "Address to bind the admin socket to or '-' to disable") 462 f.BoolVar(&o.AllowH2C, "allow-h2c", o.AllowH2C, "If set, allow HTTP/2 Cleartext traffic on non-gRPC HTTP ports (in addition to HTTP/1 traffic). 
The gRPC port always allows it, it is essential for gRPC") 463 f.DurationVar(&o.DefaultRequestTimeout, "default-request-timeout", o.DefaultRequestTimeout, "How long incoming HTTP requests are allowed to run before being canceled (or 0 for infinity)") 464 f.DurationVar(&o.InternalRequestTimeout, "internal-request-timeout", o.InternalRequestTimeout, "How long incoming /internal/* HTTP requests are allowed to run before being canceled (or 0 for infinity)") 465 f.DurationVar(&o.ShutdownDelay, "shutdown-delay", o.ShutdownDelay, "How long to wait after SIGTERM before shutting down") 466 f.StringVar( 467 &o.ClientAuth.ServiceAccountJSONPath, 468 "service-account-json", 469 o.ClientAuth.ServiceAccountJSONPath, 470 "Path to a JSON file with service account private key", 471 ) 472 f.StringVar( 473 &o.ClientAuth.ActAsServiceAccount, 474 "act-as", 475 o.ClientAuth.ActAsServiceAccount, 476 "Act as this service account", 477 ) 478 f.StringVar( 479 &o.TokenCacheDir, 480 "token-cache-dir", 481 o.TokenCacheDir, 482 "Where to cache auth tokens (optional)", 483 ) 484 f.StringVar( 485 &o.AuthDBPath, 486 "auth-db-path", 487 o.AuthDBPath, 488 "If set, load AuthDB text proto from this file (incompatible with -auth-service-host)", 489 ) 490 f.StringVar( 491 &o.AuthServiceHost, 492 "auth-service-host", 493 o.AuthServiceHost, 494 "Hostname of an Auth Service to use (incompatible with -auth-db-path)", 495 ) 496 f.StringVar( 497 &o.AuthDBDump, 498 "auth-db-dump", 499 o.AuthDBDump, 500 "Google Storage path to fetch AuthDB dumps from. Default is gs://<auth-service-host>/auth-db", 501 ) 502 f.StringVar( 503 &o.AuthDBSigner, 504 "auth-db-signer", 505 o.AuthDBSigner, 506 "Service account that signs AuthDB dumps. Default is derived from -auth-service-host if it is *.appspot.com", 507 ) 508 f.StringVar( 509 &o.FrontendClientID, 510 "frontend-client-id", 511 o.FrontendClientID, 512 "OAuth2 clientID for use in frontend, e.g. for user sign in (optional)", 513 ) 514 f.BoolVar( 515 &o.OpenIDRPCAuthEnable, 516 "open-id-rpc-auth-enable", 517 o.OpenIDRPCAuthEnable, 518 "If set accept OpenID Connect ID tokens as per-RPC credentials", 519 ) 520 f.Var( 521 &o.OpenIDRPCAuthAudience, 522 "open-id-rpc-auth-audience", 523 "Additional accepted value of `aud` claim in OpenID tokens, can be repeated", 524 ) 525 f.StringVar( 526 &o.CloudProject, 527 "cloud-project", 528 o.CloudProject, 529 "Name of hosting Google Cloud Project (optional)", 530 ) 531 f.StringVar( 532 &o.CloudRegion, 533 "cloud-region", 534 o.CloudRegion, 535 "Name of hosting Google Cloud region, e.g. 'us-central1' (optional)", 536 ) 537 f.StringVar( 538 &o.TraceSampling, 539 "trace-sampling", 540 o.TraceSampling, 541 "What portion of traces to upload to Cloud Trace. Either a percent (i.e. '0.1%') or a QPS (i.e. '1qps'). Ignored on GAE and Cloud Run. Default is 0.1qps.", 542 ) 543 f.StringVar( 544 &o.TsMonAccount, 545 "ts-mon-account", 546 o.TsMonAccount, 547 "Collect and flush tsmon metrics using this account for auth (disables tsmon if not set)", 548 ) 549 f.StringVar( 550 &o.TsMonServiceName, 551 "ts-mon-service-name", 552 o.TsMonServiceName, 553 "Service name of tsmon target (disables tsmon if not set)", 554 ) 555 f.StringVar( 556 &o.TsMonJobName, 557 "ts-mon-job-name", 558 o.TsMonJobName, 559 "Job name of tsmon target (disables tsmon if not set)", 560 ) 561 f.DurationVar( 562 &o.TsMonFlushInterval, 563 "ts-mon-flush-interval", 564 o.TsMonFlushInterval, 565 fmt.Sprintf("How often to flush tsmon metrics. 
Defaults to %s if < 1s or unset", o.TsMonFlushInterval),
	)
	f.DurationVar(
		&o.TsMonFlushTimeout,
		"ts-mon-flush-timeout",
		o.TsMonFlushTimeout,
		fmt.Sprintf("Timeout for tsmon flush. Defaults to %s if < 1s or unset. Must be shorter than --ts-mon-flush-interval.", o.TsMonFlushTimeout),
	)
	f.Float64Var(
		&o.ProfilingProbability,
		"profiling-probability",
		o.ProfilingProbability,
		fmt.Sprintf("A float [0; 1.0] with the probability of enabling Cloud Profiler for the current process. Default is %f.", o.ProfilingProbability),
	)
	f.StringVar(
		&o.ProfilingServiceID,
		"profiling-service-id",
		o.ProfilingServiceID,
		"Service name to associate with profiles in Cloud Profiler. Defaults to the value of -ts-mon-job-name.",
	)
	f.StringVar(
		&o.ContainerImageID,
		"container-image-id",
		o.ContainerImageID,
		"ID of the container image with this binary, for logs (optional)",
	)
	f.BoolVar(
		&o.CloudErrorReporting,
		"cloud-error-reporting",
		o.CloudErrorReporting,
		"Enable Cloud Error Reporting",
	)

	// See go.chromium.org/luci/server/experiments.
	f.Var(luciflag.StringSlice(&o.EnableExperiments), "enable-experiment",
		`A name of the experiment to enable. May be repeated.`)
}

// FromGAEEnv uses the GAE_* env vars to configure the server for the GAE
// environment.
//
// Does nothing if GAE_VERSION is not set.
//
// Equivalent to passing the following flags:
//
//	-prod
//	-http-addr 0.0.0.0:${PORT}
//	-admin-addr -
//	-shutdown-delay 1s
//	-cloud-project ${GOOGLE_CLOUD_PROJECT}
//	-cloud-region <derived from the region code in GAE_APPLICATION>
//	-service-account-json :gce
//	-ts-mon-service-name ${GOOGLE_CLOUD_PROJECT}
//	-ts-mon-job-name ${GAE_SERVICE}
//
// Additionally the hostname and -container-image-id (used in metric and trace
// fields) are derived from available GAE_* env vars to be semantically similar
// to what they represent in the GKE environment.
//
// Note that a mapping between a region code in GAE_APPLICATION and
// the corresponding cloud region is not documented anywhere, so if you see
// warnings when your app starts up either update the code to recognize your
// region code or pass the '-cloud-region' argument explicitly in app.yaml.
//
// See https://cloud.google.com/appengine/docs/standard/go/runtime.
func (o *Options) FromGAEEnv() {
	if os.Getenv("GAE_VERSION") == "" {
		return
	}
	o.Serverless = module.GAE
	o.Prod = true
	o.Hostname = uniqueServerlessHostname(
		os.Getenv("GAE_SERVICE"),
		os.Getenv("GAE_DEPLOYMENT_ID"),
		os.Getenv("GAE_INSTANCE"),
	)
	o.HTTPAddr = fmt.Sprintf("0.0.0.0:%s", os.Getenv("PORT"))
	o.GRPCAddr = "-"
	o.AdminAddr = "-"
	o.ShutdownDelay = time.Second
	o.CloudProject = os.Getenv("GOOGLE_CLOUD_PROJECT")
	o.ClientAuth.ServiceAccountJSONPath = clientauth.GCEServiceAccount
	o.TsMonServiceName = os.Getenv("GOOGLE_CLOUD_PROJECT")
	o.TsMonJobName = os.Getenv("GAE_SERVICE")
	o.ContainerImageID = fmt.Sprintf("appengine/%s/%s:%s",
		os.Getenv("GOOGLE_CLOUD_PROJECT"),
		os.Getenv("GAE_SERVICE"),
		os.Getenv("GAE_VERSION"),
	)
	// Note: GAE_APPLICATION is missing on Flex.
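	// GAE_APPLICATION generally looks like "<region code>~<project id>"
	// (e.g. "s~example-project"), so the prefix before "~" is what gets looked
	// up in cloudRegionFromGAERegion above.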
	if appID := os.Getenv("GAE_APPLICATION"); appID != "" && o.CloudRegion == "" {
		o.CloudRegion = cloudRegionFromGAERegion[strings.Split(appID, "~")[0]]
	}
}

// FromCloudRunEnv recognizes the K_SERVICE environment variable and configures
// some options based on what it discovers in the environment.
//
// Does nothing if K_SERVICE is not set.
//
// Equivalent to passing the following flags:
//
//	-prod
//	-http-addr -
//	-grpc-addr -
//	-admin-addr -
//	-allow-h2c
//	-shutdown-delay 1s
//	-cloud-project <cloud project Cloud Run container is running in>
//	-cloud-region <cloud region Cloud Run container is running in>
//	-service-account-json :gce
//	-open-id-rpc-auth-enable
//	-ts-mon-service-name <cloud project Cloud Run container is running in>
//	-ts-mon-job-name ${K_SERVICE}
//
// Flags passed via the actual command line in the Cloud Run manifest override
// these prefilled defaults. In particular pass either `-http-addr` or
// `-grpc-addr` (or both) to enable the corresponding ports.
//
// Additionally the hostname (used in metric and trace fields) is derived from
// the environment to be semantically similar to what it looks like in the GKE
// environment.
func (o *Options) FromCloudRunEnv() error {
	if os.Getenv("K_SERVICE") == "" {
		return nil
	}

	// See https://cloud.google.com/run/docs/container-contract.
	project, err := gcemetadata.Get("project/project-id")
	if err != nil {
		return errors.Annotate(err, "failed to get the project ID").Err()
	}
	region, err := gcemetadata.Get("instance/region")
	if err != nil {
		return errors.Annotate(err, "failed to get the cloud region").Err()
	}
	// Region format returned by Cloud Run is `projects/PROJECT-NUMBER/regions/REGION`.
	parts := strings.Split(region, "/")
	region = parts[len(parts)-1]
	instance, err := gcemetadata.Get("instance/id")
	if err != nil {
		return errors.Annotate(err, "failed to get the instance ID").Err()
	}

	o.Serverless = module.CloudRun
	o.Prod = true
	o.Hostname = uniqueServerlessHostname(os.Getenv("K_REVISION"), instance)
	o.HTTPAddr = "-"
	o.GRPCAddr = "-"
	o.AdminAddr = "-"
	o.AllowH2C = true // to allow using HTTP2 end-to-end with `--use-http2` deployment flag
	o.ShutdownDelay = time.Second
	o.CloudProject = project
	o.CloudRegion = region
	o.ClientAuth.ServiceAccountJSONPath = clientauth.GCEServiceAccount
	o.OpenIDRPCAuthEnable = true
	o.TsMonServiceName = project
	o.TsMonJobName = os.Getenv("K_SERVICE")

	return nil
}

// uniqueServerlessHostname generates a hostname to use when running in a GCP
// serverless environment.
//
// Unlike GKE or GCE environments, serverless containers do not have a proper
// unique hostname set, but we still need to identify them uniquely in logs
// and monitoring metrics. They do have a giant hex instance ID string, but it
// is not informative on its own and cumbersome to use.
//
// This function produces a reasonably readable and unique string that looks
// like `parts[0]-parts[1]-...-hash(parts[last])`. It assumes the last string
// in `parts` is the giant instance ID.
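//
// For example (hypothetical inputs), uniqueServerlessHostname("default",
// "20240101t000000", "<giant hex instance id>") would produce something like
// "default-20240101t000000-<first 16 hex chars of SHA-256 of the instance id>".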
func uniqueServerlessHostname(parts ...string) string {
	id := sha256.Sum256([]byte(parts[len(parts)-1]))
	parts[len(parts)-1] = hex.EncodeToString(id[:])[:16]
	return strings.Join(parts, "-")
}

// ImageVersion extracts the image tag or digest from ContainerImageID.
//
// This is eventually reported as a value of 'server/version' metric.
//
// On GAE it would return the service version name based on GAE_VERSION env var,
// since ContainerImageID is artificially constructed to look like
// "appengine/${CLOUD_PROJECT}/${GAE_SERVICE}:${GAE_VERSION}".
//
// On Cloud Run it is the responsibility of the deployment layer to correctly
// populate the -container-image-id command line flag.
//
// Returns "unknown" if ContainerImageID is empty or malformed.
func (o *Options) ImageVersion() string {
	// Recognize "<path>@sha256:<digest>" and "<path>:<tag>".
	idx := strings.LastIndex(o.ContainerImageID, "@")
	if idx == -1 {
		idx = strings.LastIndex(o.ContainerImageID, ":")
	}
	if idx == -1 {
		return "unknown"
	}
	return o.ContainerImageID[idx+1:]
}

// ImageName extracts the image name from ContainerImageID.
//
// This is the part of ContainerImageID before ':' or '@'.
func (o *Options) ImageName() string {
	// Recognize "<path>@sha256:<digest>" and "<path>:<tag>".
	idx := strings.LastIndex(o.ContainerImageID, "@")
	if idx == -1 {
		idx = strings.LastIndex(o.ContainerImageID, ":")
	}
	if idx == -1 {
		return "unknown"
	}
	return o.ContainerImageID[:idx]
}

// userAgent derives a user-agent like string identifying the server.
func (o *Options) userAgent() string {
	return fmt.Sprintf("LUCI-Server (service: %s; job: %s; ver: %s);", o.TsMonServiceName, o.TsMonJobName, o.ImageVersion())
}

// shouldEnableTracing is true if options indicate we should enable tracing.
func (o *Options) shouldEnableTracing() bool {
	switch {
	case o.CloudProject == "":
		return false // nowhere to upload traces to
	case !o.Prod && o.TraceSampling == "":
		return false // in dev mode don't upload samples by default
	default:
		return !o.testDisableTracing
	}
}

// hostOptions constructs HostOptions for module.Initialize(...).
func (o *Options) hostOptions() module.HostOptions {
	return module.HostOptions{
		Prod:         o.Prod,
		Serverless:   o.Serverless,
		CloudProject: o.CloudProject,
		CloudRegion:  o.CloudRegion,
	}
}

// Server is responsible for initializing and launching the serving environment.
//
// Generally assumed to be a singleton: do not launch multiple Server instances
// within the same process, use AddPort instead if you want to expose multiple
// HTTP ports with different routers.
//
// Server can serve plain HTTP endpoints, routing them through a router.Router,
// and gRPC APIs (exposing them over gRPC and pRPC protocols). Use an instance
// of Server as a grpc.ServiceRegistrar when registering gRPC services. Services
// registered that way will be available via the gRPC protocol over the gRPC
// port and via the pRPC protocol over the main HTTP port. Interceptors can be
// added via RegisterUnaryServerInterceptors. RPC authentication can be
// configured via SetRPCAuthMethods.
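//
// For example (a sketch; servicepb and myInterceptor are placeholders):
//
//	servicepb.RegisterSomeServer(srv, &someImpl{})
//	srv.RegisterUnaryServerInterceptors(myInterceptor)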
//
// The pRPC protocol is served on the same port as the main HTTP router, making
// it possible to expose just a single HTTP port for everything (which is a
// requirement on Appengine).
//
// The native gRPC protocol is always served through a dedicated gRPC h2c port,
// since the gRPC library has its own HTTP/2 server implementation that is not
// compatible with the net/http package used everywhere else. There's an
// assortment of hacks to work around this, but many ultimately depend on the
// experimental and slow grpc.Server.ServeHTTP method. See
// https://github.com/grpc/grpc-go/issues/586 and
// https://github.com/grpc/grpc-go/issues/4620. Another often recommended
// workaround is https://github.com/soheilhy/cmux, which decides if a new
// connection is a gRPC one or a regular HTTP/2 one. It doesn't work when the
// server is running behind a load balancer that understands HTTP/2, since it
// just opens a **single** backend connection and sends both gRPC and regular
// HTTP/2 requests over it. This happens on Cloud Run, for example. See e.g.
// https://ahmet.im/blog/grpc-http-mux-go/.
//
// If you want to serve HTTP and gRPC over the same public port, configure your
// HTTP load balancer (e.g. https://cloud.google.com/load-balancing/docs/https)
// to route requests into appropriate containers and ports. Another alternative
// is to put an HTTP/2 proxy (e.g. Envoy) right into the pod with the server
// process and route traffic "locally" there. This option would also allow
// adding a local grpc-web proxy into the mix if necessary.
//
// The server doesn't do TLS termination (even for gRPC traffic). It must be
// sitting behind a load balancer or a proxy that terminates TLS and sends clear
// text (HTTP/1 or HTTP/2 for gRPC) requests to corresponding ports, injecting
// `X-Forwarded-*` headers. See the "Security considerations" section above for
// more details.
type Server struct {
	// Context is the root context used by all requests and background activities.
	//
	// Can be replaced (by a derived context) before the Serve call, for example
	// to inject values accessible to all request handlers.
	Context context.Context

	// Routes is a router for requests hitting the HTTPAddr port.
	//
	// This router is used for all requests whose Host header does not match any
	// specially registered per-host routers (see VirtualHost). Normally, there
	// are no such per-host routers, so usually Routes is used for all requests.
	//
	// This router is also accessible to the server modules and they can install
	// routes into it.
	//
	// Should be populated before the Serve call.
	Routes *router.Router

	// CookieAuth is an authentication method implemented via cookies.
	//
	// It is initialized only if the server has a module implementing such a
	// scheme (e.g. "go.chromium.org/luci/server/encryptedcookies").
	CookieAuth auth.Method

	// Options is a copy of options passed to New.
879 Options Options 880 881 startTime time.Time // for calculating uptime for /healthz 882 lastReqTime atomic.Value // time.Time when the last request started 883 884 stdout sdlogger.LogEntryWriter // for logging to stdout, nil in dev mode 885 stderr sdlogger.LogEntryWriter // for logging to stderr, nil in dev mode 886 errRptClient *errorreporting.Client // for reporting to the cloud Error Reporting 887 logRequestCB func(context.Context, *sdlogger.LogEntry) // if non-nil, need to emit request log entries via it 888 889 mainPort *Port // pre-registered main HTTP port, see initMainPort 890 grpcPort *grpcPort // non-nil when exposing a gRPC port 891 prpc *prpc.Server // pRPC server implementation exposed on the main port 892 893 mu sync.Mutex // protects fields below 894 ports []servingPort // all non-dummy ports (each one bound to a TCP socket) 895 started bool // true inside and after Serve 896 stopped bool // true inside and after Shutdown 897 ready chan struct{} // closed right before starting the serving loop 898 done chan struct{} // closed after Shutdown returns 899 900 // gRPC/pRPC configuration. 901 unaryInterceptors []grpc.UnaryServerInterceptor 902 streamInterceptors []grpc.StreamServerInterceptor 903 rpcAuthMethods []auth.Method 904 905 rndM sync.Mutex // protects rnd 906 rnd *rand.Rand // used to generate trace and operation IDs 907 908 bgrDone chan struct{} // closed to stop background activities 909 bgrWg sync.WaitGroup // waits for RunInBackground goroutines to stop 910 911 warmupM sync.Mutex // protects 'warmup' and the actual warmup critical section 912 warmup []func(context.Context) 913 914 cleanupM sync.Mutex // protects 'cleanup' and the actual cleanup critical section 915 cleanup []func(context.Context) 916 917 tsmon *tsmon.State // manages flushing of tsmon metrics 918 propagator propagation.TextMapPropagator // knows how to propagate trace headers 919 920 cloudTS oauth2.TokenSource // source of cloud-scoped tokens for Cloud APIs 921 signer *signerImpl // the signer used by the auth system 922 actorTokens *actorTokensImpl // for impersonating service accounts 923 authDB atomic.Value // if not using AuthDBProvider, the last known good authdb.DB instance 924 925 runningAs string // email of an account the server runs as 926 } 927 928 // servingPort represents either an HTTP or gRPC serving port. 929 type servingPort interface { 930 nameForLog() string 931 serve(baseCtx func() context.Context) error 932 shutdown(ctx context.Context) 933 } 934 935 // moduleHostImpl implements module.Host via server.Server. 936 // 937 // Just a tiny wrapper to make sure modules consume only curated limited set of 938 // the server API and do not retain the pointer to the server. 
939 type moduleHostImpl struct { 940 srv *Server 941 mod module.Module 942 invalid bool 943 cookieAuth auth.Method 944 } 945 946 func (h *moduleHostImpl) panicIfInvalid() { 947 if h.invalid { 948 panic("module.Host must not be used outside of Initialize") 949 } 950 } 951 952 func (h *moduleHostImpl) HTTPAddr() net.Addr { 953 h.panicIfInvalid() 954 if h.srv.mainPort.listener != nil { 955 return h.srv.mainPort.listener.Addr() 956 } 957 return nil 958 } 959 960 func (h *moduleHostImpl) GRPCAddr() net.Addr { 961 h.panicIfInvalid() 962 if h.srv.grpcPort != nil { 963 return h.srv.grpcPort.listener.Addr() 964 } 965 return nil 966 } 967 968 func (h *moduleHostImpl) Routes() *router.Router { 969 h.panicIfInvalid() 970 return h.srv.Routes 971 } 972 973 func (h *moduleHostImpl) RunInBackground(activity string, f func(context.Context)) { 974 h.panicIfInvalid() 975 h.srv.RunInBackground(activity, f) 976 } 977 978 func (h *moduleHostImpl) RegisterWarmup(cb func(context.Context)) { 979 h.panicIfInvalid() 980 h.srv.RegisterWarmup(cb) 981 } 982 983 func (h *moduleHostImpl) RegisterCleanup(cb func(context.Context)) { 984 h.panicIfInvalid() 985 h.srv.RegisterCleanup(cb) 986 } 987 988 func (h *moduleHostImpl) RegisterService(desc *grpc.ServiceDesc, impl any) { 989 h.panicIfInvalid() 990 h.srv.RegisterService(desc, impl) 991 } 992 993 func (h *moduleHostImpl) RegisterUnaryServerInterceptors(intr ...grpc.UnaryServerInterceptor) { 994 h.panicIfInvalid() 995 h.srv.RegisterUnaryServerInterceptors(intr...) 996 } 997 998 func (h *moduleHostImpl) RegisterStreamServerInterceptors(intr ...grpc.StreamServerInterceptor) { 999 h.panicIfInvalid() 1000 h.srv.RegisterStreamServerInterceptors(intr...) 1001 } 1002 1003 func (h *moduleHostImpl) RegisterCookieAuth(method auth.Method) { 1004 h.panicIfInvalid() 1005 h.cookieAuth = method 1006 } 1007 1008 // New constructs a new server instance. 1009 // 1010 // It hosts one or more HTTP servers and starts and stops them in unison. It is 1011 // also responsible for preparing contexts for incoming requests. 1012 // 1013 // The given context will become the root context of the server and will be 1014 // inherited by all handlers. 1015 // 1016 // On errors returns partially initialized server (always non-nil). At least 1017 // its logging will be configured and can be used to report the error. Trying 1018 // to use such partially initialized server for anything else is undefined 1019 // behavior. 1020 func New(ctx context.Context, opts Options, mods []module.Module) (srv *Server, err error) { 1021 seed := opts.testSeed 1022 if seed == 0 { 1023 if err := binary.Read(cryptorand.Reader, binary.BigEndian, &seed); err != nil { 1024 panic(err) 1025 } 1026 } 1027 1028 srv = &Server{ 1029 Context: ctx, 1030 Options: opts, 1031 startTime: clock.Now(ctx).UTC(), 1032 ready: make(chan struct{}), 1033 done: make(chan struct{}), 1034 rnd: rand.New(rand.NewSource(seed)), 1035 bgrDone: make(chan struct{}), 1036 } 1037 1038 // Cleanup what we can on failures. 1039 defer func() { 1040 if err != nil { 1041 srv.runCleanup() 1042 } 1043 }() 1044 1045 // Logging is needed to report any errors during the early initialization. 1046 srv.initLogging() 1047 1048 logging.Infof(srv.Context, "Server starting...") 1049 if srv.Options.ContainerImageID != "" { 1050 logging.Infof(srv.Context, "Container image is %s", srv.Options.ContainerImageID) 1051 } 1052 1053 // Need the hostname (e.g. pod name on k8s) for logs and metrics. 
1054 if srv.Options.Hostname == "" { 1055 srv.Options.Hostname, err = os.Hostname() 1056 if err != nil { 1057 return srv, errors.Annotate(err, "failed to get own hostname").Err() 1058 } 1059 } 1060 1061 switch srv.Options.Serverless { 1062 case module.GAE: 1063 logging.Infof(srv.Context, "Running on %s", srv.Options.Hostname) 1064 logging.Infof(srv.Context, "Instance is %q", os.Getenv("GAE_INSTANCE")) 1065 if srv.Options.CloudRegion == "" { 1066 if appID := os.Getenv("GAE_APPLICATION"); appID != "" { 1067 logging.Warningf(srv.Context, "Could not figure out the primary Cloud region based "+ 1068 "on the region code in GAE_APPLICATION %q, consider passing the region name "+ 1069 "via -cloud-region flag explicitly", appID) 1070 } 1071 } else { 1072 logging.Infof(srv.Context, "Cloud region is %s", srv.Options.CloudRegion) 1073 } 1074 // Initialize default tickets for background activities. These tickets are 1075 // overridden in per-request contexts with request-specific tickets. 1076 srv.Context = gae.WithTickets(srv.Context, gae.DefaultTickets()) 1077 case module.CloudRun: 1078 logging.Infof(srv.Context, "Running on %s", srv.Options.Hostname) 1079 logging.Infof(srv.Context, "Revision is %q", os.Getenv("K_REVISION")) 1080 default: 1081 // On k8s log pod IPs too, this is useful when debugging k8s routing. 1082 logging.Infof(srv.Context, "Running on %s (%s)", srv.Options.Hostname, networkAddrsForLog()) 1083 } 1084 1085 // Log enabled experiments, warn if some of them are unknown now. 1086 var exps []experiments.ID 1087 for _, name := range opts.EnableExperiments { 1088 if exp, ok := experiments.GetByName(name); ok { 1089 logging.Infof(ctx, "Enabling experiment %q", name) 1090 exps = append(exps, exp) 1091 } else { 1092 logging.Warningf(ctx, "Skipping unknown experiment %q", name) 1093 } 1094 } 1095 srv.Context = experiments.Enable(srv.Context, exps...) 1096 1097 // Configure base server subsystems by injecting them into the root context 1098 // inherited later by all requests. 
	srv.Context = caching.WithProcessCacheData(srv.Context, caching.NewProcessCacheData())
	if err := srv.initAuthStart(); err != nil {
		return srv, errors.Annotate(err, "failed to initialize auth").Err()
	}
	if err := srv.initTSMon(); err != nil {
		return srv, errors.Annotate(err, "failed to initialize tsmon").Err()
	}
	if err := srv.initAuthFinish(); err != nil {
		return srv, errors.Annotate(err, "failed to finish auth initialization").Err()
	}
	if err := srv.initTracing(); err != nil {
		return srv, errors.Annotate(err, "failed to initialize tracing").Err()
	}
	if err := srv.initErrorReporting(); err != nil {
		return srv, errors.Annotate(err, "failed to initialize error reporting").Err()
	}
	if err := srv.initProfiling(); err != nil {
		return srv, errors.Annotate(err, "failed to initialize profiling").Err()
	}
	if err := srv.initMainPort(); err != nil {
		return srv, errors.Annotate(err, "failed to initialize the main port").Err()
	}
	if err := srv.initGrpcPort(); err != nil {
		return srv, errors.Annotate(err, "failed to initialize the gRPC port").Err()
	}
	if err := srv.initAdminPort(); err != nil {
		return srv, errors.Annotate(err, "failed to initialize the admin port").Err()
	}
	if err := srv.initWarmup(); err != nil {
		return srv, errors.Annotate(err, "failed to initialize warmup callbacks").Err()
	}

	// Sort modules by their initialization order based on declared dependencies,
	// discover unfulfilled required dependencies.
	sorted, err := resolveDependencies(mods)
	if err != nil {
		return srv, err
	}

	// Initialize all modules in their topological order.
	impls := make([]*moduleHostImpl, len(sorted))
	for i, mod := range sorted {
		impls[i] = &moduleHostImpl{srv: srv, mod: mod}
		switch ctx, err := mod.Initialize(srv.Context, impls[i], srv.Options.hostOptions()); {
		case err != nil:
			return srv, errors.Annotate(err, "failed to initialize module %q", mod.Name()).Err()
		case ctx != nil:
			srv.Context = ctx
		}
		impls[i].invalid = true // make sure the module does not retain it
	}

	// Ensure there's only one CookieAuth method registered.
	var cookieAuthMod module.Module
	for _, impl := range impls {
		if impl.cookieAuth != nil {
			if cookieAuthMod != nil {
				return srv, errors.Reason(
					"conflict between %q and %q: both register a cookie auth scheme - pick one",
					cookieAuthMod.Name(), impl.mod.Name(),
				).Err()
			}
			cookieAuthMod = impl.mod
			srv.CookieAuth = impl.cookieAuth
		}
	}

	// Install the RPC Explorer, using the registered auth method if it is
	// compatible.
	rpcExpAuth, _ := srv.CookieAuth.(rpcexplorer.AuthMethod)
	rpcexplorer.Install(srv.Routes, rpcExpAuth)

	return srv, nil
}

// AddPort prepares and binds an additional serving HTTP port.
//
// Can be used to open more listening HTTP ports (in addition to opts.HTTPAddr
// and opts.AdminAddr). The returned Port object can be used to populate the
// router that serves requests hitting the added port.
//
// If opts.ListenAddr is '-', a dummy port will be added: it is a valid *Port
// object, but it is not actually exposed as a listening TCP socket. This is
// useful to disable listening ports without changing any code.
//
// Must be called before Serve (panics otherwise).
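//
// A sketch of usage (the port name, address, and handler are placeholders):
//
//	port, err := srv.AddPort(server.PortOptions{
//		Name:       "extra",
//		ListenAddr: "localhost:8801",
//	})
//	if err != nil {
//		return err
//	}
//	port.Routes.GET("/extra", nil, handler)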
func (s *Server) AddPort(opts PortOptions) (*Port, error) {
	port := &Port{
		Routes:   s.newRouter(opts),
		parent:   s,
		opts:     opts,
		allowH2C: s.Options.AllowH2C,
	}

	s.mu.Lock()
	defer s.mu.Unlock()
	if s.started {
		s.Fatal(errors.Reason("the server has already been started").Err())
	}

	if opts.ListenAddr != "-" {
		var err error
		if port.listener, err = s.createListener(opts.ListenAddr); err != nil {
			return nil, errors.Annotate(err, "failed to bind the listening port for %q at %q", opts.Name, opts.ListenAddr).Err()
		}
		// Add to the list of ports that actually have sockets listening.
		s.ports = append(s.ports, port)
	}

	return port, nil
}

// VirtualHost returns a router (registering it if necessary) used for requests
// that hit the main port (opts.HTTPAddr) and have the given Host header.
//
// Should be used in rare cases when the server is exposed through multiple
// domain names and requests should be routed differently based on what domain
// was used. If your server is serving only one domain name, or you don't care
// what domain name is used to access it, do not use VirtualHost.
//
// Note that requests that match some registered virtual host router won't
// reach the default router (server.Routes), even if the virtual host router
// doesn't have a route for them. Such requests finish with HTTP 404.
//
// Also the router created by VirtualHost is initially completely empty: the
// server and its modules don't install anything into it (there's intentionally
// no mechanism to do this). For that reason VirtualHost should never be used to
// register a router for the "main" domain name: it will make the default
// server.Routes (and all handlers installed there by server modules) useless,
// probably breaking the server. Put routes for the main server functionality
// directly into server.Routes instead, using VirtualHost only for routes that
// critically depend on the Host header.
//
// Must be called before Serve (panics otherwise).
func (s *Server) VirtualHost(host string) *router.Router {
	return s.mainPort.VirtualHost(host)
}

// createListener creates a TCP listener on the given address.
func (s *Server) createListener(addr string) (net.Listener, error) {
	// If not running tests, bind the socket as usual.
	if s.Options.testListeners == nil {
		return net.Listen("tcp", addr)
	}
	// In test mode the listener MUST be prepared already.
	l := s.Options.testListeners[addr]
	if l == nil {
		return nil, errors.Reason("test listener is not set").Err()
	}
	return l, nil
}

// newRouter creates a Router with the default middleware chain and routes.
func (s *Server) newRouter(opts PortOptions) *router.Router {
	s.mu.Lock()
	defer s.mu.Unlock()
	if s.started {
		s.Fatal(errors.Reason("the server has already been started").Err())
	}

	// This is a chain of router.Middleware. It is preceded by a chain of raw
	// net/http middlewares (see wrapHTTPHandler):
	//   * s.httpRoot: initializes *incomingRequest in the context.
	//   * otelhttp.NewHandler: opens a tracing span.
	//   * s.httpDispatch: finishes the context initialization.
1264 mw := router.NewMiddlewareChain( 1265 middleware.WithPanicCatcher, // transforms panics into HTTP 500 1266 ) 1267 if s.tsmon != nil && !opts.DisableMetrics { 1268 mw = mw.Extend(s.tsmon.Middleware) // collect HTTP requests metrics 1269 } 1270 1271 // Setup middleware chain used by ALL requests. 1272 r := router.New() 1273 r.Use(mw) 1274 1275 // Mandatory health check/readiness probe endpoint. 1276 r.GET(healthEndpoint, nil, func(c *router.Context) { 1277 c.Writer.Write([]byte(s.healthResponse(c.Request.Context()))) 1278 }) 1279 1280 // Add NotFound handler wrapped in our middlewares so that unrecognized 1281 // requests are at least logged. If we don't do that they'll be handled 1282 // completely silently and this is very confusing when debugging 404s. 1283 r.NotFound(nil, func(c *router.Context) { 1284 http.NotFound(c.Writer, c.Request) 1285 }) 1286 1287 return r 1288 } 1289 1290 // RunInBackground launches the given callback in a separate goroutine right 1291 // before starting the serving loop. 1292 // 1293 // If the server is already running, launches it right away. If the server 1294 // fails to start, the goroutines will never be launched. 1295 // 1296 // Should be used for background asynchronous activities like reloading configs. 1297 // 1298 // All logs lines emitted by the callback are annotated with "activity" field 1299 // which can be arbitrary, but by convention has format "<namespace>.<name>", 1300 // where "luci" namespace is reserved for internal activities. 1301 // 1302 // The context passed to the callback is canceled when the server is shutting 1303 // down. It is expected the goroutine will exit soon after the context is 1304 // canceled. 1305 func (s *Server) RunInBackground(activity string, f func(context.Context)) { 1306 s.bgrWg.Add(1) 1307 go func() { 1308 defer s.bgrWg.Done() 1309 1310 select { 1311 case <-s.ready: 1312 // Construct the context after the server is fully initialized. Cancel it 1313 // as soon as bgrDone is signaled. 1314 ctx, cancel := context.WithCancel(s.Context) 1315 if activity != "" { 1316 ctx = logging.SetField(ctx, "activity", activity) 1317 } 1318 defer cancel() 1319 go func() { 1320 select { 1321 case <-s.bgrDone: 1322 cancel() 1323 case <-ctx.Done(): 1324 } 1325 }() 1326 f(ctx) 1327 1328 case <-s.bgrDone: 1329 // the server is closed, no need to run f() anymore 1330 } 1331 }() 1332 } 1333 1334 // RegisterService is part of grpc.ServiceRegistrar interface. 1335 // 1336 // The registered service will be exposed through both gRPC and pRPC protocols 1337 // on corresponding ports. See Server doc. 1338 // 1339 // Must be called before Serve (panics otherwise). 1340 func (s *Server) RegisterService(desc *grpc.ServiceDesc, impl any) { 1341 s.mu.Lock() 1342 defer s.mu.Unlock() 1343 if s.started { 1344 s.Fatal(errors.Reason("the server has already been started").Err()) 1345 } 1346 s.prpc.RegisterService(desc, impl) 1347 if s.grpcPort != nil { 1348 s.grpcPort.registerService(desc, impl) 1349 } 1350 } 1351 1352 // RegisterUnaryServerInterceptors registers grpc.UnaryServerInterceptor's 1353 // applied to all unary RPCs that hit the server. 1354 // 1355 // Interceptors are chained in order they are registered, i.e. the first 1356 // registered interceptor becomes the outermost. The initial chain already 1357 // contains some base interceptors (e.g. for monitoring) and all interceptors 1358 // registered by server modules. RegisterUnaryServerInterceptors extends this 1359 // chain. 
Subsequent calls to RegisterUnaryServerInterceptors add more
// interceptors into the chain.
//
// Must be called before Serve (panics otherwise).
func (s *Server) RegisterUnaryServerInterceptors(intr ...grpc.UnaryServerInterceptor) {
	s.mu.Lock()
	defer s.mu.Unlock()
	if s.started {
		s.Fatal(errors.Reason("the server has already been started").Err())
	}
	s.unaryInterceptors = append(s.unaryInterceptors, intr...)
}

// RegisterStreamServerInterceptors registers grpc.StreamServerInterceptor's
// applied to all streaming RPCs that hit the server.
//
// Interceptors are chained in order they are registered, i.e. the first
// registered interceptor becomes the outermost. The initial chain already
// contains some base interceptors (e.g. for monitoring) and all interceptors
// registered by server modules. RegisterStreamServerInterceptors extends this
// chain. Subsequent calls to RegisterStreamServerInterceptors add more
// interceptors into the chain.
//
// Must be called before Serve (panics otherwise).
func (s *Server) RegisterStreamServerInterceptors(intr ...grpc.StreamServerInterceptor) {
	s.mu.Lock()
	defer s.mu.Unlock()
	if s.started {
		s.Fatal(errors.Reason("the server has already been started").Err())
	}
	s.streamInterceptors = append(s.streamInterceptors, intr...)
}

// RegisterUnifiedServerInterceptors registers the given interceptors into both
// unary and stream interceptor chains.
//
// It is just a convenience helper for UnifiedServerInterceptor's that usually
// need to be registered in both unary and stream interceptor chains. This
// method is equivalent to calling RegisterUnaryServerInterceptors and
// RegisterStreamServerInterceptors, passing corresponding flavors of
// interceptors to them.
//
// Must be called before Serve (panics otherwise).
func (s *Server) RegisterUnifiedServerInterceptors(intr ...grpcutil.UnifiedServerInterceptor) {
	s.mu.Lock()
	defer s.mu.Unlock()
	if s.started {
		s.Fatal(errors.Reason("the server has already been started").Err())
	}
	for _, cb := range intr {
		s.unaryInterceptors = append(s.unaryInterceptors, cb.Unary())
		s.streamInterceptors = append(s.streamInterceptors, cb.Stream())
	}
}

// ConfigurePRPC allows tweaking pRPC-specific server configuration.
//
// Use it only for changing pRPC-specific options (usually ones that are related
// to the HTTP protocol in some way). This method **must not be used** for
// registering interceptors or setting authentication options (changes to them
// done here will cause a panic). Instead use RegisterUnaryServerInterceptors to
// register interceptors or SetRPCAuthMethods to change how the server
// authenticates RPC requests. Changes done through these methods will apply
// to both gRPC and pRPC servers.
//
// Must be called before Serve (panics otherwise).
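//
// A sketch of the call shape (what to tweak inside the callback is up to the
// caller):
//
//	srv.ConfigurePRPC(func(p *prpc.Server) {
//		// Adjust pRPC-specific options of p here.
//	})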
1425 func (s *Server) ConfigurePRPC(cb func(srv *prpc.Server)) { 1426 s.mu.Lock() 1427 defer s.mu.Unlock() 1428 if s.started { 1429 s.Fatal(errors.Reason("the server has already been started").Err()) 1430 } 1431 cb(s.prpc) 1432 if s.prpc.UnaryServerInterceptor != nil { 1433 panic("use Server.RegisterUnaryServerInterceptors to register interceptors") 1434 } 1435 } 1436 1437 // SetRPCAuthMethods overrides how the server authenticates incoming gRPC and 1438 // pRPC requests. 1439 // 1440 // It receives a list of auth.Method implementations which will be applied 1441 // one after another to try to authenticate the request until the first 1442 // successful hit. If all methods end up to be non-applicable (i.e. none of the 1443 // methods notice any headers they recognize), the request will be passed 1444 // through to the handler as anonymous (coming from an "anonymous identity"). 1445 // Rejecting anonymous requests (if necessary) is the job of an authorization 1446 // layer, often implemented as a gRPC interceptor. For simple cases use 1447 // go.chromium.org/luci/server/auth/rpcacl interceptor. 1448 // 1449 // By default (if SetRPCAuthMethods is never called) the server will check 1450 // incoming requests have an `Authorization` header with a Google OAuth2 access 1451 // token that has `https://www.googleapis.com/auth/userinfo.email` scope (see 1452 // auth.GoogleOAuth2Method). Requests without `Authorization` header will be 1453 // considered anonymous. 1454 // 1455 // If OpenIDRPCAuthEnable option is set (matching `-open-id-rpc-auth-enable` 1456 // flag), the service will recognize ID tokens as well. This is important for 1457 // e.g. Cloud Run where this is the only authentication method supported 1458 // natively by the platform. ID tokens are also generally faster to check than 1459 // access tokens. 1460 // 1461 // Note that this call completely overrides the previously configured list of 1462 // methods instead of appending to it, since chaining auth methods is often 1463 // tricky and it is safer to just always provide the whole list at once. 1464 // 1465 // Passing an empty list of methods is allowed. All requests will be considered 1466 // anonymous in that case. 1467 // 1468 // Note that this call **doesn't affect** how plain HTTP requests (hitting the 1469 // main HTTP port and routed through s.Router) are authenticated. Very often 1470 // RPC requests and plain HTTP requests need different authentication methods 1471 // and using an RPC authentication for everything is incorrect. To authenticate 1472 // plain HTTP requests use auth.Authenticate(...) HTTP router middleware, 1473 // perhaps in combination with s.CookieAuth (which is non-nil if there is a 1474 // server module installed that provides a cookie-based authentication scheme). 1475 // 1476 // Must be called before Serve (panics otherwise). 1477 func (s *Server) SetRPCAuthMethods(methods []auth.Method) { 1478 s.mu.Lock() 1479 defer s.mu.Unlock() 1480 if s.started { 1481 s.Fatal(errors.Reason("the server has already been started").Err()) 1482 } 1483 s.rpcAuthMethods = methods 1484 } 1485 1486 // Serve launches the serving loop. 1487 // 1488 // Blocks forever or until the server is stopped via Shutdown (from another 1489 // goroutine or from a SIGTERM handler). Returns nil if the server was shutdown 1490 // correctly or an error if it failed to start or unexpectedly died. The error 1491 // is logged inside. 1492 // 1493 // Should be called only once. Panics otherwise. 
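//
// Minimal sketch (not from the original source), assuming srv is a fully
// configured *Server (e.g. when not going through server.Main) and
// somethingHappened is a hypothetical application-defined channel; in typical
// deployments the built-in SIGTERM handling makes an explicit Shutdown call
// unnecessary:
//
//	go func() {
//		<-somethingHappened
//		srv.Shutdown()
//	}()
//	if err := srv.Serve(); err != nil {
//		// The error has already been logged inside Serve.
//	}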
func (s *Server) Serve() error {
	// Set s.started flag to "lock" the configuration. This allows reading
	// fields like `s.ports` without fear of race conditions.
	s.mu.Lock()
	if s.started {
		s.mu.Unlock()
		s.Fatal(errors.Reason("the server has already been started").Err())
	}
	s.started = true
	s.mu.Unlock()

	// The configuration is "locked" now and we can finish the setup.
	authInterceptor := auth.AuthenticatingInterceptor(s.rpcAuthMethods)

	// Assemble the final interceptor chains: base interceptors => auth =>
	// whatever was installed by users of server.Server. Note we put grpcmon
	// before the panic catcher to make sure panics are actually reported to
	// the monitoring. grpcmon is also before the authentication to make sure
	// auth errors are reported as well.
	unaryInterceptors := append([]grpc.UnaryServerInterceptor{
		grpcmon.UnaryServerInterceptor,
		grpcutil.UnaryServerPanicCatcherInterceptor,
		authInterceptor.Unary(),
	}, s.unaryInterceptors...)
	streamInterceptors := append([]grpc.StreamServerInterceptor{
		grpcmon.StreamServerInterceptor,
		grpcutil.StreamServerPanicCatcherInterceptor,
		authInterceptor.Stream(),
	}, s.streamInterceptors...)

	// Finish setting up the pRPC server. It supports only unary RPCs. The root
	// request context is created in the HTTP land using base HTTP middlewares.
	s.prpc.UnaryServerInterceptor = grpcutil.ChainUnaryServerInterceptors(unaryInterceptors...)

	// Finish setting up the gRPC server, if enabled.
	if s.grpcPort != nil {
		grpcRoot := s.grpcRoot()
		grpcDispatch := s.grpcDispatch()
		s.grpcPort.addServerOptions(
			grpc.ChainUnaryInterceptor(
				grpcRoot.Unary(),
				otelgrpc.UnaryServerInterceptor(),
				grpcDispatch.Unary(),
			),
			grpc.ChainUnaryInterceptor(unaryInterceptors...),
			grpc.ChainStreamInterceptor(
				grpcRoot.Stream(),
				otelgrpc.StreamServerInterceptor(),
				grpcDispatch.Stream(),
			),
			grpc.ChainStreamInterceptor(streamInterceptors...),
		)
	}

	// Run registered best-effort warmup callbacks right before serving.
	s.runWarmup()

	// Catch SIGTERM while inside the serving loop. Upon receiving SIGTERM, wait
	// until the pod is removed from the load balancer before actually shutting
	// down and refusing new connections. If we shut down immediately, some
	// clients may see connection errors, because they are not yet aware the
	// server is closing: the Pod shutdown sequence and Endpoints list updates
	// race with each other, and we want the Endpoints list updates to win, i.e.
	// we want the pod to actually be fully alive as long as it is still
	// referenced in the Endpoints list. We can't guarantee this, but we can
	// improve the chances.
	stop := signals.HandleInterrupt(func() {
		if s.Options.Prod {
			s.waitUntilNotServing()
		}
		s.Shutdown()
	})
	defer stop()

	// Log how long it took from 'New' to the serving loop.
	logging.Infof(s.Context, "Startup done in %s", clock.Now(s.Context).Sub(s.startTime))

	// Unblock all pending RunInBackground goroutines, so they can start.
	close(s.ready)

	// Run serving loops in parallel.
1574 errs := make(errors.MultiError, len(s.ports)) 1575 wg := sync.WaitGroup{} 1576 wg.Add(len(s.ports)) 1577 for i, port := range s.ports { 1578 logging.Infof(s.Context, "Serving %s", port.nameForLog()) 1579 i := i 1580 port := port 1581 go func() { 1582 defer wg.Done() 1583 if err := port.serve(func() context.Context { return s.Context }); err != nil { 1584 logging.WithError(err).Errorf(s.Context, "Server %s failed", port.nameForLog()) 1585 errs[i] = err 1586 s.Shutdown() // close all other servers 1587 } 1588 }() 1589 } 1590 wg.Wait() 1591 1592 // Per http.Server docs, we end up here *immediately* after Shutdown call was 1593 // initiated. Some requests can still be in-flight. We block until they are 1594 // done (as indicated by Shutdown call itself exiting). 1595 logging.Infof(s.Context, "Waiting for the server to stop...") 1596 <-s.done 1597 logging.Infof(s.Context, "The serving loop stopped, running the final cleanup...") 1598 s.runCleanup() 1599 logging.Infof(s.Context, "The server has stopped") 1600 1601 if errs.First() != nil { 1602 return errs 1603 } 1604 return nil 1605 } 1606 1607 // Shutdown gracefully stops the server if it was running. 1608 // 1609 // Blocks until the server is stopped. Can be called multiple times. 1610 func (s *Server) Shutdown() { 1611 s.mu.Lock() 1612 defer s.mu.Unlock() 1613 if s.stopped { 1614 return 1615 } 1616 1617 logging.Infof(s.Context, "Shutting down the server...") 1618 1619 // Tell all RunInBackground goroutines to stop. 1620 close(s.bgrDone) 1621 1622 // Stop all http.Servers in parallel. Each Shutdown call blocks until the 1623 // corresponding server is stopped. 1624 wg := sync.WaitGroup{} 1625 wg.Add(len(s.ports)) 1626 for _, port := range s.ports { 1627 port := port 1628 go func() { 1629 defer wg.Done() 1630 port.shutdown(s.Context) 1631 }() 1632 } 1633 wg.Wait() 1634 1635 // Wait for all background goroutines to stop. 1636 s.bgrWg.Wait() 1637 1638 // Notify Serve that it can exit now. 1639 s.stopped = true 1640 close(s.done) 1641 } 1642 1643 // Fatal logs the error and immediately shuts down the process with exit code 3. 1644 // 1645 // No cleanup is performed. Deferred statements are not run. Not recoverable. 1646 func (s *Server) Fatal(err error) { 1647 errors.Log(s.Context, err) 1648 os.Exit(3) 1649 } 1650 1651 // healthResponse prepares text/plan response for the health check endpoints. 1652 // 1653 // It additionally contains some easy to obtain information that may help in 1654 // debugging deployments. 1655 func (s *Server) healthResponse(c context.Context) string { 1656 maybeEmpty := func(s string) string { 1657 if s == "" { 1658 return "<unknown>" 1659 } 1660 return s 1661 } 1662 return strings.Join([]string{ 1663 "OK", 1664 "", 1665 "uptime: " + clock.Now(c).Sub(s.startTime).String(), 1666 "image: " + maybeEmpty(s.Options.ContainerImageID), 1667 "", 1668 "service: " + maybeEmpty(s.Options.TsMonServiceName), 1669 "job: " + maybeEmpty(s.Options.TsMonJobName), 1670 "host: " + s.Options.Hostname, 1671 "", 1672 }, "\n") 1673 } 1674 1675 // waitUntilNotServing is called during the graceful shutdown and it tries to 1676 // figure out when the traffic stops flowing to the server (i.e. when it is 1677 // removed from the load balancer). 1678 // 1679 // It's a heuristic optimization for the case when the load balancer keeps 1680 // sending traffic to a terminating Pod for some time after the Pod entered 1681 // "Terminating" state. It can happen due to latencies in Endpoints list 1682 // updates. 
We want to keep the listening socket open as long as there are 1683 // incoming requests (but no longer than 1 min). 1684 func (s *Server) waitUntilNotServing() { 1685 logging.Infof(s.Context, "Received SIGTERM, waiting for the traffic to stop...") 1686 1687 // When the server is idle the loop below exits immediately and the server 1688 // enters the shutdown path, rejecting new connections. Since we gave 1689 // Kubernetes no time to update the Endpoints list, it is possible someone 1690 // still might send a request to the server (and it will be rejected). 1691 // To avoid that we always sleep a bit here to give Kubernetes a chance to 1692 // propagate the Endpoints list update everywhere. The loop below then 1693 // verifies clients got the update and stopped sending requests. 1694 time.Sleep(s.Options.ShutdownDelay) 1695 1696 deadline := clock.Now(s.Context).Add(time.Minute) 1697 for { 1698 now := clock.Now(s.Context) 1699 lastReq, ok := s.lastReqTime.Load().(time.Time) 1700 if !ok || now.Sub(lastReq) > 15*time.Second { 1701 logging.Infof(s.Context, "No requests received in the last 15 sec, proceeding with the shutdown...") 1702 break 1703 } 1704 if now.After(deadline) { 1705 logging.Warningf(s.Context, "Gave up waiting for the traffic to stop, proceeding with the shutdown...") 1706 break 1707 } 1708 time.Sleep(100 * time.Millisecond) 1709 } 1710 } 1711 1712 // RegisterWarmup registers a callback that is run in server's Serve right 1713 // before the serving loop. 1714 // 1715 // It receives the global server context (including all customizations made 1716 // by the user code in server.Main). Intended for best-effort warmups: there's 1717 // no way to gracefully abort the server startup from a warmup callback. 1718 // 1719 // Registering a new warmup callback from within a warmup causes a deadlock, 1720 // don't do that. 1721 func (s *Server) RegisterWarmup(cb func(context.Context)) { 1722 s.warmupM.Lock() 1723 defer s.warmupM.Unlock() 1724 s.warmup = append(s.warmup, cb) 1725 } 1726 1727 // runWarmup runs all registered warmup functions (sequentially in registration 1728 // order). 1729 func (s *Server) runWarmup() { 1730 s.warmupM.Lock() 1731 defer s.warmupM.Unlock() 1732 ctx := logging.SetField(s.Context, "activity", "luci.warmup") 1733 for _, cb := range s.warmup { 1734 cb(ctx) 1735 } 1736 } 1737 1738 // RegisterCleanup registers a callback that is run in Serve after the server 1739 // has exited the serving loop. 1740 // 1741 // Registering a new cleanup callback from within a cleanup causes a deadlock, 1742 // don't do that. 1743 func (s *Server) RegisterCleanup(cb func(context.Context)) { 1744 s.cleanupM.Lock() 1745 defer s.cleanupM.Unlock() 1746 s.cleanup = append(s.cleanup, cb) 1747 } 1748 1749 // runCleanup runs all registered cleanup functions (sequentially in reverse 1750 // order). 1751 func (s *Server) runCleanup() { 1752 s.cleanupM.Lock() 1753 defer s.cleanupM.Unlock() 1754 for i := len(s.cleanup) - 1; i >= 0; i-- { 1755 s.cleanup[i](s.Context) 1756 } 1757 } 1758 1759 // genUniqueBlob writes a pseudo-random byte blob into the given slice. 1760 func (s *Server) genUniqueBlob(b []byte) { 1761 s.rndM.Lock() 1762 s.rnd.Read(b) 1763 s.rndM.Unlock() 1764 } 1765 1766 // genUniqueID returns pseudo-random hex string of given even length. 1767 func (s *Server) genUniqueID(l int) string { 1768 b := make([]byte, l/2) 1769 s.genUniqueBlob(b) 1770 return hex.EncodeToString(b) 1771 } 1772 1773 // incomingRequest is a request received by the server. 
//
// It is either an HTTP or a gRPC request.
type incomingRequest struct {
	url         string               // the full URL for logs
	method      string               // HTTP method verb for logs, e.g. "POST"
	metadata    auth.RequestMetadata // headers etc.
	healthCheck bool                 // true if this is a health check request
}

// requestResult is logged after completion of a request.
type requestResult struct {
	statusCode   int            // the HTTP status code to log
	requestSize  int64          // the request size in bytes if known
	responseSize int64          // the response size in bytes if known
	extraFields  logging.Fields // extra fields to log (will be mutated!)
}

// wrapHTTPHandler wraps the port's router into net/http middlewares.
//
// TODO(vadimsh): Get rid of router.Middleware and move this to newRouter(...).
// Since the introduction of http.Request.Context() there's no reason for
// router.Middleware to exist anymore.
func (s *Server) wrapHTTPHandler(next http.Handler) http.Handler {
	return s.httpRoot(
		otelhttp.NewHandler(
			s.httpDispatch(next),
			"",
			otelhttp.WithMessageEvents(otelhttp.ReadEvents, otelhttp.WriteEvents),
			otelhttp.WithSpanNameFormatter(func(_ string, r *http.Request) string {
				return r.URL.Path
			}),
		),
	)
}

// httpRoot is the entry point for non-gRPC HTTP requests.
//
// It is a net/http middleware for interoperability with other existing
// net/http middlewares (currently only the OpenTelemetry otelhttp middleware).
//
// Its job is to initialize *incomingRequest in the context which is then
// examined by other middlewares (and the tracing sampler), in particular in
// httpDispatch.
//
// See grpcRoot(...) for a gRPC counterpart.
func (s *Server) httpRoot(next http.Handler) http.Handler {
	return http.HandlerFunc(func(rw http.ResponseWriter, r *http.Request) {
		// This context is derived from s.Context (see Serve) and has various server
		// systems injected into it already. Its only difference from s.Context is
		// that http.Server cancels it when the client disconnects, which we want.
		ctx := r.Context()

		// Apply the per-request HTTP timeout, if any.
		timeout := s.Options.DefaultRequestTimeout
		if strings.HasPrefix(r.URL.Path, "/internal/") {
			timeout = s.Options.InternalRequestTimeout
		}
		if timeout != 0 {
			var cancelCtx context.CancelFunc
			ctx, cancelCtx = context.WithTimeout(ctx, timeout)
			defer cancelCtx()
		}

		// Reconstruct the original URL for logging.
		protocol := r.Header.Get("X-Forwarded-Proto")
		if protocol != "https" {
			protocol = "http"
		}
		url := fmt.Sprintf("%s://%s%s", protocol, r.Host, r.RequestURI)

		// incomingRequest is used by middlewares that work with both HTTP and gRPC
		// requests, in particular it is used by startRequest(...).
		next.ServeHTTP(rw, r.WithContext(context.WithValue(ctx, &incomingRequestKey, &incomingRequest{
			url:         url,
			method:      r.Method,
			metadata:    auth.RequestMetadataForHTTP(r),
			healthCheck: r.RequestURI == healthEndpoint && isHealthCheckerUA(r.UserAgent()),
		})))
	})
}

// httpDispatch finishes HTTP request context initialization.
//
// Its primary purpose is to set up logging, but it also does some other context
// touches. See startRequest(...) where the bulk of the work happens.
//
// The next stop is the router.Middleware chain as registered in newRouter(...)
// and by the user code.
//
// See grpcDispatch(...) for a gRPC counterpart.
func (s *Server) httpDispatch(next http.Handler) http.Handler {
	return http.HandlerFunc(func(rw http.ResponseWriter, r *http.Request) {
		// Track how many response bytes are sent and what status is set, for logs.
		trackingRW := iotools.NewResponseWriter(rw)

		// Initialize the per-request context (logging, GAE tickets, etc).
		ctx, done := s.startRequest(r.Context())

		// Log the result when done.
		defer func() {
			done(&requestResult{
				statusCode:   trackingRW.Status(),
				requestSize:  r.ContentLength,
				responseSize: trackingRW.ResponseSize(),
			})
		}()

		next.ServeHTTP(trackingRW, r.WithContext(ctx))
	})
}

// grpcRoot is the entry point for gRPC requests.
//
// Its job is to initialize *incomingRequest in the context which is then
// examined by other middlewares (and the tracing sampler), in particular in
// grpcDispatch.
//
// See httpRoot(...) for an HTTP counterpart.
func (s *Server) grpcRoot() grpcutil.UnifiedServerInterceptor {
	return func(ctx context.Context, fullMethod string, handler func(ctx context.Context) error) (err error) {
		// incomingRequest is used by middlewares that work with both HTTP and gRPC
		// requests, in particular it is used by startRequest(...).
		//
		// Note that here `ctx` is already derived from s.Context (except it is
		// canceled if the client disconnects). See the grpcPort{} implementation.
		md := auth.RequestMetadataForGRPC(ctx)
		return handler(context.WithValue(ctx, &incomingRequestKey, &incomingRequest{
			url:         fmt.Sprintf("grpc://%s%s", md.Host(), fullMethod),
			method:      "POST",
			metadata:    md,
			healthCheck: strings.HasPrefix(fullMethod, "/grpc.health.") && isHealthCheckerUA(md.Header("User-Agent")),
		}))
	}
}

// grpcDispatch finishes gRPC request context initialization.
//
// Its primary purpose is to set up logging, but it also does some other context
// touches. See startRequest(...) where the bulk of the work happens.
//
// The next stop is the gRPC middleware chain as registered via the server's API.
//
// See httpDispatch(...) for an HTTP counterpart.
func (s *Server) grpcDispatch() grpcutil.UnifiedServerInterceptor {
	return func(ctx context.Context, fullMethod string, handler func(ctx context.Context) error) (err error) {
		// Initialize the per-request context (logging, GAE tickets, etc).
		ctx, done := s.startRequest(ctx)

		// Log the result when done.
		defer func() {
			code := status.Code(err)
			httpStatusCode := grpcutil.CodeStatus(code)

			// Log errors (for parity with the pRPC server behavior).
			switch {
			case httpStatusCode >= 400 && httpStatusCode < 500:
				logging.Warningf(ctx, "%s", err)
			case httpStatusCode >= 500:
				logging.Errorf(ctx, "%s", err)
			}

			// Report the canonical gRPC code as a log entry field for filtering by it.
1936 canonical, ok := codepb.Code_name[int32(code)] 1937 if !ok { 1938 canonical = fmt.Sprintf("%d", int64(code)) 1939 } 1940 1941 done(&requestResult{ 1942 statusCode: httpStatusCode, // this is an approximation 1943 extraFields: logging.Fields{"code": canonical}, 1944 }) 1945 }() 1946 1947 return handler(ctx) 1948 } 1949 } 1950 1951 // startRequest finishes preparing the per-request context. 1952 // 1953 // It returns a callback that must be called after finishing processing this 1954 // request. 1955 // 1956 // The incoming context is assumed to be derived by either httpRoot(...) or 1957 // grpcRoot(...) and have *incomingRequest inside. 1958 func (s *Server) startRequest(ctx context.Context) (context.Context, func(*requestResult)) { 1959 // The value *must* be there. Let it panic if it is not. 1960 req := ctx.Value(&incomingRequestKey).(*incomingRequest) 1961 1962 // If running on GAE, initialize the per-request API tickets needed to make 1963 // RPCs to the GAE service bridge. 1964 if s.Options.Serverless == module.GAE { 1965 ctx = gae.WithTickets(ctx, gae.RequestTickets(req.metadata)) 1966 } 1967 1968 // This is used in waitUntilNotServing. 1969 started := clock.Now(ctx) 1970 if !req.healthCheck { 1971 s.lastReqTime.Store(started) 1972 } 1973 1974 // If the tracing is completely disabled we'll have an empty span context. 1975 // But we need a trace ID in the context anyway for correlating logs (see 1976 // below). Open a noop non-recording span with random generated trace ID. 1977 span := oteltrace.SpanFromContext(ctx) 1978 spanCtx := span.SpanContext() 1979 if !spanCtx.HasTraceID() { 1980 var traceID oteltrace.TraceID 1981 s.genUniqueBlob(traceID[:]) 1982 spanCtx = oteltrace.NewSpanContext(oteltrace.SpanContextConfig{ 1983 TraceID: traceID, 1984 }) 1985 ctx = oteltrace.ContextWithSpanContext(ctx, spanCtx) 1986 } 1987 1988 // Associate all logs with one another by using the same trace ID, which also 1989 // matches the trace ID extracted by the propagator from incoming headers. 1990 // Make sure to use the full trace ID format that includes the project name. 1991 // This is important to group logs generated by us with logs generated by 1992 // the GCP (which uses the full trace ID) when running in Cloud. Outside of 1993 // Cloud it doesn't really matter what trace ID is used as long as all log 1994 // entries use the same one. 1995 traceID := spanCtx.TraceID().String() 1996 if s.Options.CloudProject != "" { 1997 traceID = fmt.Sprintf("projects/%s/traces/%s", s.Options.CloudProject, traceID) 1998 } 1999 2000 // SpanID can be missing if there's no actual tracing. This is fine. 2001 spanID := "" 2002 if spanCtx.HasSpanID() { 2003 spanID = spanCtx.SpanID().String() 2004 } 2005 2006 // When running in prod, make the logger emit log entries in JSON format that 2007 // Cloud Logger collectors understand natively. 2008 var severityTracker *sdlogger.SeverityTracker 2009 if s.Options.Prod { 2010 // Start assembling logging sink layers starting with the innermost one. 2011 logSink := s.stdout 2012 2013 // If we are going to log the overall request status, install the tracker 2014 // that observes the maximum emitted severity to use it as an overall 2015 // severity for the request log entry. 2016 if s.logRequestCB != nil { 2017 severityTracker = &sdlogger.SeverityTracker{Out: logSink} 2018 logSink = severityTracker 2019 } 2020 2021 // If have Cloud Error Reporting enabled, intercept errors to upload them. 2022 // TODO(vadimsh): Fill in `CloudErrorsSink.Request` with something. 
2023 if s.errRptClient != nil { 2024 logSink = &sdlogger.CloudErrorsSink{ 2025 Client: s.errRptClient, 2026 Out: logSink, 2027 } 2028 } 2029 2030 // Associate log entries with the tracing span where they were emitted. 2031 annotateWithSpan := func(ctx context.Context, e *sdlogger.LogEntry) { 2032 if spanID := oteltrace.SpanContextFromContext(ctx).SpanID(); spanID.IsValid() { 2033 e.SpanID = spanID.String() 2034 } 2035 } 2036 2037 // Finally install all this into the request context. 2038 ctx = logging.SetFactory(ctx, sdlogger.Factory(logSink, sdlogger.LogEntry{ 2039 TraceID: traceID, 2040 Operation: &sdlogger.Operation{ID: s.genUniqueID(32)}, 2041 }, annotateWithSpan)) 2042 } 2043 2044 // Do final context touches. 2045 ctx = caching.WithRequestCache(ctx) 2046 2047 // This will be called once the request is fully processed. 2048 return ctx, func(res *requestResult) { 2049 now := clock.Now(ctx) 2050 latency := now.Sub(started) 2051 2052 if req.healthCheck { 2053 // Do not log fast health check calls AT ALL, they just spam logs. 2054 if latency < healthTimeLogThreshold { 2055 return 2056 } 2057 // Emit a warning if the health check is slow, this likely indicates 2058 // high CPU load. 2059 logging.Warningf(ctx, "Health check is slow: %s > %s", latency, healthTimeLogThreshold) 2060 } 2061 2062 // If there's no need to emit the overall request log entry, we are done. 2063 // See initLogging(...) for where this is decided. 2064 if s.logRequestCB == nil { 2065 return 2066 } 2067 2068 // When running behind Envoy, log its request IDs to simplify debugging. 2069 extraFields := res.extraFields 2070 if xrid := req.metadata.Header("X-Request-Id"); xrid != "" { 2071 if extraFields == nil { 2072 extraFields = make(logging.Fields, 1) 2073 } 2074 extraFields["requestId"] = xrid 2075 } 2076 2077 // If we were tracking the overall severity, collect the outcome. 2078 severity := sdlogger.InfoSeverity 2079 if severityTracker != nil { 2080 severity = severityTracker.MaxSeverity() 2081 } 2082 2083 // Log the final outcome of the processed request. 2084 s.logRequestCB(ctx, &sdlogger.LogEntry{ 2085 Severity: severity, 2086 Timestamp: sdlogger.ToTimestamp(now), 2087 TraceID: traceID, 2088 TraceSampled: span.IsRecording(), 2089 SpanID: spanID, // the top-level span ID if present 2090 Fields: extraFields, 2091 RequestInfo: &sdlogger.RequestInfo{ 2092 Method: req.method, 2093 URL: req.url, 2094 Status: res.statusCode, 2095 RequestSize: fmt.Sprintf("%d", res.requestSize), 2096 ResponseSize: fmt.Sprintf("%d", res.responseSize), 2097 UserAgent: req.metadata.Header("User-Agent"), 2098 RemoteIP: endUserIP(req.metadata), 2099 Latency: fmt.Sprintf("%fs", latency.Seconds()), 2100 }, 2101 }) 2102 } 2103 } 2104 2105 // initLogging initializes the server logging. 2106 // 2107 // Called very early during server startup process. Many server fields may not 2108 // be initialized yet, be careful. 2109 // 2110 // When running in production uses the ugly looking JSON format that is hard to 2111 // read by humans but which is parsed by google-fluentd and GCP serverless 2112 // hosting environment. 2113 // 2114 // To support per-request log grouping in Cloud Logging UI there must be 2115 // two different log streams: 2116 // - A stream with top-level HTTP request entries (conceptually like Apache's 2117 // access.log, i.e. with one log entry per request). 2118 // - A stream with logs produced within requests (correlated with HTTP request 2119 // logs via the trace ID field). 
2120 // 2121 // Both streams are expected to have a particular format and use particular 2122 // fields for Cloud Logging UI to display them correctly. This technique is 2123 // primarily intended for GAE Flex, but it works in many Google environments: 2124 // https://cloud.google.com/appengine/articles/logging#linking_app_logs_and_requests 2125 // 2126 // On GKE we use 'stderr' stream for top-level HTTP request entries and 'stdout' 2127 // stream for logs produced by requests. 2128 // 2129 // On GAE and Cloud Run, the stream with top-level HTTP request entries is 2130 // produced by the GCP runtime itself. So we emit only logs produced within 2131 // requests (also to 'stdout', just like on GKE). 2132 // 2133 // In all environments 'stderr' stream is used to log all global activities that 2134 // happens outside of any request handler (stuff like initialization, shutdown, 2135 // background goroutines, etc). 2136 // 2137 // In non-production mode we use the human-friendly format and a single 'stderr' 2138 // log stream for everything. 2139 func (s *Server) initLogging() { 2140 if !s.Options.Prod { 2141 s.Context = gologger.StdConfig.Use(s.Context) 2142 s.Context = logging.SetLevel(s.Context, logging.Debug) 2143 s.logRequestCB = func(ctx context.Context, entry *sdlogger.LogEntry) { 2144 logging.Infof(ctx, "%d %s %q (%s)", 2145 entry.RequestInfo.Status, 2146 entry.RequestInfo.Method, 2147 entry.RequestInfo.URL, 2148 entry.RequestInfo.Latency, 2149 ) 2150 } 2151 return 2152 } 2153 2154 if s.Options.testStdout != nil { 2155 s.stdout = s.Options.testStdout 2156 } else { 2157 s.stdout = &sdlogger.Sink{Out: os.Stdout} 2158 } 2159 2160 if s.Options.testStderr != nil { 2161 s.stderr = s.Options.testStderr 2162 } else { 2163 s.stderr = &sdlogger.Sink{Out: os.Stderr} 2164 } 2165 2166 s.Context = logging.SetFactory(s.Context, 2167 sdlogger.Factory(s.stderr, sdlogger.LogEntry{ 2168 Operation: &sdlogger.Operation{ 2169 ID: s.genUniqueID(32), // correlate all global server logs together 2170 }, 2171 }, nil), 2172 ) 2173 s.Context = logging.SetLevel(s.Context, logging.Debug) 2174 2175 // Skip writing the root request log entry on Serverless GCP since the load 2176 // balancer there writes the entry itself. 2177 switch s.Options.Serverless { 2178 case module.GAE: 2179 // Skip. GAE writes it to "appengine.googleapis.com/request_log" itself. 2180 case module.CloudRun: 2181 // Skip. Cloud Run writes it to "run.googleapis.com/requests" itself. 2182 default: 2183 // Emit to stderr where Cloud Logging collectors pick it up. 2184 s.logRequestCB = func(_ context.Context, entry *sdlogger.LogEntry) { s.stderr.Write(entry) } 2185 } 2186 } 2187 2188 // initAuthStart initializes the core auth system by preparing the context 2189 // and verifying auth tokens can actually be minted (i.e. supplied credentials 2190 // are valid). 2191 // 2192 // It is called before the tsmon monitoring is initialized: tsmon needs auth. 2193 // The rest of the auth initialization (the part that needs tsmon) happens in 2194 // initAuthFinish after tsmon is initialized. 2195 func (s *Server) initAuthStart() error { 2196 // Make a transport that appends information about the server as User-Agent. 
2197 ua := s.Options.userAgent() 2198 rootTransport := clientauth.NewModifyingTransport(http.DefaultTransport, func(req *http.Request) error { 2199 newUA := ua 2200 if cur := req.UserAgent(); cur != "" { 2201 newUA += " " + cur 2202 } 2203 req.Header.Set("User-Agent", newUA) 2204 return nil 2205 }) 2206 2207 // Initialize the token generator based on s.Options.ClientAuth. 2208 opts := s.Options.ClientAuth 2209 2210 // Use `rootTransport` for calls made by the token generator (e.g. when 2211 // refreshing tokens). 2212 opts.Transport = rootTransport 2213 2214 // We aren't going to use the authenticator's transport (and thus its 2215 // monitoring), only the token source. DisableMonitoring == true removes some 2216 // log spam. 2217 opts.DisableMonitoring = true 2218 2219 // GCP is very aggressive in caching the token internally (in the metadata 2220 // server) and refreshing it only when it is very close to its expiration. We 2221 // need to match this behavior in our in-process cache, otherwise 2222 // GetAccessToken complains that the token refresh procedure doesn't actually 2223 // change the token (because the metadata server returned the cached one). 2224 opts.MinTokenLifetime = 20 * time.Second 2225 2226 // The default value for ClientAuth.SecretsDir is usually hardcoded to point 2227 // to where the token cache is located on developer machines (~/.config/...). 2228 // This location often doesn't exist when running from inside a container. 2229 // The token cache is also not really needed for production services that use 2230 // service accounts (they don't need cached refresh tokens). So in production 2231 // mode totally ignore default ClientAuth.SecretsDir and use whatever was 2232 // passed as -token-cache-dir. If it is empty (default), then no on-disk token 2233 // cache is used at all. 2234 // 2235 // If -token-cache-dir was explicitly set, always use it (even in dev mode). 2236 // This is useful when running containers locally: developer's credentials 2237 // on the host machine can be mounted inside the container. 2238 if s.Options.Prod || s.Options.TokenCacheDir != "" { 2239 opts.SecretsDir = s.Options.TokenCacheDir 2240 } 2241 2242 // Annotate the context used for logging from the token generator. 2243 ctx := logging.SetField(s.Context, "activity", "luci.auth") 2244 tokens := clientauth.NewTokenGenerator(ctx, opts) 2245 2246 // Prepare partially initialized structs for the auth.Config. They will be 2247 // fully initialized in initAuthFinish once we have a sufficiently working 2248 // auth context that can call Cloud IAM. 2249 s.signer = &signerImpl{srv: s} 2250 s.actorTokens = &actorTokensImpl{} 2251 2252 // Either use the explicitly passed AuthDB provider or the one initialized 2253 // by initAuthDB. 2254 provider := s.Options.AuthDBProvider 2255 if provider == nil { 2256 provider = func(context.Context) (authdb.DB, error) { 2257 db, _ := s.authDB.Load().(authdb.DB) // refreshed asynchronously in refreshAuthDB 2258 return db, nil 2259 } 2260 } 2261 2262 // Initialize the state in the context. 
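	// Once this state is installed, request handlers and background activities
	// can use the auth package against it. An illustrative sketch (not from the
	// original source) of what typical handler code does with it:
	//
	//	// Who is calling us, as authenticated by the configured methods?
	//	caller := auth.CurrentIdentity(ctx)
	//	// An authenticated transport for outbound calls made as the service itself.
	//	tr, err := auth.GetRPCTransport(ctx, auth.AsSelf, auth.WithScopes(auth.CloudOAuthScopes...))
	//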
2263 s.Context = auth.Initialize(s.Context, &auth.Config{ 2264 DBProvider: provider, 2265 Signer: s.signer, 2266 AccessTokenProvider: func(ctx context.Context, scopes []string) (*oauth2.Token, error) { 2267 return tokens.GenerateOAuthToken(ctx, scopes, 0) 2268 }, 2269 IDTokenProvider: func(ctx context.Context, audience string) (*oauth2.Token, error) { 2270 return tokens.GenerateIDToken(ctx, audience, 0) 2271 }, 2272 ActorTokensProvider: s.actorTokens, 2273 AnonymousTransport: func(context.Context) http.RoundTripper { return rootTransport }, 2274 FrontendClientID: func(context.Context) (string, error) { return s.Options.FrontendClientID, nil }, 2275 EndUserIP: endUserIP, 2276 IsDevMode: !s.Options.Prod, 2277 }) 2278 2279 // Note: we initialize a token source for one arbitrary set of scopes here. In 2280 // many practical cases this is sufficient to verify that credentials are 2281 // valid. For example, when we use service account JSON key, if we can 2282 // generate a token with *some* scope (meaning Cloud accepted our signature), 2283 // we can generate tokens with *any* scope, since there's no restrictions on 2284 // what scopes are accessible to a service account, as long as the private key 2285 // is valid (which we just verified by generating some token). 2286 _, err := tokens.GenerateOAuthToken(ctx, auth.CloudOAuthScopes, 0) 2287 if err != nil { 2288 // ErrLoginRequired may happen only when running the server locally using 2289 // developer's credentials. Let them know how the problem can be fixed. 2290 if !s.Options.Prod && err == clientauth.ErrLoginRequired { 2291 scopes := fmt.Sprintf("-scopes %q", strings.Join(auth.CloudOAuthScopes, " ")) 2292 if opts.ActAsServiceAccount != "" && opts.ActViaLUCIRealm == "" { 2293 scopes = "-scopes-iam" 2294 } 2295 logging.Errorf(s.Context, "Looks like you run the server locally and it doesn't have credentials for some OAuth scopes") 2296 logging.Errorf(s.Context, "Run the following command to set them up: ") 2297 logging.Errorf(s.Context, " $ luci-auth login %s", scopes) 2298 } 2299 return errors.Annotate(err, "failed to initialize the token source").Err() 2300 } 2301 2302 // Report who we are running as. Useful when debugging access issues. 2303 switch email, err := tokens.GetEmail(); { 2304 case err == nil: 2305 logging.Infof(s.Context, "Running as %s", email) 2306 s.runningAs = email 2307 case err == clientauth.ErrNoEmail: 2308 logging.Warningf(s.Context, "Running as <unknown>, cautiously proceeding...") 2309 case err != nil: 2310 return errors.Annotate(err, "failed to check the service account email").Err() 2311 } 2312 2313 return nil 2314 } 2315 2316 // initAuthFinish finishes auth system initialization. 2317 // 2318 // It is called after tsmon is initialized. 2319 func (s *Server) initAuthFinish() error { 2320 // We should be able to make basic authenticated requests now and can 2321 // construct a token source used by server's own guts to call Cloud APIs, 2322 // such us Cloud Trace and Cloud Error Reporting (and others). 2323 var err error 2324 s.cloudTS, err = auth.GetTokenSource(s.Context, auth.AsSelf, auth.WithScopes(auth.CloudOAuthScopes...)) 2325 if err != nil { 2326 return errors.Annotate(err, "failed to initialize the cloud token source").Err() 2327 } 2328 2329 // Finish constructing `signer` and `actorTokens` that were waiting for 2330 // an IAM client. 
2331 iamClient, err := credentials.NewIamCredentialsClient( 2332 s.Context, 2333 option.WithTokenSource(s.cloudTS), 2334 option.WithGRPCDialOption(grpc.WithStatsHandler(&grpcmon.ClientRPCStatsMonitor{})), 2335 option.WithGRPCDialOption(grpc.WithUnaryInterceptor(otelgrpc.UnaryClientInterceptor())), 2336 option.WithGRPCDialOption(grpc.WithStreamInterceptor(otelgrpc.StreamClientInterceptor())), 2337 ) 2338 if err != nil { 2339 return errors.Annotate(err, "failed to construct IAM client").Err() 2340 } 2341 s.RegisterCleanup(func(ctx context.Context) { iamClient.Close() }) 2342 s.signer.iamClient = iamClient 2343 s.actorTokens.iamClient = iamClient 2344 2345 // If not using a custom AuthDB provider, initialize the standard one that 2346 // fetches AuthDB (a database with groups and auth config) from a central 2347 // place. This also starts a goroutine to periodically refresh it. 2348 if s.Options.AuthDBProvider == nil { 2349 if err := s.initAuthDB(); err != nil { 2350 return errors.Annotate(err, "failed to initialize AuthDB").Err() 2351 } 2352 } 2353 2354 // Default RPC authentication methods. See also SetRPCAuthMethods. 2355 s.rpcAuthMethods = make([]auth.Method, 0, 2) 2356 if s.Options.OpenIDRPCAuthEnable { 2357 // The preferred authentication method. 2358 s.rpcAuthMethods = append(s.rpcAuthMethods, &openid.GoogleIDTokenAuthMethod{ 2359 AudienceCheck: openid.AudienceMatchesHost, 2360 Audience: s.Options.OpenIDRPCAuthAudience, 2361 SkipNonJWT: true, // pass OAuth2 access tokens through 2362 }) 2363 } 2364 // Backward compatibility for the RPC Explorer and old clients. 2365 s.rpcAuthMethods = append(s.rpcAuthMethods, &auth.GoogleOAuth2Method{ 2366 Scopes: []string{clientauth.OAuthScopeEmail}, 2367 }) 2368 2369 return nil 2370 } 2371 2372 // initAuthDB interprets -auth-db-* flags and sets up fetching of AuthDB. 2373 func (s *Server) initAuthDB() error { 2374 // Check flags are compatible. 2375 switch { 2376 case s.Options.AuthDBPath != "" && s.Options.AuthServiceHost != "": 2377 return errors.Reason("-auth-db-path and -auth-service-host can't be used together").Err() 2378 case s.Options.AuthServiceHost == "" && (s.Options.AuthDBDump != "" || s.Options.AuthDBSigner != ""): 2379 return errors.Reason("-auth-db-dump and -auth-db-signer can be used only with -auth-service-host").Err() 2380 case s.Options.AuthDBDump != "" && !strings.HasPrefix(s.Options.AuthDBDump, "gs://"): 2381 return errors.Reason("-auth-db-dump value should start with gs://, got %q", s.Options.AuthDBDump).Err() 2382 case strings.Contains(s.Options.AuthServiceHost, "/"): 2383 return errors.Reason("-auth-service-host should be a plain hostname, got %q", s.Options.AuthServiceHost).Err() 2384 } 2385 2386 // Fill in defaults. 2387 if s.Options.AuthServiceHost != "" { 2388 if s.Options.AuthDBDump == "" { 2389 s.Options.AuthDBDump = fmt.Sprintf("gs://%s/auth-db", s.Options.AuthServiceHost) 2390 } 2391 if s.Options.AuthDBSigner == "" { 2392 if !strings.HasSuffix(s.Options.AuthServiceHost, ".appspot.com") { 2393 return errors.Reason("-auth-db-signer is required if -auth-service-host is not *.appspot.com").Err() 2394 } 2395 s.Options.AuthDBSigner = fmt.Sprintf("%s@appspot.gserviceaccount.com", 2396 strings.TrimSuffix(s.Options.AuthServiceHost, ".appspot.com")) 2397 } 2398 } 2399 2400 // Fetch the initial copy of AuthDB. Note that this happens before we start 2401 // the serving loop, to make sure incoming requests have some AuthDB to use. 
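	// Handlers consult this database indirectly through the auth package. A
	// small illustrative sketch (not from the original source; the group name
	// is made up), e.g. inside a request handler:
	//
	//	switch yes, err := auth.IsMember(ctx, "administrators"); {
	//	case err != nil:
	//		// Treat the lookup error as an internal error.
	//	case !yes:
	//		// Reject the caller.
	//	}
	//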
2402 if err := s.refreshAuthDB(s.Context); err != nil { 2403 return errors.Annotate(err, "failed to load the initial AuthDB version").Err() 2404 } 2405 2406 // Periodically refresh it in the background. 2407 s.RunInBackground("luci.authdb", func(c context.Context) { 2408 for { 2409 jitter := time.Duration(rand.Int63n(int64(10 * time.Second))) 2410 if r := <-clock.After(c, 30*time.Second+jitter); r.Err != nil { 2411 return // the context is canceled 2412 } 2413 if err := s.refreshAuthDB(c); err != nil { 2414 // Don't log the error if the server is shutting down. 2415 if !errors.Is(err, context.Canceled) { 2416 logging.WithError(err).Errorf(c, "Failed to reload AuthDB, using the cached one") 2417 } 2418 } 2419 } 2420 }) 2421 return nil 2422 } 2423 2424 // refreshAuthDB reloads AuthDB from the source and stores it in memory. 2425 func (s *Server) refreshAuthDB(c context.Context) error { 2426 cur, _ := s.authDB.Load().(authdb.DB) 2427 db, err := s.fetchAuthDB(c, cur) 2428 if err != nil { 2429 return err 2430 } 2431 s.authDB.Store(db) 2432 return nil 2433 } 2434 2435 // fetchAuthDB fetches the most recent copy of AuthDB from the external source. 2436 // 2437 // Used only if Options.AuthDBProvider is nil. 2438 // 2439 // 'cur' is the currently used AuthDB or nil if fetching it for the first time. 2440 // Returns 'cur' as is if it's already fresh. 2441 func (s *Server) fetchAuthDB(c context.Context, cur authdb.DB) (authdb.DB, error) { 2442 // Loading from a local file (useful in integration tests). 2443 if s.Options.AuthDBPath != "" { 2444 r, err := os.Open(s.Options.AuthDBPath) 2445 if err != nil { 2446 return nil, errors.Annotate(err, "failed to open AuthDB file").Err() 2447 } 2448 defer r.Close() 2449 db, err := authdb.SnapshotDBFromTextProto(r) 2450 if err != nil { 2451 return nil, errors.Annotate(err, "failed to load AuthDB file").Err() 2452 } 2453 return db, nil 2454 } 2455 2456 // Loading from a GCS dump (s.Options.AuthDB* are validated here already). 2457 if s.Options.AuthDBDump != "" { 2458 c, cancel := clock.WithTimeout(c, 5*time.Minute) 2459 defer cancel() 2460 fetcher := dump.Fetcher{ 2461 StorageDumpPath: s.Options.AuthDBDump[len("gs://"):], 2462 AuthServiceURL: "https://" + s.Options.AuthServiceHost, 2463 AuthServiceAccount: s.Options.AuthDBSigner, 2464 OAuthScopes: auth.CloudOAuthScopes, 2465 } 2466 curSnap, _ := cur.(*authdb.SnapshotDB) 2467 snap, err := fetcher.FetchAuthDB(c, curSnap) 2468 if err != nil { 2469 return nil, errors.Annotate(err, "fetching from GCS dump failed").Err() 2470 } 2471 return snap, nil 2472 } 2473 2474 // In dev mode default to "allow everything". 2475 if !s.Options.Prod { 2476 return authdb.DevServerDB{}, nil 2477 } 2478 2479 // In prod mode default to "fail on any non-trivial check". Some services may 2480 // not need to use AuthDB at all and configuring it for them is a hassle. If 2481 // they try to use it for something vital, they'll see the error. 2482 return authdb.UnconfiguredDB{ 2483 Error: errors.Reason("a source of AuthDB is not configured, see -auth-* server flags").Err(), 2484 }, nil 2485 } 2486 2487 // initTSMon initializes time series monitoring state. 2488 func (s *Server) initTSMon() error { 2489 // We keep tsmon always enabled (flushing to /dev/null if no -ts-mon-* flags 2490 // are set) so that tsmon's in-process store is populated, and metrics there 2491 // can be examined via /admin/tsmon. This is useful when developing/debugging 2492 // tsmon metrics. 
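	// For reference, a metric defined elsewhere in application code (an
	// illustrative sketch, not part of this file; assumes the
	// go.chromium.org/luci/common/tsmon/metric package and a made-up metric
	// name) ends up in that same in-process store:
	//
	//	// In some application package:
	//	var loginCount = metric.NewCounter("myapp/logins", "Number of logins.", nil)
	//
	//	// In a request handler:
	//	loginCount.Add(ctx, 1)
	//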
2493 var customMonitor monitor.Monitor 2494 if s.Options.TsMonAccount == "" || s.Options.TsMonServiceName == "" || s.Options.TsMonJobName == "" { 2495 logging.Infof(s.Context, "tsmon is in the debug mode: metrics are collected, but flushed to /dev/null (pass -ts-mon-* flags to start uploading metrics)") 2496 customMonitor = monitor.NewNilMonitor() 2497 } 2498 2499 interval := int(s.Options.TsMonFlushInterval.Seconds()) 2500 if interval == 0 { 2501 interval = int(defaultTsMonFlushInterval.Seconds()) 2502 } 2503 timeout := int(s.Options.TsMonFlushTimeout.Seconds()) 2504 if timeout == 0 { 2505 timeout = int(defaultTsMonFlushTimeout.Seconds()) 2506 } 2507 if timeout >= interval { 2508 return errors.Reason("-ts-mon-flush-timeout (%ds) must be shorter than -ts-mon-flush-interval (%ds)", timeout, interval).Err() 2509 } 2510 s.tsmon = &tsmon.State{ 2511 CustomMonitor: customMonitor, 2512 Settings: &tsmon.Settings{ 2513 Enabled: true, 2514 ProdXAccount: s.Options.TsMonAccount, 2515 FlushIntervalSec: interval, 2516 FlushTimeoutSec: timeout, 2517 ReportRuntimeStats: true, 2518 }, 2519 Target: func(c context.Context) target.Task { 2520 // TODO(vadimsh): We pretend to be a GAE app for now to be able to 2521 // reuse existing dashboards. Each pod pretends to be a separate GAE 2522 // version. That way we can stop worrying about TaskNumAllocator and just 2523 // use 0 (since there'll be only one task per "version"). This looks 2524 // chaotic for deployments with large number of pods. 2525 return target.Task{ 2526 DataCenter: "appengine", 2527 ServiceName: s.Options.TsMonServiceName, 2528 JobName: s.Options.TsMonJobName, 2529 HostName: s.Options.Hostname, 2530 } 2531 }, 2532 } 2533 if customMonitor != nil { 2534 tsmon.PortalPage.SetReadOnlySettings(s.tsmon.Settings, 2535 "Running in the debug mode. Pass all -ts-mon-* command line flags to start uploading metrics.") 2536 } else { 2537 tsmon.PortalPage.SetReadOnlySettings(s.tsmon.Settings, 2538 "Settings are controlled through -ts-mon-* command line flags.") 2539 } 2540 2541 // Enable this configuration in s.Context so all transports created during 2542 // the server startup have tsmon instrumentation. 2543 s.tsmon.Activate(s.Context) 2544 2545 // Report our image version as a metric, useful to monitor rollouts. 2546 tsmoncommon.RegisterCallbackIn(s.Context, func(ctx context.Context) { 2547 versionMetric.Set(ctx, s.Options.ImageVersion()) 2548 }) 2549 2550 // Periodically flush metrics. 2551 s.RunInBackground("luci.tsmon", s.tsmon.FlushPeriodically) 2552 return nil 2553 } 2554 2555 // otelResource returns an OTEL resource identifying this server instance. 2556 // 2557 // It is just a bunch of labels essentially reported to monitoring backends 2558 // together with traces. 2559 func (s *Server) otelResource(ctx context.Context) (*resource.Resource, error) { 2560 return resource.New( 2561 ctx, 2562 resource.WithTelemetrySDK(), 2563 resource.WithDetectors(gcp.NewDetector()), 2564 resource.WithAttributes( 2565 semconv.ServiceName(fmt.Sprintf("%s/%s", s.Options.TsMonServiceName, s.Options.TsMonJobName)), 2566 semconv.ServiceInstanceID(s.Options.Hostname), 2567 semconv.ContainerImageName(s.Options.ImageName()), 2568 semconv.ContainerImageTag(s.Options.ImageVersion()), 2569 ), 2570 ) 2571 } 2572 2573 // otelErrorHandler returns a top-level OTEL error catcher. 2574 // 2575 // It just logs errors (with some dedupping to avoid spam). 
2576 func (s *Server) otelErrorHandler(ctx context.Context) otel.ErrorHandlerFunc { 2577 // State for suppressing repeated ResourceExhausted error messages, otherwise 2578 // logs may get flooded with them. They are usually not super important, but 2579 // ignoring them completely is also not great. 2580 errorDedup := struct { 2581 lock sync.Mutex 2582 report time.Time 2583 count int 2584 }{} 2585 return func(err error) { 2586 if !strings.Contains(err.Error(), "ResourceExhausted") { 2587 logging.Warningf(ctx, "Error in Cloud Trace exporter: %s", err) 2588 return 2589 } 2590 2591 errorDedup.lock.Lock() 2592 defer errorDedup.lock.Unlock() 2593 2594 errorDedup.count++ 2595 2596 if errorDedup.report.IsZero() || time.Since(errorDedup.report) > 5*time.Minute { 2597 if errorDedup.report.IsZero() { 2598 logging.Warningf(ctx, "Error in Cloud Trace exporter: %s", err) 2599 } else { 2600 logging.Warningf(ctx, "Error in Cloud Trace exporter: %s (%d occurrences in %s since the last report)", err, errorDedup.count, time.Since(errorDedup.report)) 2601 } 2602 errorDedup.report = time.Now() 2603 errorDedup.count = 0 2604 } 2605 } 2606 } 2607 2608 // otelSampler prepares a sampler based on CLI flags and environment. 2609 func (s *Server) otelSampler(ctx context.Context) (trace.Sampler, error) { 2610 // On GCP Serverless let the GCP load balancer make decisions about 2611 // sampling. If it decides to sample a trace, it will let us know through 2612 // options of the parent span in X-Cloud-Trace-Context. We will collect only 2613 // traces from requests that GCP wants to sample itself. Traces without 2614 // a parent context are never sampled. This also means traces from random 2615 // background goroutines aren't sampled either (i.e. we don't need GateSampler 2616 // as used below). 2617 if s.Options.Serverless.IsGCP() { 2618 logging.Infof(ctx, "Setting up Cloud Trace exports to %q using GCP Serverless sampling strategy", s.Options.CloudProject) 2619 return trace.ParentBased(trace.NeverSample()), nil 2620 } 2621 2622 // Parse -trace-sampling spec to get the base sampler. 2623 sampling := s.Options.TraceSampling 2624 if sampling == "" { 2625 sampling = "0.1qps" 2626 } 2627 logging.Infof(ctx, "Setting up Cloud Trace exports to %q (%s)", s.Options.CloudProject, sampling) 2628 sampler, err := internal.BaseSampler(sampling) 2629 if err != nil { 2630 return nil, errors.Annotate(err, "bad -trace-sampling").Err() 2631 } 2632 2633 // Sample only if the context is an incoming request context. This is needed 2634 // to avoid various background goroutines spamming with top-level spans. This 2635 // usually happens if a library is oblivious of tracing, but uses an 2636 // instrumented HTTP or gRPC client it got from outside, and the passes 2637 // context.Background() (or some unrelated context) to it. The end result is 2638 // lots and lots of non-informative disconnected top-level spans. 2639 // 2640 // Also skip sampling health check requests, they end up being spammy as well. 2641 sampler = internal.GateSampler(sampler, func(ctx context.Context) bool { 2642 req, _ := ctx.Value(&incomingRequestKey).(*incomingRequest) 2643 return req != nil && !req.healthCheck 2644 }) 2645 2646 // Inherit the sampling decision from a parent span. Note this totally ignores 2647 // `sampler` if there's a parent span (local or remote). This is usually what 2648 // we want to get complete trace trees with well-defined root and no gaps. 
2649 return trace.ParentBased(sampler), nil 2650 } 2651 2652 // otelSpanExporter initializes a trace spans exporter. 2653 func (s *Server) otelSpanExporter(ctx context.Context) (trace.SpanExporter, error) { 2654 return texporter.New( 2655 texporter.WithContext(ctx), 2656 texporter.WithProjectID(s.Options.CloudProject), 2657 texporter.WithTraceClientOptions([]option.ClientOption{ 2658 option.WithTokenSource(s.cloudTS), 2659 }), 2660 ) 2661 } 2662 2663 // initTracing initializes Cloud Trace exporter via OpenTelemetry. 2664 func (s *Server) initTracing() error { 2665 // Initialize a transformer that knows how to extract span info from the 2666 // context and serialize it as a bunch of headers and vice-versa. It is 2667 // invoked by otelhttp and otelgrpc middleware and when creating instrumented 2668 // HTTP clients. Recognize X-Cloud-Trace-Context for compatibility with traces 2669 // created by GCLB. 2670 // 2671 // It is used to parse incoming headers even when tracing is disabled, so 2672 // initialize it unconditionally, just don't install as a global propagator. 2673 s.propagator = propagation.NewCompositeTextMapPropagator( 2674 gcppropagator.CloudTraceOneWayPropagator{}, 2675 propagation.TraceContext{}, 2676 ) 2677 2678 // If tracing is disabled, just don't initialize OpenTelemetry library. All 2679 // tracing machinery would still nominally "work", just do nothing in a 2680 // relatively efficient way. 2681 if !s.Options.shouldEnableTracing() { 2682 return nil 2683 } 2684 2685 // Annotate logs from OpenTelemetry so they can be filtered in Cloud Logging. 2686 ctx := logging.SetField(s.Context, "activity", "luci.trace") 2687 2688 // TODO(vadimsh): Install OpenTelemetry global logger using otel.SetLogger(). 2689 // This will require implementing a hefty logr.LogSink interface on top of 2690 // the LUCI logger. Not doing that results in garbled stderr when OTEL wants 2691 // to log something (unclear when it happens exactly, if at all). 2692 2693 res, err := s.otelResource(ctx) 2694 if err != nil { 2695 return errors.Annotate(err, "failed to init OpenTelemetry resource").Err() 2696 } 2697 sampler, err := s.otelSampler(ctx) 2698 if err != nil { 2699 return errors.Annotate(err, "failed to init OpenTelemetry sampler").Err() 2700 } 2701 exp, err := s.otelSpanExporter(ctx) 2702 if err != nil { 2703 return errors.Annotate(err, "failed to init OpenTelemetry span exporter").Err() 2704 } 2705 2706 tp := trace.NewTracerProvider( 2707 trace.WithResource(res), 2708 trace.WithSampler(sampler), 2709 trace.WithBatcher(exp, 2710 trace.WithMaxQueueSize(8192), // how much to buffer before dropping 2711 trace.WithBatchTimeout(30*time.Second), // how long to buffer before flushing 2712 trace.WithExportTimeout(time.Minute), // deadline for the export RPC call 2713 trace.WithMaxExportBatchSize(2048), // size of a single RPC 2714 ), 2715 ) 2716 2717 s.RegisterCleanup(func(ctx context.Context) { 2718 ctx = logging.SetField(ctx, "activity", "luci.trace") 2719 if err := tp.ForceFlush(ctx); err != nil { 2720 logging.Errorf(ctx, "Final trace flush failed: %s", err) 2721 } 2722 if err := tp.Shutdown(ctx); err != nil { 2723 logging.Errorf(ctx, "Error shutting down TracerProvider: %s", err) 2724 } 2725 }) 2726 2727 // Register all globals to make them be used by default. 2728 otel.SetErrorHandler(s.otelErrorHandler(ctx)) 2729 otel.SetTracerProvider(tp) 2730 otel.SetTextMapPropagator(s.propagator) 2731 2732 return nil 2733 } 2734 2735 // initProfiling initialized Cloud Profiler. 
func (s *Server) initProfiling() error {
	// Skip if not enough configuration is given.
	switch {
	case !s.Options.Prod:
		return nil // silently skip, no need for log spam in dev mode
	case s.Options.CloudProject == "":
		logging.Infof(s.Context, "Cloud Profiler is disabled: -cloud-project is not set")
		return nil
	case s.Options.ProfilingServiceID == "" && s.Options.TsMonJobName == "":
		logging.Infof(s.Context, "Cloud Profiler is disabled: neither -profiling-service-id nor -ts-mon-job-name are set")
		return nil
	}

	// Enable the profiler based on the given probability. Low probabilities are
	// useful to avoid hitting Cloud Profiler quotas when running services with
	// many replicas. Profiles are aggregated anyway; for a large enough number
	// of servers it doesn't matter if only a random subset of them is sampled.
	sample := rand.Float64()
	if sample < s.Options.ProfilingProbability {
		if s.Options.ProfilingProbability >= 1.0 {
			logging.Infof(s.Context, "Cloud Profiler is enabled")
		} else {
			logging.Infof(s.Context,
				"Cloud Profiler is enabled: rand %.2f < profiling-probability %.2f",
				sample, s.Options.ProfilingProbability)
		}
	} else {
		if s.Options.ProfilingProbability <= 0 {
			logging.Infof(s.Context, "Cloud Profiler is disabled")
		} else {
			logging.Infof(s.Context,
				"Cloud Profiler is disabled: rand %.2f >= profiling-probability %.2f",
				sample, s.Options.ProfilingProbability)
		}
		return nil
	}

	cfg := profiler.Config{
		ProjectID:      s.Options.CloudProject,
		Service:        s.getServiceID(),
		ServiceVersion: s.Options.ImageVersion(),
		Instance:       s.Options.Hostname,
		// Note: these two options may potentially have impact on performance, but
		// it is likely small enough not to bother.
		MutexProfiling: true,
		AllocForceGC:   true,
	}

	// Launch the agent that runs in the background and periodically collects and
	// uploads profiles. It fails to launch if Service or ServiceVersion do not
	// pass regexp validation. Make it non-fatal, but still log.
	if err := profiler.Start(cfg, option.WithTokenSource(s.cloudTS)); err != nil {
		logging.Errorf(s.Context, "Cloud Profiler is disabled: failed to start - %s", err)
		return nil
	}

	logging.Infof(s.Context, "Set up Cloud Profiler (service %q, version %q)", cfg.Service, cfg.ServiceVersion)
	return nil
}

// getServiceID gets the service ID from either ProfilingServiceID or TsMonJobName.
func (s *Server) getServiceID() string {
	// Prefer ProfilingServiceID if given, fall back to TsMonJobName. Replace
	// the forbidden '/' symbol.
	serviceID := s.Options.ProfilingServiceID
	if serviceID == "" {
		serviceID = s.Options.TsMonJobName
	}
	serviceID = strings.ReplaceAll(serviceID, "/", "-")
	return serviceID
}

// initMainPort initializes the server on the options.HTTPAddr port.
func (s *Server) initMainPort() error {
	var err error
	s.mainPort, err = s.AddPort(PortOptions{
		Name:       "main",
		ListenAddr: s.Options.HTTPAddr,
	})
	if err != nil {
		return err
	}
	s.Routes = s.mainPort.Routes

	// Install auth info handlers (under "/auth/api/v1/server/").
	auth.InstallHandlers(s.Routes, nil)

	// Prepare the pRPC server.
// initMainPort initializes the server on options.HTTPAddr port.
func (s *Server) initMainPort() error {
	var err error
	s.mainPort, err = s.AddPort(PortOptions{
		Name:       "main",
		ListenAddr: s.Options.HTTPAddr,
	})
	if err != nil {
		return err
	}
	s.Routes = s.mainPort.Routes

	// Install auth info handlers (under "/auth/api/v1/server/").
	auth.InstallHandlers(s.Routes, nil)

	// Prepare the pRPC server. Its configuration will be finished in Serve after
	// all interceptors and authentication methods are registered.
	s.prpc = &prpc.Server{
		// Allow compression when not running on GAE. On GAE compression for text
		// responses is done by GAE itself and doing it in our code would be
		// wasteful.
		EnableResponseCompression: s.Options.Serverless != module.GAE,
	}
	discovery.Enable(s.prpc)
	s.prpc.InstallHandlers(s.Routes, nil)

	return nil
}

// initGrpcPort initializes the listening gRPC port.
func (s *Server) initGrpcPort() error {
	if s.Options.GRPCAddr == "" || s.Options.GRPCAddr == "-" {
		return nil // the gRPC port is disabled
	}
	listener, err := s.createListener(s.Options.GRPCAddr)
	if err != nil {
		return errors.Annotate(err, `failed to bind the listening port for "grpc" at %q`, s.Options.GRPCAddr).Err()
	}
	s.grpcPort = &grpcPort{listener: listener}
	s.ports = append(s.ports, s.grpcPort)
	return nil
}

// initAdminPort initializes the server on options.AdminAddr port.
func (s *Server) initAdminPort() error {
	if s.Options.AdminAddr == "-" {
		return nil // the admin port is disabled
	}

	// Admin portal uses XSRF tokens that require a secret key. We generate this
	// key randomly during process startup (i.e. now). It means XSRF tokens in
	// admin HTML pages rendered by a server process are understood only by the
	// exact same process. This is OK for admin pages (they are not behind load
	// balancers and we don't care that a server restart invalidates all tokens).
	secret := make([]byte, 20)
	if _, err := cryptorand.Read(secret); err != nil {
		return err
	}
	store := secrets.NewDerivedStore(secrets.Secret{Active: secret})
	withAdminSecret := router.NewMiddlewareChain(func(c *router.Context, next router.Handler) {
		c.Request = c.Request.WithContext(secrets.Use(c.Request.Context(), store))
		next(c)
	})

	// Install endpoints accessible through the admin port only.
	adminPort, err := s.AddPort(PortOptions{
		Name:           "admin",
		ListenAddr:     s.Options.AdminAddr,
		DisableMetrics: true, // do not pollute HTTP metrics with admin-only routes
	})
	if err != nil {
		return err
	}
	routes := adminPort.Routes

	routes.GET("/", nil, func(c *router.Context) {
		http.Redirect(c.Writer, c.Request, "/admin/portal", http.StatusFound)
	})
	portal.InstallHandlers(routes, withAdminSecret, portal.AssumeTrustedPort)

	// Install pprof endpoints on the admin port. Note that they must not be
	// exposed via the main serving port, since they do no authentication and
	// may leak internal information. Also note that pprof handlers rely on
	// routing structure not supported by our router, so we do a bit of manual
	// routing.
	//
	// See also internal/pprof.go for more profiling goodies exposed through the
	// admin portal.
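	//
	// For illustration (assuming the admin port is reachable, e.g. via port
	// forwarding): named profiles such as /debug/pprof/heap or
	// /debug/pprof/goroutine fall through to pprof.Index, which serves them by
	// name, and standard tooling like
	//	go tool pprof http://<admin-addr>/debug/pprof/profile
	// can fetch a CPU profile through this route.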
	routes.GET("/debug/pprof/*path", nil, func(c *router.Context) {
		switch strings.TrimPrefix(c.Params.ByName("path"), "/") {
		case "cmdline":
			pprof.Cmdline(c.Writer, c.Request)
		case "profile":
			pprof.Profile(c.Writer, c.Request)
		case "symbol":
			pprof.Symbol(c.Writer, c.Request)
		case "trace":
			pprof.Trace(c.Writer, c.Request)
		default:
			pprof.Index(c.Writer, c.Request)
		}
	})
	return nil
}

// initErrorReporting initializes an Error Reporting client.
func (s *Server) initErrorReporting() error {
	if !s.Options.CloudErrorReporting || s.Options.CloudProject == "" {
		return nil
	}

	// Get token source to call Error Reporting API.
	var err error
	s.errRptClient, err = errorreporting.NewClient(s.Context, s.Options.CloudProject, errorreporting.Config{
		ServiceName:    s.getServiceID(),
		ServiceVersion: s.Options.ImageVersion(),
		OnError: func(err error) {
			// TODO(crbug/1204640): s/Warningf/Errorf once "Error Reporting" is itself
			// more reliable.
			logging.Warningf(s.Context, "Error Reporting could not log error: %s", err)
		},
	}, option.WithTokenSource(s.cloudTS))
	if err != nil {
		return err
	}

	s.RegisterCleanup(func(ctx context.Context) { s.errRptClient.Close() })
	return nil
}

// initWarmup schedules execution of global warmup callbacks.
//
// On GAE it also registers the /_ah/warmup route.
func (s *Server) initWarmup() error {
	// See https://cloud.google.com/appengine/docs/standard/go/configuring-warmup-requests.
	// All warmups should happen *before* the serving loop and /_ah/warmup should
	// just always return OK.
	if s.Options.Serverless == module.GAE {
		s.Routes.GET("/_ah/warmup", nil, func(*router.Context) {})
	}
	s.RegisterWarmup(func(ctx context.Context) { warmup.Warmup(ctx) })
	return nil
}

// signerImpl implements signing.Signer on top of *Server.
type signerImpl struct {
	srv       *Server
	iamClient *credentials.IamCredentialsClient
}

// SignBytes signs the blob with some active private key.
func (s *signerImpl) SignBytes(ctx context.Context, blob []byte) (keyName string, signature []byte, err error) {
	resp, err := s.iamClient.SignBlob(ctx, &credentialspb.SignBlobRequest{
		Name:    "projects/-/serviceAccounts/" + s.srv.runningAs,
		Payload: blob,
	})
	if err != nil {
		return "", nil, grpcutil.WrapIfTransient(err)
	}
	return resp.KeyId, resp.SignedBlob, nil
}

// Certificates returns a bundle with public certificates for all active keys.
func (s *signerImpl) Certificates(ctx context.Context) (*signing.PublicCertificates, error) {
	return signing.FetchCertificatesForServiceAccount(ctx, s.srv.runningAs)
}

// ServiceInfo returns information about the current service.
func (s *signerImpl) ServiceInfo(ctx context.Context) (*signing.ServiceInfo, error) {
	return &signing.ServiceInfo{
		AppID:              s.srv.Options.CloudProject,
		AppRuntime:         "go",
		AppRuntimeVersion:  runtime.Version(),
		AppVersion:         s.srv.Options.ImageVersion(),
		ServiceAccountName: s.srv.runningAs,
	}, nil
}

// actorTokensImpl implements auth.ActorTokensProvider using IAM Credentials.
type actorTokensImpl struct {
	iamClient *credentials.IamCredentialsClient
}

// GenerateAccessToken generates an access token for the given account.
func (a *actorTokensImpl) GenerateAccessToken(ctx context.Context, serviceAccount string, scopes, delegates []string) (*oauth2.Token, error) {
	resp, err := a.iamClient.GenerateAccessToken(ctx, &credentialspb.GenerateAccessTokenRequest{
		Name:      "projects/-/serviceAccounts/" + serviceAccount,
		Scope:     scopes,
		Delegates: delegatesList(delegates),
	})
	if err != nil {
		return nil, grpcutil.WrapIfTransient(err)
	}
	return &oauth2.Token{
		AccessToken: resp.AccessToken,
		TokenType:   "Bearer",
		Expiry:      resp.ExpireTime.AsTime(),
	}, nil
}

// GenerateIDToken generates an ID token for the given account.
func (a *actorTokensImpl) GenerateIDToken(ctx context.Context, serviceAccount, audience string, delegates []string) (string, error) {
	resp, err := a.iamClient.GenerateIdToken(ctx, &credentialspb.GenerateIdTokenRequest{
		Name:         "projects/-/serviceAccounts/" + serviceAccount,
		Audience:     audience,
		Delegates:    delegatesList(delegates),
		IncludeEmail: true,
	})
	if err != nil {
		return "", grpcutil.WrapIfTransient(err)
	}
	return resp.Token, nil
}

// delegatesList prepends `projects/-/serviceAccounts/` to emails.
func delegatesList(emails []string) []string {
	if len(emails) == 0 {
		return nil
	}
	out := make([]string, len(emails))
	for i, email := range emails {
		out[i] = "projects/-/serviceAccounts/" + email
	}
	return out
}

// networkAddrsForLog returns a string with IPv4 addresses of local network
// interfaces, if possible.
func networkAddrsForLog() string {
	addrs, err := net.InterfaceAddrs()
	if err != nil {
		return fmt.Sprintf("failed to enumerate network interfaces: %s", err)
	}
	var ips []string
	for _, address := range addrs {
		if ipnet, ok := address.(*net.IPNet); ok && !ipnet.IP.IsLoopback() {
			if ipv4 := ipnet.IP.To4(); ipv4 != nil {
				ips = append(ips, ipv4.String())
			}
		}
	}
	if len(ips) == 0 {
		return "<no IPv4 interfaces>"
	}
	return strings.Join(ips, ", ")
}

// endUserIP extracts end-user IP address from X-Forwarded-For header.
func endUserIP(r auth.RequestMetadata) string {
	// X-Forwarded-For header is set by Cloud Load Balancer and GCP Serverless
	// load balancer and has format:
	//   [<untrusted part>,]<IP that connected to LB>,<unimportant>[,<more>].
	//
	// <untrusted part> may be present if the original request from the Internet
	// comes with X-Forwarded-For header. We can't trust IPs specified there. We
	// assume GCP load balancers sanitize the format of this field though.
	//
	// <IP that connected to LB> is what we are after.
	//
	// <unimportant> is "global forwarding rule external IP" for GKE or
	// the constant "169.254.1.1" for GCP Serverless. We don't care about these.
	//
	// <more> is present only if we proxy the request through more layers of
	// load balancers *while it is already inside GKE cluster*. We assume we don't
	// do that (if we ever do, Options{...} should be extended with a setting that
	// specifies how many layers of load balancers to skip to get to the original
	// IP). On GCP Serverless <more> is always empty.
	//
	// See https://cloud.google.com/load-balancing/docs/https for more info.
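	//
	// Illustrative example (hypothetical addresses): for a header value of
	// "203.0.113.7, 198.51.100.2, 169.254.1.1" the code below returns
	// "198.51.100.2", the second-to-last element, i.e. the IP that connected
	// to the load balancer.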
	forwardedFor := strings.Split(r.Header("X-Forwarded-For"), ",")
	if len(forwardedFor) >= 2 {
		return strings.TrimSpace(forwardedFor[len(forwardedFor)-2])
	}

	// Fall back to the peer IP if X-Forwarded-For is not set. Happens when
	// connecting to the server's port directly from within the cluster.
	ip, _, err := net.SplitHostPort(r.RemoteAddr())
	if err != nil {
		return "0.0.0.0"
	}
	return ip
}

// isHealthCheckerUA returns true for known user agents of health probers.
func isHealthCheckerUA(ua string) bool {
	switch {
	case strings.HasPrefix(ua, "kube-probe/"): // Kubernetes
		return true
	case strings.HasPrefix(ua, "GoogleHC"): // Cloud Load Balancer
		return true
	default:
		return false
	}
}

// resolveDependencies sorts modules based on their dependencies.
//
// It also detects unfulfilled required dependencies.
func resolveDependencies(mods []module.Module) ([]module.Module, error) {
	// Build a map: module.Name => module.Module.
	modules := make(map[module.Name]module.Module, len(mods))
	for _, m := range mods {
		if _, ok := modules[m.Name()]; ok {
			return nil, errors.Reason("duplicate module %q", m.Name()).Err()
		}
		modules[m.Name()] = m
	}

	// Ensure all required dependencies exist, throw away missing optional
	// dependencies. The result is a directed graph that can be topo-sorted.
	graph := map[module.Name][]module.Name{}
	for _, m := range mods {
		for _, d := range m.Dependencies() {
			name := d.Dependency()
			if _, exists := modules[name]; !exists {
				if !d.Required() {
					continue
				}
				return nil, errors.Reason("module %q requires module %q which is not provided", m.Name(), name).Err()
			}
			graph[m.Name()] = append(graph[m.Name()], name)
		}
	}

	sorted := make([]module.Module, 0, len(graph))
	visited := make(map[module.Name]bool, len(graph))

	var visit func(n module.Name)
	visit = func(n module.Name) {
		if !visited[n] {
			visited[n] = true
			for _, dep := range graph[n] {
				visit(dep)
			}
			sorted = append(sorted, modules[n])
		}
	}

	for _, m := range mods {
		visit(m.Name())
	}
	return sorted, nil
}
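
// Illustrative example (hypothetical module names): if module "c" requires "b"
// and "b" requires "a", resolveDependencies on [c, b, a] returns [a, b, c], so
// every module appears after all of its dependencies. A missing required
// dependency is an error, while missing optional dependencies are skipped.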