go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/server/server.go (about) 1 // Copyright 2019 The LUCI Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 // Package server implements an environment for running LUCI servers. 16 // 17 // It interprets command line flags and initializes the serving environment with 18 // the following core services: 19 // 20 // - go.chromium.org/luci/common/logging: logging via Google Cloud Logging. 21 // - go.opentelemetry.io/otel/trace: OpenTelemetry tracing with export to 22 // Google Cloud Trace. 23 // - go.chromium.org/luci/server/tsmon: monitoring metrics via ProdX. 24 // - go.chromium.org/luci/server/auth: sending and receiving RPCs 25 // authenticated with Google OAuth2 or OpenID tokens. Support for 26 // authorization via LUCI groups and LUCI realms. 27 // - go.chromium.org/luci/server/caching: in-process caching. 28 // - go.chromium.org/luci/server/warmup: allows other server components to 29 // register warmup callbacks that run before the server starts handling 30 // requests. 31 // - go.chromium.org/luci/server/experiments: simple feature flags support. 32 // - go.chromium.org/luci/grpc/prpc: pRPC server and RPC Explorer UI. 33 // - Error reporting via Google Cloud Error Reporting. 34 // - Continuous profiling via Google Cloud Profiler. 35 // 36 // Other functionality is optional and provided by modules (objects implementing 37 // module.Module interface). They should be passed to the server when it starts 38 // (see the example below). Modules usually expose their configuration via 39 // command line flags, and provide functionality by injecting state into 40 // the server's global context.Context or by exposing gRPC endpoints. 41 // 42 // Usage example: 43 // 44 // import ( 45 // ... 46 // 47 // "go.chromium.org/luci/server" 48 // "go.chromium.org/luci/server/gaeemulation" 49 // "go.chromium.org/luci/server/module" 50 // "go.chromium.org/luci/server/redisconn" 51 // ) 52 // 53 // func main() { 54 // modules := []module.Module{ 55 // gaeemulation.NewModuleFromFlags(), 56 // redisconn.NewModuleFromFlags(), 57 // } 58 // server.Main(nil, modules, func(srv *server.Server) error { 59 // // Initialize global state, change root context (if necessary). 60 // if err := initializeGlobalStuff(srv.Context); err != nil { 61 // return err 62 // } 63 // srv.Context = injectGlobalStuff(srv.Context) 64 // 65 // // Install regular HTTP routes. 66 // srv.Routes.GET("/", nil, func(c *router.Context) { 67 // // ... 68 // }) 69 // 70 // // Install gRPC services. 
//			servicepb.RegisterSomeServer(srv, &SomeServer{})
//			return nil
//		})
//	}
//
// More examples can be found in the code search: https://source.chromium.org/search?q=%22server.Main%28nil%2C%20modules%2C%22
//
// # Known modules
//
// The following modules (in alphabetical order) are part of the LUCI
// repository and can be used in any server binary:
//
//   - go.chromium.org/luci/config/server/cfgmodule: provides LUCI Config
//     client, exposes config validation endpoints used by LUCI Config service.
//   - go.chromium.org/luci/server/analytics: generates Google Analytics js
//     snippets for inclusion in a service's web pages.
//   - go.chromium.org/luci/server/bqlog: implements best effort low-overhead
//     structured logging to BigQuery suitable for debug data like access logs.
//   - go.chromium.org/luci/server/cron: allows registering Cloud Scheduler (aka
//     Appengine cron.yaml) handlers, with proper authentication and monitoring
//     metrics.
//   - go.chromium.org/luci/server/encryptedcookies: implements an
//     authentication scheme for HTTP routes based on encrypted cookies and user
//     sessions in some session store.
//   - go.chromium.org/luci/server/dsmapper: provides a way to apply some
//     function to all datastore entities of some particular kind, in parallel,
//     distributing work via Cloud Tasks.
//   - go.chromium.org/luci/server/gaeemulation: implements
//     go.chromium.org/luci/gae Datastore interface via Google Cloud Datastore
//     API. Named so because it enables migration of GAEv1 apps to GAEv2
//     without touching datastore-related code.
//   - go.chromium.org/luci/server/gerritauth: implements authentication using
//     Gerrit JWTs. Useful if a service is used by a Gerrit frontend plugin.
//   - go.chromium.org/luci/server/limiter: a simple load shedding mechanism
//     that puts a limit on the number of concurrent gRPC requests the server
//     is handling.
//   - go.chromium.org/luci/server/mailer: sending simple emails.
//   - go.chromium.org/luci/server/redisconn: a Redis client. Also enables Redis
//     as a caching backend for go.chromium.org/luci/server/caching and for
//     go.chromium.org/luci/gae/filter/dscache.
//   - go.chromium.org/luci/server/secrets: enables generation and validation of
//     HMAC-tagged tokens via go.chromium.org/luci/server/tokens.
//   - go.chromium.org/luci/server/span: a Cloud Spanner client. Wraps Spanner
//     API a bit to improve interoperability with other modules (in particular
//     the TQ module).
//   - go.chromium.org/luci/server/tq: implements a task queue mechanism on top
//     of Cloud Tasks and Cloud PubSub. Also implements transactional task
//     enqueuing when submitting tasks in a Cloud Datastore or a Cloud Spanner
//     transaction.
//
// Most of them need to be configured via corresponding CLI flags to be useful.
// See implementation of individual modules for details.
//
// An up-to-date list of all known module implementations can be found here:
// https://source.chromium.org/search?q=%22NewModuleFromFlags()%20module.Module%22
//
// # gRPC services
//
// The server implements the grpc.ServiceRegistrar interface, which means gRPC
// service implementations can be registered directly in it.
The registered services 131 // will be exposed via gRPC protocol over the gRPC port (if the gRPC serving 132 // port is configured in options) and via pRPC protocol over the main HTTP port 133 // (if the main HTTP serving port is configured in options). The server is also 134 // pre-configured with a set of gRPC interceptors that collect performance 135 // metrics, catch panics and authenticate requests. More interceptors can be 136 // added via RegisterUnaryServerInterceptors. 137 // 138 // # Security considerations 139 // 140 // The expected deployment environments are Kubernetes, Google App Engine and 141 // Google Cloud Run. In all cases the server is expected to be behind a load 142 // balancer or proxy (or a series of load balancers and proxies) that terminate 143 // TLS and set `X-Forwarded-For` and `X-Forwarded-Proto` headers. In particular 144 // `X-Forwarded-For` header should look like: 145 // 146 // [<untrusted part>,]<IP that connected to the LB>,<unimportant>[,<more>]. 147 // 148 // Where `<untrusted part>` may be present if the original request from the 149 // Internet comes with `X-Forwarded-For` header. The IP specified there is not 150 // trusted, but the server assumes the load balancer at least sanitizes the 151 // format of this field. 152 // 153 // `<IP that connected to the LB>` is the end-client IP that can be used by the 154 // server for logs and for IP-allowlist checks. 155 // 156 // `<unimportant>` is a "global forwarding rule external IP" for GKE or 157 // the constant "169.254.1.1" for GAE and Cloud Run. It is unused. See 158 // https://cloud.google.com/load-balancing/docs/https for more info. 159 // 160 // `<more>` may be present if the request was proxied through more layers of 161 // load balancers while already inside the cluster. The server currently assumes 162 // this is not happening (i.e. `<more>` is absent, or, in other words, the 163 // client IP is the second to last in the `X-Forwarded-For` list). If you need 164 // to recognize more layers of load balancing, please file a feature request to 165 // add a CLI flag specifying how many layers of load balancers to skip to get to 166 // the original IP. 
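//
// For illustration (the addresses below are made-up documentation IPs), with a
// header like
//
//	X-Forwarded-For: 203.0.113.7, 198.51.100.2, 169.254.1.1
//
// the server takes 198.51.100.2 (the second to last entry) as the end-client
// IP: 203.0.113.7 is the untrusted client-supplied part and 169.254.1.1 is the
// unimportant trailing entry.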
167 package server 168 169 import ( 170 "context" 171 cryptorand "crypto/rand" 172 "crypto/sha256" 173 "encoding/binary" 174 "encoding/hex" 175 "flag" 176 "fmt" 177 "math/rand" 178 "net" 179 "net/http" 180 "net/http/pprof" 181 "os" 182 "runtime" 183 "strings" 184 "sync" 185 "sync/atomic" 186 "time" 187 188 gcemetadata "cloud.google.com/go/compute/metadata" 189 "cloud.google.com/go/errorreporting" 190 credentials "cloud.google.com/go/iam/credentials/apiv1" 191 "cloud.google.com/go/iam/credentials/apiv1/credentialspb" 192 "cloud.google.com/go/profiler" 193 texporter "github.com/GoogleCloudPlatform/opentelemetry-operations-go/exporter/trace" 194 gcppropagator "github.com/GoogleCloudPlatform/opentelemetry-operations-go/propagator" 195 "go.opentelemetry.io/contrib/detectors/gcp" 196 "go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc" 197 "go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp" 198 "go.opentelemetry.io/otel" 199 "go.opentelemetry.io/otel/propagation" 200 "go.opentelemetry.io/otel/sdk/resource" 201 "go.opentelemetry.io/otel/sdk/trace" 202 semconv "go.opentelemetry.io/otel/semconv/v1.17.0" 203 oteltrace "go.opentelemetry.io/otel/trace" 204 "golang.org/x/oauth2" 205 "google.golang.org/api/option" 206 codepb "google.golang.org/genproto/googleapis/rpc/code" 207 "google.golang.org/grpc" 208 "google.golang.org/grpc/status" 209 210 clientauth "go.chromium.org/luci/auth" 211 "go.chromium.org/luci/common/clock" 212 "go.chromium.org/luci/common/errors" 213 luciflag "go.chromium.org/luci/common/flag" 214 "go.chromium.org/luci/common/flag/stringlistflag" 215 "go.chromium.org/luci/common/iotools" 216 "go.chromium.org/luci/common/logging" 217 "go.chromium.org/luci/common/logging/gologger" 218 "go.chromium.org/luci/common/logging/sdlogger" 219 "go.chromium.org/luci/common/system/signals" 220 tsmoncommon "go.chromium.org/luci/common/tsmon" 221 "go.chromium.org/luci/common/tsmon/metric" 222 "go.chromium.org/luci/common/tsmon/monitor" 223 "go.chromium.org/luci/common/tsmon/target" 224 "go.chromium.org/luci/grpc/discovery" 225 "go.chromium.org/luci/grpc/grpcmon" 226 "go.chromium.org/luci/grpc/grpcutil" 227 "go.chromium.org/luci/grpc/prpc" 228 "go.chromium.org/luci/hardcoded/chromeinfra" // should be used ONLY in Main() 229 "go.chromium.org/luci/web/rpcexplorer" 230 231 "go.chromium.org/luci/server/auth" 232 "go.chromium.org/luci/server/auth/authdb" 233 "go.chromium.org/luci/server/auth/authdb/dump" 234 "go.chromium.org/luci/server/auth/openid" 235 "go.chromium.org/luci/server/auth/signing" 236 "go.chromium.org/luci/server/caching" 237 "go.chromium.org/luci/server/experiments" 238 "go.chromium.org/luci/server/internal" 239 "go.chromium.org/luci/server/internal/gae" 240 "go.chromium.org/luci/server/middleware" 241 "go.chromium.org/luci/server/module" 242 "go.chromium.org/luci/server/portal" 243 "go.chromium.org/luci/server/router" 244 "go.chromium.org/luci/server/secrets" 245 "go.chromium.org/luci/server/tsmon" 246 "go.chromium.org/luci/server/warmup" 247 ) 248 249 const ( 250 // Path of the health check endpoint. 251 healthEndpoint = "/healthz" 252 253 // Log a warning if health check is slower than this. 
	healthTimeLogThreshold    = 50 * time.Millisecond
	defaultTsMonFlushInterval = 60 * time.Second
	defaultTsMonFlushTimeout  = 15 * time.Second
)

var (
	versionMetric = metric.NewString(
		"server/version",
		"Version of the running container image (taken from -container-image-id).",
		nil)
)

// cloudRegionFromGAERegion maps GAE region codes (e.g. `s`) to corresponding
// cloud regions (e.g. `us-central1`), which may be defined as regions where GAE
// creates resources associated with the app, such as Task Queues or Flex VMs.
//
// Sadly this mapping is not documented, thus the below map is incomplete. Feel
// free to modify it if you deployed to some new GAE region.
//
// This mapping is unused if the `-cloud-region` flag is passed explicitly.
var cloudRegionFromGAERegion = map[string]string{
	"e": "europe-west1",
	"g": "europe-west2",
	"h": "europe-west3",
	"m": "us-west2",
	"p": "us-east1",
	"s": "us-central1",
}

// Context key of *incomingRequest{...}, see httpRoot(...) and grpcRoot(...).
var incomingRequestKey = "go.chromium.org/luci/server.incomingRequest"

// Main initializes the server and runs its serving loop until SIGTERM.
//
// Registers all options in the default flag set and uses `flag.Parse` to parse
// them. If 'opts' is nil, the default options will be used. Only flags are
// allowed in the command line (no positional arguments).
//
// Additionally recognizes GAE_* and K_* env vars as an indicator that the
// server is running in the corresponding serverless runtime. This slightly
// tweaks its behavior to match what these runtimes expect from servers.
//
// On errors, logs them and aborts the process with a non-zero exit code.
func Main(opts *Options, mods []module.Module, init func(srv *Server) error) {
	// Prepopulate defaults for flags based on the runtime environment.
	opts, err := OptionsFromEnv(opts)
	if err != nil {
		fmt.Fprintf(os.Stderr, "When constructing options: %s\n", err)
		os.Exit(3)
	}

	// Register and parse server flags.
	opts.Register(flag.CommandLine)
	flag.Parse()
	if args := flag.Args(); len(args) > 0 {
		fmt.Fprintf(os.Stderr, "got unexpected positional command line arguments: %v\n", args)
		os.Exit(3)
	}

	// Construct the server and run its serving loop.
	srv, err := New(context.Background(), *opts, mods)
	if err != nil {
		srv.Fatal(err)
	}
	if init != nil {
		if err = init(srv); err != nil {
			srv.Fatal(err)
		}
	}
	if err = srv.Serve(); err != nil {
		srv.Fatal(err)
	}
}

// Options are used to configure the server.
//
// Most of them are exposed as command line flags (see Register implementation).
// Some (specific to serverless runtimes) are only settable through code or are
// derived from the environment.
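//
// As an illustrative sketch (the field values here are made up), options can
// also be populated in code rather than via flags when constructing the server
// directly:
//
//	opts := server.Options{
//		HTTPAddr:     "localhost:8800",
//		CloudProject: "example-project",
//	}
//	srv, err := server.New(ctx, opts, nil)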
type Options struct {
	Prod       bool              // set when running in production (not on a dev workstation)
	Serverless module.Serverless // set when running in a serverless environment, implies Prod
	Hostname   string            // used for logging and metric fields, default is os.Hostname

	HTTPAddr  string // address to bind the main listening socket to
	GRPCAddr  string // address to bind the gRPC listening socket to
	AdminAddr string // address to bind the admin socket to, ignored on GAE and Cloud Run
	AllowH2C  bool   // if true, allow HTTP/2 Cleartext traffic on non-gRPC HTTP ports

	DefaultRequestTimeout  time.Duration // how long non-internal HTTP handlers are allowed to run, 1 min by default
	InternalRequestTimeout time.Duration // how long "/internal/*" HTTP handlers are allowed to run, 10 min by default
	ShutdownDelay          time.Duration // how long to wait after SIGTERM before shutting down

	ClientAuth       clientauth.Options // base settings for client auth options
	TokenCacheDir    string             // where to cache auth tokens (optional)
	AuthDBProvider   auth.DBProvider    // source of the AuthDB: if set all Auth* options below are ignored
	AuthDBPath       string             // if set, load AuthDB from a file
	AuthServiceHost  string             // hostname of an Auth Service to use
	AuthDBDump       string             // Google Storage path to fetch AuthDB dumps from
	AuthDBSigner     string             // service account that signs AuthDB dumps
	FrontendClientID string             // OAuth2 ClientID for frontend (e.g. user sign in)

	OpenIDRPCAuthEnable   bool                // if true, use OIDC identity tokens for RPC authentication
	OpenIDRPCAuthAudience stringlistflag.Flag // additional allowed OIDC token audiences

	CloudProject string // name of the hosting Google Cloud Project
	CloudRegion  string // name of the hosting Google Cloud region

	TraceSampling string // what portion of traces to upload to Cloud Trace (ignored on GAE and Cloud Run)

	TsMonAccount       string        // service account to flush metrics as
	TsMonServiceName   string        // service name of tsmon target
	TsMonJobName       string        // job name of tsmon target
	TsMonFlushInterval time.Duration // how often to flush metrics
	TsMonFlushTimeout  time.Duration // timeout for flushing

	ProfilingProbability float64 // a [0; 1.0] float with the probability of enabling Cloud Profiler in the process
	ProfilingServiceID   string  // service name to associate with profiles in Cloud Profiler

	ContainerImageID string // ID of the container image with this binary, for logs (optional)

	EnableExperiments []string // names of go.chromium.org/luci/server/experiments to enable

	CloudErrorReporting bool // set to true to enable Cloud Error Reporting

	testSeed           int64                   // used to seed rng in tests
	testStdout         sdlogger.LogEntryWriter // mocks stdout in tests
	testStderr         sdlogger.LogEntryWriter // mocks stderr in tests
	testListeners      map[string]net.Listener // addr => net.Listener, for tests
	testDisableTracing bool                    // don't install a tracing backend
}

// OptionsFromEnv prepopulates options based on the runtime environment.
//
// It detects if the process is running on GAE or Cloud Run and adjusts options
// accordingly. See FromGAEEnv and FromCloudRunEnv for exact details of how it
// happens.
//
// Either mutates the given `opts`, returning it in the end, or (if `opts` is
// nil) creates new Options.
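//
// A minimal sketch of how Main uses it (error handling omitted):
//
//	opts, err := server.OptionsFromEnv(&server.Options{})
//	opts.Register(flag.CommandLine)
//	flag.Parse()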
394 func OptionsFromEnv(opts *Options) (*Options, error) { 395 if opts == nil { 396 opts = &Options{} 397 } 398 399 // Populate unset ClientAuth fields with hardcoded defaults. 400 authDefaults := chromeinfra.DefaultAuthOptions() 401 if opts.ClientAuth.ClientID == "" { 402 opts.ClientAuth.ClientID = authDefaults.ClientID 403 opts.ClientAuth.ClientSecret = authDefaults.ClientSecret 404 } 405 if opts.ClientAuth.TokenServerHost == "" { 406 opts.ClientAuth.TokenServerHost = authDefaults.TokenServerHost 407 } 408 if opts.ClientAuth.SecretsDir == "" { 409 opts.ClientAuth.SecretsDir = authDefaults.SecretsDir 410 } 411 412 // Use CloudOAuthScopes by default when using UserCredentialsMethod auth mode. 413 // This is ignored when running in the cloud (the server uses the ambient 414 // credentials provided by the environment). 415 if len(opts.ClientAuth.Scopes) == 0 { 416 opts.ClientAuth.Scopes = auth.CloudOAuthScopes 417 } 418 419 // Prepopulate defaults for flags based on the runtime environment. 420 opts.FromGAEEnv() 421 if err := opts.FromCloudRunEnv(); err != nil { 422 return nil, errors.Annotate(err, "failed to probe Cloud Run environment").Err() 423 } 424 return opts, nil 425 } 426 427 // Register registers the command line flags. 428 func (o *Options) Register(f *flag.FlagSet) { 429 if o.HTTPAddr == "" { 430 o.HTTPAddr = "localhost:8800" 431 } 432 if o.GRPCAddr == "" { 433 o.GRPCAddr = "-" // disabled by default 434 } 435 if o.AdminAddr == "" { 436 o.AdminAddr = "localhost:8900" 437 } 438 if o.DefaultRequestTimeout == 0 { 439 o.DefaultRequestTimeout = time.Minute 440 } 441 if o.InternalRequestTimeout == 0 { 442 o.InternalRequestTimeout = 10 * time.Minute 443 } 444 if o.ShutdownDelay == 0 { 445 o.ShutdownDelay = 15 * time.Second 446 } 447 if o.TsMonFlushInterval == 0 { 448 o.TsMonFlushInterval = defaultTsMonFlushInterval 449 } 450 if o.TsMonFlushTimeout == 0 { 451 o.TsMonFlushTimeout = defaultTsMonFlushTimeout 452 } 453 if o.ProfilingProbability == 0 { 454 o.ProfilingProbability = 1.0 455 } else if o.ProfilingProbability < 0 { 456 o.ProfilingProbability = 0 457 } 458 f.BoolVar(&o.Prod, "prod", o.Prod, "Switch the server into production mode") 459 f.StringVar(&o.HTTPAddr, "http-addr", o.HTTPAddr, "Address to bind the main listening socket to or '-' to disable") 460 f.StringVar(&o.GRPCAddr, "grpc-addr", o.GRPCAddr, "Address to bind the gRPC listening socket to or '-' to disable") 461 f.StringVar(&o.AdminAddr, "admin-addr", o.AdminAddr, "Address to bind the admin socket to or '-' to disable") 462 f.BoolVar(&o.AllowH2C, "allow-h2c", o.AllowH2C, "If set, allow HTTP/2 Cleartext traffic on non-gRPC HTTP ports (in addition to HTTP/1 traffic). 
The gRPC port always allows it, it is essential for gRPC") 463 f.DurationVar(&o.DefaultRequestTimeout, "default-request-timeout", o.DefaultRequestTimeout, "How long incoming HTTP requests are allowed to run before being canceled (or 0 for infinity)") 464 f.DurationVar(&o.InternalRequestTimeout, "internal-request-timeout", o.InternalRequestTimeout, "How long incoming /internal/* HTTP requests are allowed to run before being canceled (or 0 for infinity)") 465 f.DurationVar(&o.ShutdownDelay, "shutdown-delay", o.ShutdownDelay, "How long to wait after SIGTERM before shutting down") 466 f.StringVar( 467 &o.ClientAuth.ServiceAccountJSONPath, 468 "service-account-json", 469 o.ClientAuth.ServiceAccountJSONPath, 470 "Path to a JSON file with service account private key", 471 ) 472 f.StringVar( 473 &o.ClientAuth.ActAsServiceAccount, 474 "act-as", 475 o.ClientAuth.ActAsServiceAccount, 476 "Act as this service account", 477 ) 478 f.StringVar( 479 &o.TokenCacheDir, 480 "token-cache-dir", 481 o.TokenCacheDir, 482 "Where to cache auth tokens (optional)", 483 ) 484 f.StringVar( 485 &o.AuthDBPath, 486 "auth-db-path", 487 o.AuthDBPath, 488 "If set, load AuthDB text proto from this file (incompatible with -auth-service-host)", 489 ) 490 f.StringVar( 491 &o.AuthServiceHost, 492 "auth-service-host", 493 o.AuthServiceHost, 494 "Hostname of an Auth Service to use (incompatible with -auth-db-path)", 495 ) 496 f.StringVar( 497 &o.AuthDBDump, 498 "auth-db-dump", 499 o.AuthDBDump, 500 "Google Storage path to fetch AuthDB dumps from. Default is gs://<auth-service-host>/auth-db", 501 ) 502 f.StringVar( 503 &o.AuthDBSigner, 504 "auth-db-signer", 505 o.AuthDBSigner, 506 "Service account that signs AuthDB dumps. Default is derived from -auth-service-host if it is *.appspot.com", 507 ) 508 f.StringVar( 509 &o.FrontendClientID, 510 "frontend-client-id", 511 o.FrontendClientID, 512 "OAuth2 clientID for use in frontend, e.g. for user sign in (optional)", 513 ) 514 f.BoolVar( 515 &o.OpenIDRPCAuthEnable, 516 "open-id-rpc-auth-enable", 517 o.OpenIDRPCAuthEnable, 518 "If set accept OpenID Connect ID tokens as per-RPC credentials", 519 ) 520 f.Var( 521 &o.OpenIDRPCAuthAudience, 522 "open-id-rpc-auth-audience", 523 "Additional accepted value of `aud` claim in OpenID tokens, can be repeated", 524 ) 525 f.StringVar( 526 &o.CloudProject, 527 "cloud-project", 528 o.CloudProject, 529 "Name of hosting Google Cloud Project (optional)", 530 ) 531 f.StringVar( 532 &o.CloudRegion, 533 "cloud-region", 534 o.CloudRegion, 535 "Name of hosting Google Cloud region, e.g. 'us-central1' (optional)", 536 ) 537 f.StringVar( 538 &o.TraceSampling, 539 "trace-sampling", 540 o.TraceSampling, 541 "What portion of traces to upload to Cloud Trace. Either a percent (i.e. '0.1%') or a QPS (i.e. '1qps'). Ignored on GAE and Cloud Run. Default is 0.1qps.", 542 ) 543 f.StringVar( 544 &o.TsMonAccount, 545 "ts-mon-account", 546 o.TsMonAccount, 547 "Collect and flush tsmon metrics using this account for auth (disables tsmon if not set)", 548 ) 549 f.StringVar( 550 &o.TsMonServiceName, 551 "ts-mon-service-name", 552 o.TsMonServiceName, 553 "Service name of tsmon target (disables tsmon if not set)", 554 ) 555 f.StringVar( 556 &o.TsMonJobName, 557 "ts-mon-job-name", 558 o.TsMonJobName, 559 "Job name of tsmon target (disables tsmon if not set)", 560 ) 561 f.DurationVar( 562 &o.TsMonFlushInterval, 563 "ts-mon-flush-interval", 564 o.TsMonFlushInterval, 565 fmt.Sprintf("How often to flush tsmon metrics. 
Defaults to %s if < 1s or unset", o.TsMonFlushInterval),
	)
	f.DurationVar(
		&o.TsMonFlushTimeout,
		"ts-mon-flush-timeout",
		o.TsMonFlushTimeout,
		fmt.Sprintf("Timeout for tsmon flush. Defaults to %s if < 1s or unset. Must be shorter than --ts-mon-flush-interval.", o.TsMonFlushTimeout),
	)
	f.Float64Var(
		&o.ProfilingProbability,
		"profiling-probability",
		o.ProfilingProbability,
		fmt.Sprintf("A float [0; 1.0] with the probability of enabling Cloud Profiler for the current process. Default is %f.", o.ProfilingProbability),
	)
	f.StringVar(
		&o.ProfilingServiceID,
		"profiling-service-id",
		o.ProfilingServiceID,
		"Service name to associate with profiles in Cloud Profiler. Defaults to the value of -ts-mon-job-name.",
	)
	f.StringVar(
		&o.ContainerImageID,
		"container-image-id",
		o.ContainerImageID,
		"ID of the container image with this binary, for logs (optional)",
	)
	f.BoolVar(
		&o.CloudErrorReporting,
		"cloud-error-reporting",
		o.CloudErrorReporting,
		"Enable Cloud Error Reporting",
	)

	// See go.chromium.org/luci/server/experiments.
	f.Var(luciflag.StringSlice(&o.EnableExperiments), "enable-experiment",
		`A name of the experiment to enable. May be repeated.`)
}

// FromGAEEnv uses the GAE_* env vars to configure the server for the GAE
// environment.
//
// Does nothing if GAE_VERSION is not set.
//
// Equivalent to passing the following flags:
//
//	-prod
//	-http-addr 0.0.0.0:${PORT}
//	-admin-addr -
//	-shutdown-delay 1s
//	-cloud-project ${GOOGLE_CLOUD_PROJECT}
//	-cloud-region <derived from the region code in GAE_APPLICATION>
//	-service-account-json :gce
//	-ts-mon-service-name ${GOOGLE_CLOUD_PROJECT}
//	-ts-mon-job-name ${GAE_SERVICE}
//
// Additionally the hostname and -container-image-id (used in metric and trace
// fields) are derived from available GAE_* env vars to be semantically similar
// to what they represent in the GKE environment.
//
// Note that a mapping between a region code in GAE_APPLICATION and
// the corresponding cloud region is not documented anywhere, so if you see
// warnings when your app starts up either update the code to recognize your
// region code or pass the '-cloud-region' argument explicitly in app.yaml.
//
// See https://cloud.google.com/appengine/docs/standard/go/runtime.
func (o *Options) FromGAEEnv() {
	if os.Getenv("GAE_VERSION") == "" {
		return
	}
	o.Serverless = module.GAE
	o.Prod = true
	o.Hostname = uniqueServerlessHostname(
		os.Getenv("GAE_SERVICE"),
		os.Getenv("GAE_DEPLOYMENT_ID"),
		os.Getenv("GAE_INSTANCE"),
	)
	o.HTTPAddr = fmt.Sprintf("0.0.0.0:%s", os.Getenv("PORT"))
	o.GRPCAddr = "-"
	o.AdminAddr = "-"
	o.ShutdownDelay = time.Second
	o.CloudProject = os.Getenv("GOOGLE_CLOUD_PROJECT")
	o.ClientAuth.ServiceAccountJSONPath = clientauth.GCEServiceAccount
	o.TsMonServiceName = os.Getenv("GOOGLE_CLOUD_PROJECT")
	o.TsMonJobName = os.Getenv("GAE_SERVICE")
	o.ContainerImageID = fmt.Sprintf("appengine/%s/%s:%s",
		os.Getenv("GOOGLE_CLOUD_PROJECT"),
		os.Getenv("GAE_SERVICE"),
		os.Getenv("GAE_VERSION"),
	)
	// Note: GAE_APPLICATION is missing on Flex.
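	// GAE_APPLICATION generally looks like "<region code>~<project id>"
	// (e.g. "s~example-project"), so the prefix before "~" is what gets looked
	// up in cloudRegionFromGAERegion above.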
	if appID := os.Getenv("GAE_APPLICATION"); appID != "" && o.CloudRegion == "" {
		o.CloudRegion = cloudRegionFromGAERegion[strings.Split(appID, "~")[0]]
	}
}

// FromCloudRunEnv recognizes the K_SERVICE environment variable and configures
// some options based on what it discovers in the environment.
//
// Does nothing if K_SERVICE is not set.
//
// Equivalent to passing the following flags:
//
//	-prod
//	-http-addr -
//	-grpc-addr -
//	-admin-addr -
//	-allow-h2c
//	-shutdown-delay 1s
//	-cloud-project <cloud project Cloud Run container is running in>
//	-cloud-region <cloud region Cloud Run container is running in>
//	-service-account-json :gce
//	-open-id-rpc-auth-enable
//	-ts-mon-service-name <cloud project Cloud Run container is running in>
//	-ts-mon-job-name ${K_SERVICE}
//
// Flags passed via the actual command line in the Cloud Run manifest override
// these prefilled defaults. In particular pass either `-http-addr` or
// `-grpc-addr` (or both) to enable the corresponding ports.
//
// Additionally the hostname (used in metric and trace fields) is derived from
// the environment to be semantically similar to what it looks like in the GKE
// environment.
func (o *Options) FromCloudRunEnv() error {
	if os.Getenv("K_SERVICE") == "" {
		return nil
	}

	// See https://cloud.google.com/run/docs/container-contract.
	project, err := gcemetadata.Get("project/project-id")
	if err != nil {
		return errors.Annotate(err, "failed to get the project ID").Err()
	}
	region, err := gcemetadata.Get("instance/region")
	if err != nil {
		return errors.Annotate(err, "failed to get the cloud region").Err()
	}
	// Region format returned by Cloud Run is `projects/PROJECT-NUMBER/regions/REGION`.
	parts := strings.Split(region, "/")
	region = parts[len(parts)-1]
	instance, err := gcemetadata.Get("instance/id")
	if err != nil {
		return errors.Annotate(err, "failed to get the instance ID").Err()
	}

	o.Serverless = module.CloudRun
	o.Prod = true
	o.Hostname = uniqueServerlessHostname(os.Getenv("K_REVISION"), instance)
	o.HTTPAddr = "-"
	o.GRPCAddr = "-"
	o.AdminAddr = "-"
	o.AllowH2C = true // to allow using HTTP2 end-to-end with `--use-http2` deployment flag
	o.ShutdownDelay = time.Second
	o.CloudProject = project
	o.CloudRegion = region
	o.ClientAuth.ServiceAccountJSONPath = clientauth.GCEServiceAccount
	o.OpenIDRPCAuthEnable = true
	o.TsMonServiceName = project
	o.TsMonJobName = os.Getenv("K_SERVICE")

	return nil
}

// uniqueServerlessHostname generates a hostname to use when running in a GCP
// serverless environment.
//
// Unlike GKE or GCE environments, serverless containers do not have a proper
// unique hostname set, but we still need to identify them uniquely in logs
// and monitoring metrics. They do have a giant hex instance ID string, but it
// is not informative on its own and cumbersome to use.
//
// This function produces a reasonably readable and unique string that looks
// like `parts[0]-parts[1]-...-hash(parts[last])`. It assumes the last string
// in `parts` is the giant instance ID.
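//
// For example (hypothetical inputs), uniqueServerlessHostname("default",
// "20240101t000000", "<giant hex instance id>") would produce something like
// "default-20240101t000000-<first 16 hex chars of SHA-256 of the instance id>".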
func uniqueServerlessHostname(parts ...string) string {
	id := sha256.Sum256([]byte(parts[len(parts)-1]))
	parts[len(parts)-1] = hex.EncodeToString(id[:])[:16]
	return strings.Join(parts, "-")
}

// ImageVersion extracts the image tag or digest from ContainerImageID.
//
// This is eventually reported as a value of 'server/version' metric.
//
// On GAE it would return the service version name based on GAE_VERSION env var,
// since ContainerImageID is artificially constructed to look like
// "appengine/${CLOUD_PROJECT}/${GAE_SERVICE}:${GAE_VERSION}".
//
// On Cloud Run it is the responsibility of the deployment layer to correctly
// populate the -container-image-id command line flag.
//
// Returns "unknown" if ContainerImageID is empty or malformed.
func (o *Options) ImageVersion() string {
	// Recognize "<path>@sha256:<digest>" and "<path>:<tag>".
	idx := strings.LastIndex(o.ContainerImageID, "@")
	if idx == -1 {
		idx = strings.LastIndex(o.ContainerImageID, ":")
	}
	if idx == -1 {
		return "unknown"
	}
	return o.ContainerImageID[idx+1:]
}

// ImageName extracts the image name from ContainerImageID.
//
// This is the part of ContainerImageID before ':' or '@'.
func (o *Options) ImageName() string {
	// Recognize "<path>@sha256:<digest>" and "<path>:<tag>".
	idx := strings.LastIndex(o.ContainerImageID, "@")
	if idx == -1 {
		idx = strings.LastIndex(o.ContainerImageID, ":")
	}
	if idx == -1 {
		return "unknown"
	}
	return o.ContainerImageID[:idx]
}

// userAgent derives a user-agent like string identifying the server.
func (o *Options) userAgent() string {
	return fmt.Sprintf("LUCI-Server (service: %s; job: %s; ver: %s);", o.TsMonServiceName, o.TsMonJobName, o.ImageVersion())
}

// shouldEnableTracing is true if options indicate we should enable tracing.
func (o *Options) shouldEnableTracing() bool {
	switch {
	case o.CloudProject == "":
		return false // nowhere to upload traces to
	case !o.Prod && o.TraceSampling == "":
		return false // in dev mode don't upload samples by default
	default:
		return !o.testDisableTracing
	}
}

// hostOptions constructs HostOptions for module.Initialize(...).
func (o *Options) hostOptions() module.HostOptions {
	return module.HostOptions{
		Prod:         o.Prod,
		Serverless:   o.Serverless,
		CloudProject: o.CloudProject,
		CloudRegion:  o.CloudRegion,
	}
}

// Server is responsible for initializing and launching the serving environment.
//
// Generally assumed to be a singleton: do not launch multiple Server instances
// within the same process, use AddPort instead if you want to expose multiple
// HTTP ports with different routers.
//
// Server can serve plain HTTP endpoints, routing them through a router.Router,
// and gRPC APIs (exposing them over gRPC and pRPC protocols). Use an instance
// of Server as a grpc.ServiceRegistrar when registering gRPC services. Services
// registered that way will be available via the gRPC protocol over the gRPC
// port and via the pRPC protocol over the main HTTP port. Interceptors can be
// added via RegisterUnaryServerInterceptors. RPC authentication can be
// configured via SetRPCAuthMethods.
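//
// For example (a sketch; servicepb and myInterceptor are placeholders):
//
//	servicepb.RegisterSomeServer(srv, &someImpl{})
//	srv.RegisterUnaryServerInterceptors(myInterceptor)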
//
// The pRPC protocol is served on the same port as the main HTTP router, making
// it possible to expose just a single HTTP port for everything (which is a
// requirement on Appengine).
//
// The native gRPC protocol is always served through a dedicated gRPC h2c port,
// since the gRPC library has its own HTTP/2 server implementation that is not
// compatible with the net/http package used everywhere else. There's an
// assortment of hacks to work around this, but many ultimately depend on the
// experimental and slow grpc.Server.ServeHTTP method. See
// https://github.com/grpc/grpc-go/issues/586 and
// https://github.com/grpc/grpc-go/issues/4620. Another often recommended
// workaround is https://github.com/soheilhy/cmux, which decides if a new
// connection is a gRPC one or a regular HTTP/2 one. It doesn't work when the
// server is running behind a load balancer that understands HTTP/2, since it
// just opens a **single** backend connection and sends both gRPC and regular
// HTTP/2 requests over it. This happens on Cloud Run, for example. See e.g.
// https://ahmet.im/blog/grpc-http-mux-go/.
//
// If you want to serve HTTP and gRPC over the same public port, configure your
// HTTP load balancer (e.g. https://cloud.google.com/load-balancing/docs/https)
// to route requests into appropriate containers and ports. Another alternative
// is to put an HTTP/2 proxy (e.g. Envoy) right into the pod with the server
// process and route traffic "locally" there. This option would also allow
// adding a local grpc-web proxy into the mix if necessary.
//
// The server doesn't do TLS termination (even for gRPC traffic). It must be
// sitting behind a load balancer or a proxy that terminates TLS and sends clear
// text (HTTP/1 or HTTP/2 for gRPC) requests to corresponding ports, injecting
// `X-Forwarded-*` headers. See the "Security considerations" section above for
// more details.
type Server struct {
	// Context is the root context used by all requests and background activities.
	//
	// Can be replaced (by a derived context) before the Serve call, for example
	// to inject values accessible to all request handlers.
	Context context.Context

	// Routes is a router for requests hitting the HTTPAddr port.
	//
	// This router is used for all requests whose Host header does not match any
	// specially registered per-host routers (see VirtualHost). Normally, there
	// are no such per-host routers, so usually Routes is used for all requests.
	//
	// This router is also accessible to the server modules and they can install
	// routes into it.
	//
	// Should be populated before the Serve call.
	Routes *router.Router

	// CookieAuth is an authentication method implemented via cookies.
	//
	// It is initialized only if the server has a module implementing such a
	// scheme (e.g. "go.chromium.org/luci/server/encryptedcookies").
	CookieAuth auth.Method

	// Options is a copy of options passed to New.
879 Options Options 880 881 startTime time.Time // for calculating uptime for /healthz 882 lastReqTime atomic.Value // time.Time when the last request started 883 884 stdout sdlogger.LogEntryWriter // for logging to stdout, nil in dev mode 885 stderr sdlogger.LogEntryWriter // for logging to stderr, nil in dev mode 886 errRptClient *errorreporting.Client // for reporting to the cloud Error Reporting 887 logRequestCB func(context.Context, *sdlogger.LogEntry) // if non-nil, need to emit request log entries via it 888 889 mainPort *Port // pre-registered main HTTP port, see initMainPort 890 grpcPort *grpcPort // non-nil when exposing a gRPC port 891 prpc *prpc.Server // pRPC server implementation exposed on the main port 892 893 mu sync.Mutex // protects fields below 894 ports []servingPort // all non-dummy ports (each one bound to a TCP socket) 895 started bool // true inside and after Serve 896 stopped bool // true inside and after Shutdown 897 ready chan struct{} // closed right before starting the serving loop 898 done chan struct{} // closed after Shutdown returns 899 900 // gRPC/pRPC configuration. 901 unaryInterceptors []grpc.UnaryServerInterceptor 902 streamInterceptors []grpc.StreamServerInterceptor 903 rpcAuthMethods []auth.Method 904 905 rndM sync.Mutex // protects rnd 906 rnd *rand.Rand // used to generate trace and operation IDs 907 908 bgrDone chan struct{} // closed to stop background activities 909 bgrWg sync.WaitGroup // waits for RunInBackground goroutines to stop 910 911 warmupM sync.Mutex // protects 'warmup' and the actual warmup critical section 912 warmup []func(context.Context) 913 914 cleanupM sync.Mutex // protects 'cleanup' and the actual cleanup critical section 915 cleanup []func(context.Context) 916 917 tsmon *tsmon.State // manages flushing of tsmon metrics 918 propagator propagation.TextMapPropagator // knows how to propagate trace headers 919 920 cloudTS oauth2.TokenSource // source of cloud-scoped tokens for Cloud APIs 921 signer *signerImpl // the signer used by the auth system 922 actorTokens *actorTokensImpl // for impersonating service accounts 923 authDB atomic.Value // if not using AuthDBProvider, the last known good authdb.DB instance 924 925 runningAs string // email of an account the server runs as 926 } 927 928 // servingPort represents either an HTTP or gRPC serving port. 929 type servingPort interface { 930 nameForLog() string 931 serve(baseCtx func() context.Context) error 932 shutdown(ctx context.Context) 933 } 934 935 // moduleHostImpl implements module.Host via server.Server. 936 // 937 // Just a tiny wrapper to make sure modules consume only curated limited set of 938 // the server API and do not retain the pointer to the server. 
939 type moduleHostImpl struct { 940 srv *Server 941 mod module.Module 942 invalid bool 943 cookieAuth auth.Method 944 } 945 946 func (h *moduleHostImpl) panicIfInvalid() { 947 if h.invalid { 948 panic("module.Host must not be used outside of Initialize") 949 } 950 } 951 952 func (h *moduleHostImpl) HTTPAddr() net.Addr { 953 h.panicIfInvalid() 954 if h.srv.mainPort.listener != nil { 955 return h.srv.mainPort.listener.Addr() 956 } 957 return nil 958 } 959 960 func (h *moduleHostImpl) GRPCAddr() net.Addr { 961 h.panicIfInvalid() 962 if h.srv.grpcPort != nil { 963 return h.srv.grpcPort.listener.Addr() 964 } 965 return nil 966 } 967 968 func (h *moduleHostImpl) Routes() *router.Router { 969 h.panicIfInvalid() 970 return h.srv.Routes 971 } 972 973 func (h *moduleHostImpl) RunInBackground(activity string, f func(context.Context)) { 974 h.panicIfInvalid() 975 h.srv.RunInBackground(activity, f) 976 } 977 978 func (h *moduleHostImpl) RegisterWarmup(cb func(context.Context)) { 979 h.panicIfInvalid() 980 h.srv.RegisterWarmup(cb) 981 } 982 983 func (h *moduleHostImpl) RegisterCleanup(cb func(context.Context)) { 984 h.panicIfInvalid() 985 h.srv.RegisterCleanup(cb) 986 } 987 988 func (h *moduleHostImpl) RegisterService(desc *grpc.ServiceDesc, impl any) { 989 h.panicIfInvalid() 990 h.srv.RegisterService(desc, impl) 991 } 992 993 func (h *moduleHostImpl) RegisterUnaryServerInterceptors(intr ...grpc.UnaryServerInterceptor) { 994 h.panicIfInvalid() 995 h.srv.RegisterUnaryServerInterceptors(intr...) 996 } 997 998 func (h *moduleHostImpl) RegisterStreamServerInterceptors(intr ...grpc.StreamServerInterceptor) { 999 h.panicIfInvalid() 1000 h.srv.RegisterStreamServerInterceptors(intr...) 1001 } 1002 1003 func (h *moduleHostImpl) RegisterCookieAuth(method auth.Method) { 1004 h.panicIfInvalid() 1005 h.cookieAuth = method 1006 } 1007 1008 // New constructs a new server instance. 1009 // 1010 // It hosts one or more HTTP servers and starts and stops them in unison. It is 1011 // also responsible for preparing contexts for incoming requests. 1012 // 1013 // The given context will become the root context of the server and will be 1014 // inherited by all handlers. 1015 // 1016 // On errors returns partially initialized server (always non-nil). At least 1017 // its logging will be configured and can be used to report the error. Trying 1018 // to use such partially initialized server for anything else is undefined 1019 // behavior. 1020 func New(ctx context.Context, opts Options, mods []module.Module) (srv *Server, err error) { 1021 seed := opts.testSeed 1022 if seed == 0 { 1023 if err := binary.Read(cryptorand.Reader, binary.BigEndian, &seed); err != nil { 1024 panic(err) 1025 } 1026 } 1027 1028 srv = &Server{ 1029 Context: ctx, 1030 Options: opts, 1031 startTime: clock.Now(ctx).UTC(), 1032 ready: make(chan struct{}), 1033 done: make(chan struct{}), 1034 rnd: rand.New(rand.NewSource(seed)), 1035 bgrDone: make(chan struct{}), 1036 } 1037 1038 // Cleanup what we can on failures. 1039 defer func() { 1040 if err != nil { 1041 srv.runCleanup() 1042 } 1043 }() 1044 1045 // Logging is needed to report any errors during the early initialization. 1046 srv.initLogging() 1047 1048 logging.Infof(srv.Context, "Server starting...") 1049 if srv.Options.ContainerImageID != "" { 1050 logging.Infof(srv.Context, "Container image is %s", srv.Options.ContainerImageID) 1051 } 1052 1053 // Need the hostname (e.g. pod name on k8s) for logs and metrics. 
1054 if srv.Options.Hostname == "" { 1055 srv.Options.Hostname, err = os.Hostname() 1056 if err != nil { 1057 return srv, errors.Annotate(err, "failed to get own hostname").Err() 1058 } 1059 } 1060 1061 switch srv.Options.Serverless { 1062 case module.GAE: 1063 logging.Infof(srv.Context, "Running on %s", srv.Options.Hostname) 1064 logging.Infof(srv.Context, "Instance is %q", os.Getenv("GAE_INSTANCE")) 1065 if srv.Options.CloudRegion == "" { 1066 if appID := os.Getenv("GAE_APPLICATION"); appID != "" { 1067 logging.Warningf(srv.Context, "Could not figure out the primary Cloud region based "+ 1068 "on the region code in GAE_APPLICATION %q, consider passing the region name "+ 1069 "via -cloud-region flag explicitly", appID) 1070 } 1071 } else { 1072 logging.Infof(srv.Context, "Cloud region is %s", srv.Options.CloudRegion) 1073 } 1074 // Initialize default tickets for background activities. These tickets are 1075 // overridden in per-request contexts with request-specific tickets. 1076 srv.Context = gae.WithTickets(srv.Context, gae.DefaultTickets()) 1077 case module.CloudRun: 1078 logging.Infof(srv.Context, "Running on %s", srv.Options.Hostname) 1079 logging.Infof(srv.Context, "Revision is %q", os.Getenv("K_REVISION")) 1080 default: 1081 // On k8s log pod IPs too, this is useful when debugging k8s routing. 1082 logging.Infof(srv.Context, "Running on %s (%s)", srv.Options.Hostname, networkAddrsForLog()) 1083 } 1084 1085 // Log enabled experiments, warn if some of them are unknown now. 1086 var exps []experiments.ID 1087 for _, name := range opts.EnableExperiments { 1088 if exp, ok := experiments.GetByName(name); ok { 1089 logging.Infof(ctx, "Enabling experiment %q", name) 1090 exps = append(exps, exp) 1091 } else { 1092 logging.Warningf(ctx, "Skipping unknown experiment %q", name) 1093 } 1094 } 1095 srv.Context = experiments.Enable(srv.Context, exps...) 1096 1097 // Configure base server subsystems by injecting them into the root context 1098 // inherited later by all requests. 
	srv.Context = caching.WithProcessCacheData(srv.Context, caching.NewProcessCacheData())
	if err := srv.initAuthStart(); err != nil {
		return srv, errors.Annotate(err, "failed to initialize auth").Err()
	}
	if err := srv.initTSMon(); err != nil {
		return srv, errors.Annotate(err, "failed to initialize tsmon").Err()
	}
	if err := srv.initAuthFinish(); err != nil {
		return srv, errors.Annotate(err, "failed to finish auth initialization").Err()
	}
	if err := srv.initTracing(); err != nil {
		return srv, errors.Annotate(err, "failed to initialize tracing").Err()
	}
	if err := srv.initErrorReporting(); err != nil {
		return srv, errors.Annotate(err, "failed to initialize error reporting").Err()
	}
	if err := srv.initProfiling(); err != nil {
		return srv, errors.Annotate(err, "failed to initialize profiling").Err()
	}
	if err := srv.initMainPort(); err != nil {
		return srv, errors.Annotate(err, "failed to initialize the main port").Err()
	}
	if err := srv.initGrpcPort(); err != nil {
		return srv, errors.Annotate(err, "failed to initialize the gRPC port").Err()
	}
	if err := srv.initAdminPort(); err != nil {
		return srv, errors.Annotate(err, "failed to initialize the admin port").Err()
	}
	if err := srv.initWarmup(); err != nil {
		return srv, errors.Annotate(err, "failed to initialize warmup callbacks").Err()
	}

	// Sort modules by their initialization order based on declared dependencies,
	// discover unfulfilled required dependencies.
	sorted, err := resolveDependencies(mods)
	if err != nil {
		return srv, err
	}

	// Initialize all modules in their topological order.
	impls := make([]*moduleHostImpl, len(sorted))
	for i, mod := range sorted {
		impls[i] = &moduleHostImpl{srv: srv, mod: mod}
		switch ctx, err := mod.Initialize(srv.Context, impls[i], srv.Options.hostOptions()); {
		case err != nil:
			return srv, errors.Annotate(err, "failed to initialize module %q", mod.Name()).Err()
		case ctx != nil:
			srv.Context = ctx
		}
		impls[i].invalid = true // make sure the module does not retain it
	}

	// Ensure there's only one CookieAuth method registered.
	var cookieAuthMod module.Module
	for _, impl := range impls {
		if impl.cookieAuth != nil {
			if cookieAuthMod != nil {
				return srv, errors.Reason(
					"conflict between %q and %q: both register a cookie auth scheme - pick one",
					cookieAuthMod.Name(), impl.mod.Name(),
				).Err()
			}
			cookieAuthMod = impl.mod
			srv.CookieAuth = impl.cookieAuth
		}
	}

	// Install the RPC Explorer, using the registered auth method if it is
	// compatible.
	rpcExpAuth, _ := srv.CookieAuth.(rpcexplorer.AuthMethod)
	rpcexplorer.Install(srv.Routes, rpcExpAuth)

	return srv, nil
}

// AddPort prepares and binds an additional serving HTTP port.
//
// Can be used to open more listening HTTP ports (in addition to opts.HTTPAddr
// and opts.AdminAddr). The returned Port object can be used to populate the
// router that serves requests hitting the added port.
//
// If opts.ListenAddr is '-', a dummy port will be added: it is a valid *Port
// object, but it is not actually exposed as a listening TCP socket. This is
// useful to disable listening ports without changing any code.
//
// Must be called before Serve (panics otherwise).
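//
// A sketch of usage (the port name, address, and handler are placeholders):
//
//	port, err := srv.AddPort(server.PortOptions{
//		Name:       "extra",
//		ListenAddr: "localhost:8801",
//	})
//	if err != nil {
//		return err
//	}
//	port.Routes.GET("/extra", nil, handler)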
func (s *Server) AddPort(opts PortOptions) (*Port, error) {
	port := &Port{
		Routes:   s.newRouter(opts),
		parent:   s,
		opts:     opts,
		allowH2C: s.Options.AllowH2C,
	}

	s.mu.Lock()
	defer s.mu.Unlock()
	if s.started {
		s.Fatal(errors.Reason("the server has already been started").Err())
	}

	if opts.ListenAddr != "-" {
		var err error
		if port.listener, err = s.createListener(opts.ListenAddr); err != nil {
			return nil, errors.Annotate(err, "failed to bind the listening port for %q at %q", opts.Name, opts.ListenAddr).Err()
		}
		// Add to the list of ports that actually have sockets listening.
		s.ports = append(s.ports, port)
	}

	return port, nil
}

// VirtualHost returns a router (registering it if necessary) used for requests
// that hit the main port (opts.HTTPAddr) and have the given Host header.
//
// Should be used in rare cases when the server is exposed through multiple
// domain names and requests should be routed differently based on what domain
// was used. If your server is serving only one domain name, or you don't care
// what domain name is used to access it, do not use VirtualHost.
//
// Note that requests that match some registered virtual host router won't
// reach the default router (server.Routes), even if the virtual host router
// doesn't have a route for them. Such requests finish with HTTP 404.
//
// Also the router created by VirtualHost is initially completely empty: the
// server and its modules don't install anything into it (there's intentionally
// no mechanism to do this). For that reason VirtualHost should never be used to
// register a router for the "main" domain name: it will make the default
// server.Routes (and all handlers installed there by server modules) useless,
// probably breaking the server. Put routes for the main server functionality
// directly into server.Routes instead, using VirtualHost only for routes that
// critically depend on the Host header.
//
// Must be called before Serve (panics otherwise).
func (s *Server) VirtualHost(host string) *router.Router {
	return s.mainPort.VirtualHost(host)
}

// createListener creates a TCP listener on the given address.
func (s *Server) createListener(addr string) (net.Listener, error) {
	// If not running tests, bind the socket as usual.
	if s.Options.testListeners == nil {
		return net.Listen("tcp", addr)
	}
	// In test mode the listener MUST be prepared already.
	l := s.Options.testListeners[addr]
	if l == nil {
		return nil, errors.Reason("test listener is not set").Err()
	}
	return l, nil
}

// newRouter creates a Router with the default middleware chain and routes.
func (s *Server) newRouter(opts PortOptions) *router.Router {
	s.mu.Lock()
	defer s.mu.Unlock()
	if s.started {
		s.Fatal(errors.Reason("the server has already been started").Err())
	}

	// This is a chain of router.Middleware. It is preceded by a chain of raw
	// net/http middlewares (see wrapHTTPHandler):
	//   * s.httpRoot: initializes *incomingRequest in the context.
	//   * otelhttp.NewHandler: opens a tracing span.
	//   * s.httpDispatch: finishes the context initialization.
1264 mw := router.NewMiddlewareChain( 1265 middleware.WithPanicCatcher, // transforms panics into HTTP 500 1266 ) 1267 if s.tsmon != nil && !opts.DisableMetrics { 1268 mw = mw.Extend(s.tsmon.Middleware) // collect HTTP requests metrics 1269 } 1270 1271 // Setup middleware chain used by ALL requests. 1272 r := router.New() 1273 r.Use(mw) 1274 1275 // Mandatory health check/readiness probe endpoint. 1276 r.GET(healthEndpoint, nil, func(c *router.Context) { 1277 c.Writer.Write([]byte(s.healthResponse(c.Request.Context()))) 1278 }) 1279 1280 // Add NotFound handler wrapped in our middlewares so that unrecognized 1281 // requests are at least logged. If we don't do that they'll be handled 1282 // completely silently and this is very confusing when debugging 404s. 1283 r.NotFound(nil, func(c *router.Context) { 1284 http.NotFound(c.Writer, c.Request) 1285 }) 1286 1287 return r 1288 } 1289 1290 // RunInBackground launches the given callback in a separate goroutine right 1291 // before starting the serving loop. 1292 // 1293 // If the server is already running, launches it right away. If the server 1294 // fails to start, the goroutines will never be launched. 1295 // 1296 // Should be used for background asynchronous activities like reloading configs. 1297 // 1298 // All logs lines emitted by the callback are annotated with "activity" field 1299 // which can be arbitrary, but by convention has format "<namespace>.<name>", 1300 // where "luci" namespace is reserved for internal activities. 1301 // 1302 // The context passed to the callback is canceled when the server is shutting 1303 // down. It is expected the goroutine will exit soon after the context is 1304 // canceled. 1305 func (s *Server) RunInBackground(activity string, f func(context.Context)) { 1306 s.bgrWg.Add(1) 1307 go func() { 1308 defer s.bgrWg.Done() 1309 1310 select { 1311 case <-s.ready: 1312 // Construct the context after the server is fully initialized. Cancel it 1313 // as soon as bgrDone is signaled. 1314 ctx, cancel := context.WithCancel(s.Context) 1315 if activity != "" { 1316 ctx = logging.SetField(ctx, "activity", activity) 1317 } 1318 defer cancel() 1319 go func() { 1320 select { 1321 case <-s.bgrDone: 1322 cancel() 1323 case <-ctx.Done(): 1324 } 1325 }() 1326 f(ctx) 1327 1328 case <-s.bgrDone: 1329 // the server is closed, no need to run f() anymore 1330 } 1331 }() 1332 } 1333 1334 // RegisterService is part of grpc.ServiceRegistrar interface. 1335 // 1336 // The registered service will be exposed through both gRPC and pRPC protocols 1337 // on corresponding ports. See Server doc. 1338 // 1339 // Must be called before Serve (panics otherwise). 1340 func (s *Server) RegisterService(desc *grpc.ServiceDesc, impl any) { 1341 s.mu.Lock() 1342 defer s.mu.Unlock() 1343 if s.started { 1344 s.Fatal(errors.Reason("the server has already been started").Err()) 1345 } 1346 s.prpc.RegisterService(desc, impl) 1347 if s.grpcPort != nil { 1348 s.grpcPort.registerService(desc, impl) 1349 } 1350 } 1351 1352 // RegisterUnaryServerInterceptors registers grpc.UnaryServerInterceptor's 1353 // applied to all unary RPCs that hit the server. 1354 // 1355 // Interceptors are chained in order they are registered, i.e. the first 1356 // registered interceptor becomes the outermost. The initial chain already 1357 // contains some base interceptors (e.g. for monitoring) and all interceptors 1358 // registered by server modules. RegisterUnaryServerInterceptors extends this 1359 // chain. 
Subsequent calls to RegisterUnaryServerInterceptors add more
// interceptors into the chain.
//
// Must be called before Serve (panics otherwise).
func (s *Server) RegisterUnaryServerInterceptors(intr ...grpc.UnaryServerInterceptor) {
	s.mu.Lock()
	defer s.mu.Unlock()
	if s.started {
		s.Fatal(errors.Reason("the server has already been started").Err())
	}
	s.unaryInterceptors = append(s.unaryInterceptors, intr...)
}

// RegisterStreamServerInterceptors registers grpc.StreamServerInterceptor's
// applied to all streaming RPCs that hit the server.
//
// Interceptors are chained in order they are registered, i.e. the first
// registered interceptor becomes the outermost. The initial chain already
// contains some base interceptors (e.g. for monitoring) and all interceptors
// registered by server modules. RegisterStreamServerInterceptors extends this
// chain. Subsequent calls to RegisterStreamServerInterceptors add more
// interceptors into the chain.
//
// Must be called before Serve (panics otherwise).
func (s *Server) RegisterStreamServerInterceptors(intr ...grpc.StreamServerInterceptor) {
	s.mu.Lock()
	defer s.mu.Unlock()
	if s.started {
		s.Fatal(errors.Reason("the server has already been started").Err())
	}
	s.streamInterceptors = append(s.streamInterceptors, intr...)
}

// RegisterUnifiedServerInterceptors registers the given interceptors into both
// unary and stream interceptor chains.
//
// It is just a convenience helper for UnifiedServerInterceptor's that usually
// need to be registered in both unary and stream interceptor chains. This
// method is equivalent to calling RegisterUnaryServerInterceptors and
// RegisterStreamServerInterceptors, passing corresponding flavors of
// interceptors to them.
//
// Must be called before Serve (panics otherwise).
func (s *Server) RegisterUnifiedServerInterceptors(intr ...grpcutil.UnifiedServerInterceptor) {
	s.mu.Lock()
	defer s.mu.Unlock()
	if s.started {
		s.Fatal(errors.Reason("the server has already been started").Err())
	}
	for _, cb := range intr {
		s.unaryInterceptors = append(s.unaryInterceptors, cb.Unary())
		s.streamInterceptors = append(s.streamInterceptors, cb.Stream())
	}
}

// ConfigurePRPC allows tweaking pRPC-specific server configuration.
//
// Use it only for changing pRPC-specific options (usually ones that are related
// to the HTTP protocol in some way). This method **must not be used** for
// registering interceptors or setting authentication options (changes to them
// done here will cause a panic). Instead use RegisterUnaryServerInterceptors to
// register interceptors or SetRPCAuthMethods to change how the server
// authenticates RPC requests. Changes done through these methods will apply
// to both gRPC and pRPC servers.
//
// Must be called before Serve (panics otherwise).
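//
// A sketch of the call shape (what to tweak inside the callback is up to the
// caller):
//
//	srv.ConfigurePRPC(func(p *prpc.Server) {
//		// Adjust pRPC-specific options of p here.
//	})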
1425 func (s *Server) ConfigurePRPC(cb func(srv *prpc.Server)) { 1426 s.mu.Lock() 1427 defer s.mu.Unlock() 1428 if s.started { 1429 s.Fatal(errors.Reason("the server has already been started").Err()) 1430 } 1431 cb(s.prpc) 1432 if s.prpc.UnaryServerInterceptor != nil { 1433 panic("use Server.RegisterUnaryServerInterceptors to register interceptors") 1434 } 1435 } 1436 1437 // SetRPCAuthMethods overrides how the server authenticates incoming gRPC and 1438 // pRPC requests. 1439 // 1440 // It receives a list of auth.Method implementations which will be applied 1441 // one after another to try to authenticate the request until the first 1442 // successful hit. If all methods end up to be non-applicable (i.e. none of the 1443 // methods notice any headers they recognize), the request will be passed 1444 // through to the handler as anonymous (coming from an "anonymous identity"). 1445 // Rejecting anonymous requests (if necessary) is the job of an authorization 1446 // layer, often implemented as a gRPC interceptor. For simple cases use 1447 // go.chromium.org/luci/server/auth/rpcacl interceptor. 1448 // 1449 // By default (if SetRPCAuthMethods is never called) the server will check 1450 // incoming requests have an `Authorization` header with a Google OAuth2 access 1451 // token that has `https://www.googleapis.com/auth/userinfo.email` scope (see 1452 // auth.GoogleOAuth2Method). Requests without `Authorization` header will be 1453 // considered anonymous. 1454 // 1455 // If OpenIDRPCAuthEnable option is set (matching `-open-id-rpc-auth-enable` 1456 // flag), the service will recognize ID tokens as well. This is important for 1457 // e.g. Cloud Run where this is the only authentication method supported 1458 // natively by the platform. ID tokens are also generally faster to check than 1459 // access tokens. 1460 // 1461 // Note that this call completely overrides the previously configured list of 1462 // methods instead of appending to it, since chaining auth methods is often 1463 // tricky and it is safer to just always provide the whole list at once. 1464 // 1465 // Passing an empty list of methods is allowed. All requests will be considered 1466 // anonymous in that case. 1467 // 1468 // Note that this call **doesn't affect** how plain HTTP requests (hitting the 1469 // main HTTP port and routed through s.Router) are authenticated. Very often 1470 // RPC requests and plain HTTP requests need different authentication methods 1471 // and using an RPC authentication for everything is incorrect. To authenticate 1472 // plain HTTP requests use auth.Authenticate(...) HTTP router middleware, 1473 // perhaps in combination with s.CookieAuth (which is non-nil if there is a 1474 // server module installed that provides a cookie-based authentication scheme). 1475 // 1476 // Must be called before Serve (panics otherwise). 1477 func (s *Server) SetRPCAuthMethods(methods []auth.Method) { 1478 s.mu.Lock() 1479 defer s.mu.Unlock() 1480 if s.started { 1481 s.Fatal(errors.Reason("the server has already been started").Err()) 1482 } 1483 s.rpcAuthMethods = methods 1484 } 1485 1486 // Serve launches the serving loop. 1487 // 1488 // Blocks forever or until the server is stopped via Shutdown (from another 1489 // goroutine or from a SIGTERM handler). Returns nil if the server was shutdown 1490 // correctly or an error if it failed to start or unexpectedly died. The error 1491 // is logged inside. 1492 // 1493 // Should be called only once. Panics otherwise. 
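//
// Minimal sketch (not from the original source), assuming srv is a fully
// configured *Server (e.g. when not going through server.Main) and
// somethingHappened is a hypothetical application-defined channel; in typical
// deployments the built-in SIGTERM handling makes an explicit Shutdown call
// unnecessary:
//
//	go func() {
//		<-somethingHappened
//		srv.Shutdown()
//	}()
//	if err := srv.Serve(); err != nil {
//		// The error has already been logged inside Serve.
//	}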
func (s *Server) Serve() error {
	// Set s.started flag to "lock" the configuration. This allows reading
	// fields like `s.ports` without fear of race conditions.
	s.mu.Lock()
	if s.started {
		s.mu.Unlock()
		s.Fatal(errors.Reason("the server has already been started").Err())
	}
	s.started = true
	s.mu.Unlock()

	// The configuration is "locked" now and we can finish the setup.
	authInterceptor := auth.AuthenticatingInterceptor(s.rpcAuthMethods)

	// Assemble the final interceptor chains: base interceptors => auth =>
	// whatever was installed by users of server.Server. Note we put grpcmon
	// before the panic catcher to make sure panics are actually reported to
	// the monitoring. grpcmon is also before the authentication to make sure
	// auth errors are reported as well.
	unaryInterceptors := append([]grpc.UnaryServerInterceptor{
		grpcmon.UnaryServerInterceptor,
		grpcutil.UnaryServerPanicCatcherInterceptor,
		authInterceptor.Unary(),
	}, s.unaryInterceptors...)
	streamInterceptors := append([]grpc.StreamServerInterceptor{
		grpcmon.StreamServerInterceptor,
		grpcutil.StreamServerPanicCatcherInterceptor,
		authInterceptor.Stream(),
	}, s.streamInterceptors...)

	// Finish setting up the pRPC server. It supports only unary RPCs. The root
	// request context is created in the HTTP land using base HTTP middlewares.
	s.prpc.UnaryServerInterceptor = grpcutil.ChainUnaryServerInterceptors(unaryInterceptors...)

	// Finish setting up the gRPC server, if enabled.
	if s.grpcPort != nil {
		grpcRoot := s.grpcRoot()
		grpcDispatch := s.grpcDispatch()
		s.grpcPort.addServerOptions(
			grpc.ChainUnaryInterceptor(
				grpcRoot.Unary(),
				otelgrpc.UnaryServerInterceptor(),
				grpcDispatch.Unary(),
			),
			grpc.ChainUnaryInterceptor(unaryInterceptors...),
			grpc.ChainStreamInterceptor(
				grpcRoot.Stream(),
				otelgrpc.StreamServerInterceptor(),
				grpcDispatch.Stream(),
			),
			grpc.ChainStreamInterceptor(streamInterceptors...),
		)
	}

	// Run registered best-effort warmup callbacks right before serving.
	s.runWarmup()

	// Catch SIGTERM while inside the serving loop. Upon receiving SIGTERM, wait
	// until the pod is removed from the load balancer before actually shutting
	// down and refusing new connections. If we shut down immediately, some
	// clients may see connection errors, because they are not yet aware the
	// server is closing: the Pod shutdown sequence and Endpoints list updates
	// race with each other, and we want the Endpoints list updates to win, i.e.
	// we want the pod to actually be fully alive as long as it is still
	// referenced in the Endpoints list. We can't guarantee this, but we can
	// improve the chances.
	stop := signals.HandleInterrupt(func() {
		if s.Options.Prod {
			s.waitUntilNotServing()
		}
		s.Shutdown()
	})
	defer stop()

	// Log how long it took from 'New' to the serving loop.
	logging.Infof(s.Context, "Startup done in %s", clock.Now(s.Context).Sub(s.startTime))

	// Unblock all pending RunInBackground goroutines, so they can start.
	close(s.ready)

	// Run serving loops in parallel.
1574 errs := make(errors.MultiError, len(s.ports)) 1575 wg := sync.WaitGroup{} 1576 wg.Add(len(s.ports)) 1577 for i, port := range s.ports { 1578 logging.Infof(s.Context, "Serving %s", port.nameForLog()) 1579 i := i 1580 port := port 1581 go func() { 1582 defer wg.Done() 1583 if err := port.serve(func() context.Context { return s.Context }); err != nil { 1584 logging.WithError(err).Errorf(s.Context, "Server %s failed", port.nameForLog()) 1585 errs[i] = err 1586 s.Shutdown() // close all other servers 1587 } 1588 }() 1589 } 1590 wg.Wait() 1591 1592 // Per http.Server docs, we end up here *immediately* after Shutdown call was 1593 // initiated. Some requests can still be in-flight. We block until they are 1594 // done (as indicated by Shutdown call itself exiting). 1595 logging.Infof(s.Context, "Waiting for the server to stop...") 1596 <-s.done 1597 logging.Infof(s.Context, "The serving loop stopped, running the final cleanup...") 1598 s.runCleanup() 1599 logging.Infof(s.Context, "The server has stopped") 1600 1601 if errs.First() != nil { 1602 return errs 1603 } 1604 return nil 1605 } 1606 1607 // Shutdown gracefully stops the server if it was running. 1608 // 1609 // Blocks until the server is stopped. Can be called multiple times. 1610 func (s *Server) Shutdown() { 1611 s.mu.Lock() 1612 defer s.mu.Unlock() 1613 if s.stopped { 1614 return 1615 } 1616 1617 logging.Infof(s.Context, "Shutting down the server...") 1618 1619 // Tell all RunInBackground goroutines to stop. 1620 close(s.bgrDone) 1621 1622 // Stop all http.Servers in parallel. Each Shutdown call blocks until the 1623 // corresponding server is stopped. 1624 wg := sync.WaitGroup{} 1625 wg.Add(len(s.ports)) 1626 for _, port := range s.ports { 1627 port := port 1628 go func() { 1629 defer wg.Done() 1630 port.shutdown(s.Context) 1631 }() 1632 } 1633 wg.Wait() 1634 1635 // Wait for all background goroutines to stop. 1636 s.bgrWg.Wait() 1637 1638 // Notify Serve that it can exit now. 1639 s.stopped = true 1640 close(s.done) 1641 } 1642 1643 // Fatal logs the error and immediately shuts down the process with exit code 3. 1644 // 1645 // No cleanup is performed. Deferred statements are not run. Not recoverable. 1646 func (s *Server) Fatal(err error) { 1647 errors.Log(s.Context, err) 1648 os.Exit(3) 1649 } 1650 1651 // healthResponse prepares text/plan response for the health check endpoints. 1652 // 1653 // It additionally contains some easy to obtain information that may help in 1654 // debugging deployments. 1655 func (s *Server) healthResponse(c context.Context) string { 1656 maybeEmpty := func(s string) string { 1657 if s == "" { 1658 return "<unknown>" 1659 } 1660 return s 1661 } 1662 return strings.Join([]string{ 1663 "OK", 1664 "", 1665 "uptime: " + clock.Now(c).Sub(s.startTime).String(), 1666 "image: " + maybeEmpty(s.Options.ContainerImageID), 1667 "", 1668 "service: " + maybeEmpty(s.Options.TsMonServiceName), 1669 "job: " + maybeEmpty(s.Options.TsMonJobName), 1670 "host: " + s.Options.Hostname, 1671 "", 1672 }, "\n") 1673 } 1674 1675 // waitUntilNotServing is called during the graceful shutdown and it tries to 1676 // figure out when the traffic stops flowing to the server (i.e. when it is 1677 // removed from the load balancer). 1678 // 1679 // It's a heuristic optimization for the case when the load balancer keeps 1680 // sending traffic to a terminating Pod for some time after the Pod entered 1681 // "Terminating" state. It can happen due to latencies in Endpoints list 1682 // updates. 
We want to keep the listening socket open as long as there are 1683 // incoming requests (but no longer than 1 min). 1684 func (s *Server) waitUntilNotServing() { 1685 logging.Infof(s.Context, "Received SIGTERM, waiting for the traffic to stop...") 1686 1687 // When the server is idle the loop below exits immediately and the server 1688 // enters the shutdown path, rejecting new connections. Since we gave 1689 // Kubernetes no time to update the Endpoints list, it is possible someone 1690 // still might send a request to the server (and it will be rejected). 1691 // To avoid that we always sleep a bit here to give Kubernetes a chance to 1692 // propagate the Endpoints list update everywhere. The loop below then 1693 // verifies clients got the update and stopped sending requests. 1694 time.Sleep(s.Options.ShutdownDelay) 1695 1696 deadline := clock.Now(s.Context).Add(time.Minute) 1697 for { 1698 now := clock.Now(s.Context) 1699 lastReq, ok := s.lastReqTime.Load().(time.Time) 1700 if !ok || now.Sub(lastReq) > 15*time.Second { 1701 logging.Infof(s.Context, "No requests received in the last 15 sec, proceeding with the shutdown...") 1702 break 1703 } 1704 if now.After(deadline) { 1705 logging.Warningf(s.Context, "Gave up waiting for the traffic to stop, proceeding with the shutdown...") 1706 break 1707 } 1708 time.Sleep(100 * time.Millisecond) 1709 } 1710 } 1711 1712 // RegisterWarmup registers a callback that is run in server's Serve right 1713 // before the serving loop. 1714 // 1715 // It receives the global server context (including all customizations made 1716 // by the user code in server.Main). Intended for best-effort warmups: there's 1717 // no way to gracefully abort the server startup from a warmup callback. 1718 // 1719 // Registering a new warmup callback from within a warmup causes a deadlock, 1720 // don't do that. 1721 func (s *Server) RegisterWarmup(cb func(context.Context)) { 1722 s.warmupM.Lock() 1723 defer s.warmupM.Unlock() 1724 s.warmup = append(s.warmup, cb) 1725 } 1726 1727 // runWarmup runs all registered warmup functions (sequentially in registration 1728 // order). 1729 func (s *Server) runWarmup() { 1730 s.warmupM.Lock() 1731 defer s.warmupM.Unlock() 1732 ctx := logging.SetField(s.Context, "activity", "luci.warmup") 1733 for _, cb := range s.warmup { 1734 cb(ctx) 1735 } 1736 } 1737 1738 // RegisterCleanup registers a callback that is run in Serve after the server 1739 // has exited the serving loop. 1740 // 1741 // Registering a new cleanup callback from within a cleanup causes a deadlock, 1742 // don't do that. 1743 func (s *Server) RegisterCleanup(cb func(context.Context)) { 1744 s.cleanupM.Lock() 1745 defer s.cleanupM.Unlock() 1746 s.cleanup = append(s.cleanup, cb) 1747 } 1748 1749 // runCleanup runs all registered cleanup functions (sequentially in reverse 1750 // order). 1751 func (s *Server) runCleanup() { 1752 s.cleanupM.Lock() 1753 defer s.cleanupM.Unlock() 1754 for i := len(s.cleanup) - 1; i >= 0; i-- { 1755 s.cleanup[i](s.Context) 1756 } 1757 } 1758 1759 // genUniqueBlob writes a pseudo-random byte blob into the given slice. 1760 func (s *Server) genUniqueBlob(b []byte) { 1761 s.rndM.Lock() 1762 s.rnd.Read(b) 1763 s.rndM.Unlock() 1764 } 1765 1766 // genUniqueID returns pseudo-random hex string of given even length. 1767 func (s *Server) genUniqueID(l int) string { 1768 b := make([]byte, l/2) 1769 s.genUniqueBlob(b) 1770 return hex.EncodeToString(b) 1771 } 1772 1773 // incomingRequest is a request received by the server. 
//
// It is either an HTTP or a gRPC request.
type incomingRequest struct {
	url         string               // the full URL for logs
	method      string               // HTTP method verb for logs, e.g. "POST"
	metadata    auth.RequestMetadata // headers etc.
	healthCheck bool                 // true if this is a health check request
}

// requestResult is logged after completion of a request.
type requestResult struct {
	statusCode   int            // the HTTP status code to log
	requestSize  int64          // the request size in bytes if known
	responseSize int64          // the response size in bytes if known
	extraFields  logging.Fields // extra fields to log (will be mutated!)
}

// wrapHTTPHandler wraps the port's router into net/http middlewares.
//
// TODO(vadimsh): Get rid of router.Middleware and move this to newRouter(...).
// Since the introduction of http.Request.Context() there's no reason for
// router.Middleware to exist anymore.
func (s *Server) wrapHTTPHandler(next http.Handler) http.Handler {
	return s.httpRoot(
		otelhttp.NewHandler(
			s.httpDispatch(next),
			"",
			otelhttp.WithMessageEvents(otelhttp.ReadEvents, otelhttp.WriteEvents),
			otelhttp.WithSpanNameFormatter(func(_ string, r *http.Request) string {
				return r.URL.Path
			}),
		),
	)
}

// httpRoot is the entry point for non-gRPC HTTP requests.
//
// It is a net/http middleware for interoperability with other existing
// net/http middlewares (currently only the OpenTelemetry otelhttp middleware).
//
// Its job is to initialize *incomingRequest in the context which is then
// examined by other middlewares (and the tracing sampler), in particular in
// httpDispatch.
//
// See grpcRoot(...) for a gRPC counterpart.
func (s *Server) httpRoot(next http.Handler) http.Handler {
	return http.HandlerFunc(func(rw http.ResponseWriter, r *http.Request) {
		// This context is derived from s.Context (see Serve) and has various server
		// systems injected into it already. Its only difference from s.Context is
		// that http.Server cancels it when the client disconnects, which we want.
		ctx := r.Context()

		// Apply the per-request HTTP timeout, if any.
		timeout := s.Options.DefaultRequestTimeout
		if strings.HasPrefix(r.URL.Path, "/internal/") {
			timeout = s.Options.InternalRequestTimeout
		}
		if timeout != 0 {
			var cancelCtx context.CancelFunc
			ctx, cancelCtx = context.WithTimeout(ctx, timeout)
			defer cancelCtx()
		}

		// Reconstruct the original URL for logging.
		protocol := r.Header.Get("X-Forwarded-Proto")
		if protocol != "https" {
			protocol = "http"
		}
		url := fmt.Sprintf("%s://%s%s", protocol, r.Host, r.RequestURI)

		// incomingRequest is used by middlewares that work with both HTTP and gRPC
		// requests, in particular it is used by startRequest(...).
		next.ServeHTTP(rw, r.WithContext(context.WithValue(ctx, &incomingRequestKey, &incomingRequest{
			url:         url,
			method:      r.Method,
			metadata:    auth.RequestMetadataForHTTP(r),
			healthCheck: r.RequestURI == healthEndpoint && isHealthCheckerUA(r.UserAgent()),
		})))
	})
}

// httpDispatch finishes HTTP request context initialization.
//
// Its primary purpose is to set up logging, but it also does some other context
// touches. See startRequest(...) where the bulk of the work happens.
//
// The next stop is the router.Middleware chain as registered in newRouter(...)
// and by the user code.
//
// See grpcDispatch(...) for a gRPC counterpart.
func (s *Server) httpDispatch(next http.Handler) http.Handler {
	return http.HandlerFunc(func(rw http.ResponseWriter, r *http.Request) {
		// Track how many response bytes are sent and what status is set, for logs.
		trackingRW := iotools.NewResponseWriter(rw)

		// Initialize the per-request context (logging, GAE tickets, etc).
		ctx, done := s.startRequest(r.Context())

		// Log the result when done.
		defer func() {
			done(&requestResult{
				statusCode:   trackingRW.Status(),
				requestSize:  r.ContentLength,
				responseSize: trackingRW.ResponseSize(),
			})
		}()

		next.ServeHTTP(trackingRW, r.WithContext(ctx))
	})
}

// grpcRoot is the entry point for gRPC requests.
//
// Its job is to initialize *incomingRequest in the context which is then
// examined by other middlewares (and the tracing sampler), in particular in
// grpcDispatch.
//
// See httpRoot(...) for an HTTP counterpart.
func (s *Server) grpcRoot() grpcutil.UnifiedServerInterceptor {
	return func(ctx context.Context, fullMethod string, handler func(ctx context.Context) error) (err error) {
		// incomingRequest is used by middlewares that work with both HTTP and gRPC
		// requests, in particular it is used by startRequest(...).
		//
		// Note that here `ctx` is already derived from s.Context (except it is
		// canceled if the client disconnects). See the grpcPort{} implementation.
		md := auth.RequestMetadataForGRPC(ctx)
		return handler(context.WithValue(ctx, &incomingRequestKey, &incomingRequest{
			url:         fmt.Sprintf("grpc://%s%s", md.Host(), fullMethod),
			method:      "POST",
			metadata:    md,
			healthCheck: strings.HasPrefix(fullMethod, "/grpc.health.") && isHealthCheckerUA(md.Header("User-Agent")),
		}))
	}
}

// grpcDispatch finishes gRPC request context initialization.
//
// Its primary purpose is to set up logging, but it also does some other context
// touches. See startRequest(...) where the bulk of the work happens.
//
// The next stop is the gRPC middleware chain as registered via the server's API.
//
// See httpDispatch(...) for an HTTP counterpart.
func (s *Server) grpcDispatch() grpcutil.UnifiedServerInterceptor {
	return func(ctx context.Context, fullMethod string, handler func(ctx context.Context) error) (err error) {
		// Initialize the per-request context (logging, GAE tickets, etc).
		ctx, done := s.startRequest(ctx)

		// Log the result when done.
		defer func() {
			code := status.Code(err)
			httpStatusCode := grpcutil.CodeStatus(code)

			// Log errors (for parity with the pRPC server behavior).
			switch {
			case httpStatusCode >= 400 && httpStatusCode < 500:
				logging.Warningf(ctx, "%s", err)
			case httpStatusCode >= 500:
				logging.Errorf(ctx, "%s", err)
			}

			// Report the canonical gRPC code as a log entry field for filtering by it.
1936 canonical, ok := codepb.Code_name[int32(code)] 1937 if !ok { 1938 canonical = fmt.Sprintf("%d", int64(code)) 1939 } 1940 1941 done(&requestResult{ 1942 statusCode: httpStatusCode, // this is an approximation 1943 extraFields: logging.Fields{"code": canonical}, 1944 }) 1945 }() 1946 1947 return handler(ctx) 1948 } 1949 } 1950 1951 // startRequest finishes preparing the per-request context. 1952 // 1953 // It returns a callback that must be called after finishing processing this 1954 // request. 1955 // 1956 // The incoming context is assumed to be derived by either httpRoot(...) or 1957 // grpcRoot(...) and have *incomingRequest inside. 1958 func (s *Server) startRequest(ctx context.Context) (context.Context, func(*requestResult)) { 1959 // The value *must* be there. Let it panic if it is not. 1960 req := ctx.Value(&incomingRequestKey).(*incomingRequest) 1961 1962 // If running on GAE, initialize the per-request API tickets needed to make 1963 // RPCs to the GAE service bridge. 1964 if s.Options.Serverless == module.GAE { 1965 ctx = gae.WithTickets(ctx, gae.RequestTickets(req.metadata)) 1966 } 1967 1968 // This is used in waitUntilNotServing. 1969 started := clock.Now(ctx) 1970 if !req.healthCheck { 1971 s.lastReqTime.Store(started) 1972 } 1973 1974 // If the tracing is completely disabled we'll have an empty span context. 1975 // But we need a trace ID in the context anyway for correlating logs (see 1976 // below). Open a noop non-recording span with random generated trace ID. 1977 span := oteltrace.SpanFromContext(ctx) 1978 spanCtx := span.SpanContext() 1979 if !spanCtx.HasTraceID() { 1980 var traceID oteltrace.TraceID 1981 s.genUniqueBlob(traceID[:]) 1982 spanCtx = oteltrace.NewSpanContext(oteltrace.SpanContextConfig{ 1983 TraceID: traceID, 1984 }) 1985 ctx = oteltrace.ContextWithSpanContext(ctx, spanCtx) 1986 } 1987 1988 // Associate all logs with one another by using the same trace ID, which also 1989 // matches the trace ID extracted by the propagator from incoming headers. 1990 // Make sure to use the full trace ID format that includes the project name. 1991 // This is important to group logs generated by us with logs generated by 1992 // the GCP (which uses the full trace ID) when running in Cloud. Outside of 1993 // Cloud it doesn't really matter what trace ID is used as long as all log 1994 // entries use the same one. 1995 traceID := spanCtx.TraceID().String() 1996 if s.Options.CloudProject != "" { 1997 traceID = fmt.Sprintf("projects/%s/traces/%s", s.Options.CloudProject, traceID) 1998 } 1999 2000 // SpanID can be missing if there's no actual tracing. This is fine. 2001 spanID := "" 2002 if spanCtx.HasSpanID() { 2003 spanID = spanCtx.SpanID().String() 2004 } 2005 2006 // When running in prod, make the logger emit log entries in JSON format that 2007 // Cloud Logger collectors understand natively. 2008 var severityTracker *sdlogger.SeverityTracker 2009 if s.Options.Prod { 2010 // Start assembling logging sink layers starting with the innermost one. 2011 logSink := s.stdout 2012 2013 // If we are going to log the overall request status, install the tracker 2014 // that observes the maximum emitted severity to use it as an overall 2015 // severity for the request log entry. 2016 if s.logRequestCB != nil { 2017 severityTracker = &sdlogger.SeverityTracker{Out: logSink} 2018 logSink = severityTracker 2019 } 2020 2021 // If have Cloud Error Reporting enabled, intercept errors to upload them. 2022 // TODO(vadimsh): Fill in `CloudErrorsSink.Request` with something. 
2023 if s.errRptClient != nil { 2024 logSink = &sdlogger.CloudErrorsSink{ 2025 Client: s.errRptClient, 2026 Out: logSink, 2027 } 2028 } 2029 2030 // Associate log entries with the tracing span where they were emitted. 2031 annotateWithSpan := func(ctx context.Context, e *sdlogger.LogEntry) { 2032 if spanID := oteltrace.SpanContextFromContext(ctx).SpanID(); spanID.IsValid() { 2033 e.SpanID = spanID.String() 2034 } 2035 } 2036 2037 // Finally install all this into the request context. 2038 ctx = logging.SetFactory(ctx, sdlogger.Factory(logSink, sdlogger.LogEntry{ 2039 TraceID: traceID, 2040 Operation: &sdlogger.Operation{ID: s.genUniqueID(32)}, 2041 }, annotateWithSpan)) 2042 } 2043 2044 // Do final context touches. 2045 ctx = caching.WithRequestCache(ctx) 2046 2047 // This will be called once the request is fully processed. 2048 return ctx, func(res *requestResult) { 2049 now := clock.Now(ctx) 2050 latency := now.Sub(started) 2051 2052 if req.healthCheck { 2053 // Do not log fast health check calls AT ALL, they just spam logs. 2054 if latency < healthTimeLogThreshold { 2055 return 2056 } 2057 // Emit a warning if the health check is slow, this likely indicates 2058 // high CPU load. 2059 logging.Warningf(ctx, "Health check is slow: %s > %s", latency, healthTimeLogThreshold) 2060 } 2061 2062 // If there's no need to emit the overall request log entry, we are done. 2063 // See initLogging(...) for where this is decided. 2064 if s.logRequestCB == nil { 2065 return 2066 } 2067 2068 // When running behind Envoy, log its request IDs to simplify debugging. 2069 extraFields := res.extraFields 2070 if xrid := req.metadata.Header("X-Request-Id"); xrid != "" { 2071 if extraFields == nil { 2072 extraFields = make(logging.Fields, 1) 2073 } 2074 extraFields["requestId"] = xrid 2075 } 2076 2077 // If we were tracking the overall severity, collect the outcome. 2078 severity := sdlogger.InfoSeverity 2079 if severityTracker != nil { 2080 severity = severityTracker.MaxSeverity() 2081 } 2082 2083 // Log the final outcome of the processed request. 2084 s.logRequestCB(ctx, &sdlogger.LogEntry{ 2085 Severity: severity, 2086 Timestamp: sdlogger.ToTimestamp(now), 2087 TraceID: traceID, 2088 TraceSampled: span.IsRecording(), 2089 SpanID: spanID, // the top-level span ID if present 2090 Fields: extraFields, 2091 RequestInfo: &sdlogger.RequestInfo{ 2092 Method: req.method, 2093 URL: req.url, 2094 Status: res.statusCode, 2095 RequestSize: fmt.Sprintf("%d", res.requestSize), 2096 ResponseSize: fmt.Sprintf("%d", res.responseSize), 2097 UserAgent: req.metadata.Header("User-Agent"), 2098 RemoteIP: endUserIP(req.metadata), 2099 Latency: fmt.Sprintf("%fs", latency.Seconds()), 2100 }, 2101 }) 2102 } 2103 } 2104 2105 // initLogging initializes the server logging. 2106 // 2107 // Called very early during server startup process. Many server fields may not 2108 // be initialized yet, be careful. 2109 // 2110 // When running in production uses the ugly looking JSON format that is hard to 2111 // read by humans but which is parsed by google-fluentd and GCP serverless 2112 // hosting environment. 2113 // 2114 // To support per-request log grouping in Cloud Logging UI there must be 2115 // two different log streams: 2116 // - A stream with top-level HTTP request entries (conceptually like Apache's 2117 // access.log, i.e. with one log entry per request). 2118 // - A stream with logs produced within requests (correlated with HTTP request 2119 // logs via the trace ID field). 
2120 // 2121 // Both streams are expected to have a particular format and use particular 2122 // fields for Cloud Logging UI to display them correctly. This technique is 2123 // primarily intended for GAE Flex, but it works in many Google environments: 2124 // https://cloud.google.com/appengine/articles/logging#linking_app_logs_and_requests 2125 // 2126 // On GKE we use 'stderr' stream for top-level HTTP request entries and 'stdout' 2127 // stream for logs produced by requests. 2128 // 2129 // On GAE and Cloud Run, the stream with top-level HTTP request entries is 2130 // produced by the GCP runtime itself. So we emit only logs produced within 2131 // requests (also to 'stdout', just like on GKE). 2132 // 2133 // In all environments 'stderr' stream is used to log all global activities that 2134 // happens outside of any request handler (stuff like initialization, shutdown, 2135 // background goroutines, etc). 2136 // 2137 // In non-production mode we use the human-friendly format and a single 'stderr' 2138 // log stream for everything. 2139 func (s *Server) initLogging() { 2140 if !s.Options.Prod { 2141 s.Context = gologger.StdConfig.Use(s.Context) 2142 s.Context = logging.SetLevel(s.Context, logging.Debug) 2143 s.logRequestCB = func(ctx context.Context, entry *sdlogger.LogEntry) { 2144 logging.Infof(ctx, "%d %s %q (%s)", 2145 entry.RequestInfo.Status, 2146 entry.RequestInfo.Method, 2147 entry.RequestInfo.URL, 2148 entry.RequestInfo.Latency, 2149 ) 2150 } 2151 return 2152 } 2153 2154 if s.Options.testStdout != nil { 2155 s.stdout = s.Options.testStdout 2156 } else { 2157 s.stdout = &sdlogger.Sink{Out: os.Stdout} 2158 } 2159 2160 if s.Options.testStderr != nil { 2161 s.stderr = s.Options.testStderr 2162 } else { 2163 s.stderr = &sdlogger.Sink{Out: os.Stderr} 2164 } 2165 2166 s.Context = logging.SetFactory(s.Context, 2167 sdlogger.Factory(s.stderr, sdlogger.LogEntry{ 2168 Operation: &sdlogger.Operation{ 2169 ID: s.genUniqueID(32), // correlate all global server logs together 2170 }, 2171 }, nil), 2172 ) 2173 s.Context = logging.SetLevel(s.Context, logging.Debug) 2174 2175 // Skip writing the root request log entry on Serverless GCP since the load 2176 // balancer there writes the entry itself. 2177 switch s.Options.Serverless { 2178 case module.GAE: 2179 // Skip. GAE writes it to "appengine.googleapis.com/request_log" itself. 2180 case module.CloudRun: 2181 // Skip. Cloud Run writes it to "run.googleapis.com/requests" itself. 2182 default: 2183 // Emit to stderr where Cloud Logging collectors pick it up. 2184 s.logRequestCB = func(_ context.Context, entry *sdlogger.LogEntry) { s.stderr.Write(entry) } 2185 } 2186 } 2187 2188 // initAuthStart initializes the core auth system by preparing the context 2189 // and verifying auth tokens can actually be minted (i.e. supplied credentials 2190 // are valid). 2191 // 2192 // It is called before the tsmon monitoring is initialized: tsmon needs auth. 2193 // The rest of the auth initialization (the part that needs tsmon) happens in 2194 // initAuthFinish after tsmon is initialized. 2195 func (s *Server) initAuthStart() error { 2196 // Make a transport that appends information about the server as User-Agent. 
2197 ua := s.Options.userAgent() 2198 rootTransport := clientauth.NewModifyingTransport(http.DefaultTransport, func(req *http.Request) error { 2199 newUA := ua 2200 if cur := req.UserAgent(); cur != "" { 2201 newUA += " " + cur 2202 } 2203 req.Header.Set("User-Agent", newUA) 2204 return nil 2205 }) 2206 2207 // Initialize the token generator based on s.Options.ClientAuth. 2208 opts := s.Options.ClientAuth 2209 2210 // Use `rootTransport` for calls made by the token generator (e.g. when 2211 // refreshing tokens). 2212 opts.Transport = rootTransport 2213 2214 // We aren't going to use the authenticator's transport (and thus its 2215 // monitoring), only the token source. DisableMonitoring == true removes some 2216 // log spam. 2217 opts.DisableMonitoring = true 2218 2219 // GCP is very aggressive in caching the token internally (in the metadata 2220 // server) and refreshing it only when it is very close to its expiration. We 2221 // need to match this behavior in our in-process cache, otherwise 2222 // GetAccessToken complains that the token refresh procedure doesn't actually 2223 // change the token (because the metadata server returned the cached one). 2224 opts.MinTokenLifetime = 20 * time.Second 2225 2226 // The default value for ClientAuth.SecretsDir is usually hardcoded to point 2227 // to where the token cache is located on developer machines (~/.config/...). 2228 // This location often doesn't exist when running from inside a container. 2229 // The token cache is also not really needed for production services that use 2230 // service accounts (they don't need cached refresh tokens). So in production 2231 // mode totally ignore default ClientAuth.SecretsDir and use whatever was 2232 // passed as -token-cache-dir. If it is empty (default), then no on-disk token 2233 // cache is used at all. 2234 // 2235 // If -token-cache-dir was explicitly set, always use it (even in dev mode). 2236 // This is useful when running containers locally: developer's credentials 2237 // on the host machine can be mounted inside the container. 2238 if s.Options.Prod || s.Options.TokenCacheDir != "" { 2239 opts.SecretsDir = s.Options.TokenCacheDir 2240 } 2241 2242 // Annotate the context used for logging from the token generator. 2243 ctx := logging.SetField(s.Context, "activity", "luci.auth") 2244 tokens := clientauth.NewTokenGenerator(ctx, opts) 2245 2246 // Prepare partially initialized structs for the auth.Config. They will be 2247 // fully initialized in initAuthFinish once we have a sufficiently working 2248 // auth context that can call Cloud IAM. 2249 s.signer = &signerImpl{srv: s} 2250 s.actorTokens = &actorTokensImpl{} 2251 2252 // Either use the explicitly passed AuthDB provider or the one initialized 2253 // by initAuthDB. 2254 provider := s.Options.AuthDBProvider 2255 if provider == nil { 2256 provider = func(context.Context) (authdb.DB, error) { 2257 db, _ := s.authDB.Load().(authdb.DB) // refreshed asynchronously in refreshAuthDB 2258 return db, nil 2259 } 2260 } 2261 2262 // Initialize the state in the context. 
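	// Once this state is installed, request handlers and background activities
	// can use the auth package against it. An illustrative sketch (not from the
	// original source) of what typical handler code does with it:
	//
	//	// Who is calling us, as authenticated by the configured methods?
	//	caller := auth.CurrentIdentity(ctx)
	//	// An authenticated transport for outbound calls made as the service itself.
	//	tr, err := auth.GetRPCTransport(ctx, auth.AsSelf, auth.WithScopes(auth.CloudOAuthScopes...))
	//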
2263 s.Context = auth.Initialize(s.Context, &auth.Config{ 2264 DBProvider: provider, 2265 Signer: s.signer, 2266 AccessTokenProvider: func(ctx context.Context, scopes []string) (*oauth2.Token, error) { 2267 return tokens.GenerateOAuthToken(ctx, scopes, 0) 2268 }, 2269 IDTokenProvider: func(ctx context.Context, audience string) (*oauth2.Token, error) { 2270 return tokens.GenerateIDToken(ctx, audience, 0) 2271 }, 2272 ActorTokensProvider: s.actorTokens, 2273 AnonymousTransport: func(context.Context) http.RoundTripper { return rootTransport }, 2274 FrontendClientID: func(context.Context) (string, error) { return s.Options.FrontendClientID, nil }, 2275 EndUserIP: endUserIP, 2276 IsDevMode: !s.Options.Prod, 2277 }) 2278 2279 // Note: we initialize a token source for one arbitrary set of scopes here. In 2280 // many practical cases this is sufficient to verify that credentials are 2281 // valid. For example, when we use service account JSON key, if we can 2282 // generate a token with *some* scope (meaning Cloud accepted our signature), 2283 // we can generate tokens with *any* scope, since there's no restrictions on 2284 // what scopes are accessible to a service account, as long as the private key 2285 // is valid (which we just verified by generating some token). 2286 _, err := tokens.GenerateOAuthToken(ctx, auth.CloudOAuthScopes, 0) 2287 if err != nil { 2288 // ErrLoginRequired may happen only when running the server locally using 2289 // developer's credentials. Let them know how the problem can be fixed. 2290 if !s.Options.Prod && err == clientauth.ErrLoginRequired { 2291 scopes := fmt.Sprintf("-scopes %q", strings.Join(auth.CloudOAuthScopes, " ")) 2292 if opts.ActAsServiceAccount != "" && opts.ActViaLUCIRealm == "" { 2293 scopes = "-scopes-iam" 2294 } 2295 logging.Errorf(s.Context, "Looks like you run the server locally and it doesn't have credentials for some OAuth scopes") 2296 logging.Errorf(s.Context, "Run the following command to set them up: ") 2297 logging.Errorf(s.Context, " $ luci-auth login %s", scopes) 2298 } 2299 return errors.Annotate(err, "failed to initialize the token source").Err() 2300 } 2301 2302 // Report who we are running as. Useful when debugging access issues. 2303 switch email, err := tokens.GetEmail(); { 2304 case err == nil: 2305 logging.Infof(s.Context, "Running as %s", email) 2306 s.runningAs = email 2307 case err == clientauth.ErrNoEmail: 2308 logging.Warningf(s.Context, "Running as <unknown>, cautiously proceeding...") 2309 case err != nil: 2310 return errors.Annotate(err, "failed to check the service account email").Err() 2311 } 2312 2313 return nil 2314 } 2315 2316 // initAuthFinish finishes auth system initialization. 2317 // 2318 // It is called after tsmon is initialized. 2319 func (s *Server) initAuthFinish() error { 2320 // We should be able to make basic authenticated requests now and can 2321 // construct a token source used by server's own guts to call Cloud APIs, 2322 // such us Cloud Trace and Cloud Error Reporting (and others). 2323 var err error 2324 s.cloudTS, err = auth.GetTokenSource(s.Context, auth.AsSelf, auth.WithScopes(auth.CloudOAuthScopes...)) 2325 if err != nil { 2326 return errors.Annotate(err, "failed to initialize the cloud token source").Err() 2327 } 2328 2329 // Finish constructing `signer` and `actorTokens` that were waiting for 2330 // an IAM client. 
2331 iamClient, err := credentials.NewIamCredentialsClient( 2332 s.Context, 2333 option.WithTokenSource(s.cloudTS), 2334 option.WithGRPCDialOption(grpc.WithStatsHandler(&grpcmon.ClientRPCStatsMonitor{})), 2335 option.WithGRPCDialOption(grpc.WithUnaryInterceptor(otelgrpc.UnaryClientInterceptor())), 2336 option.WithGRPCDialOption(grpc.WithStreamInterceptor(otelgrpc.StreamClientInterceptor())), 2337 ) 2338 if err != nil { 2339 return errors.Annotate(err, "failed to construct IAM client").Err() 2340 } 2341 s.RegisterCleanup(func(ctx context.Context) { iamClient.Close() }) 2342 s.signer.iamClient = iamClient 2343 s.actorTokens.iamClient = iamClient 2344 2345 // If not using a custom AuthDB provider, initialize the standard one that 2346 // fetches AuthDB (a database with groups and auth config) from a central 2347 // place. This also starts a goroutine to periodically refresh it. 2348 if s.Options.AuthDBProvider == nil { 2349 if err := s.initAuthDB(); err != nil { 2350 return errors.Annotate(err, "failed to initialize AuthDB").Err() 2351 } 2352 } 2353 2354 // Default RPC authentication methods. See also SetRPCAuthMethods. 2355 s.rpcAuthMethods = make([]auth.Method, 0, 2) 2356 if s.Options.OpenIDRPCAuthEnable { 2357 // The preferred authentication method. 2358 s.rpcAuthMethods = append(s.rpcAuthMethods, &openid.GoogleIDTokenAuthMethod{ 2359 AudienceCheck: openid.AudienceMatchesHost, 2360 Audience: s.Options.OpenIDRPCAuthAudience, 2361 SkipNonJWT: true, // pass OAuth2 access tokens through 2362 }) 2363 } 2364 // Backward compatibility for the RPC Explorer and old clients. 2365 s.rpcAuthMethods = append(s.rpcAuthMethods, &auth.GoogleOAuth2Method{ 2366 Scopes: []string{clientauth.OAuthScopeEmail}, 2367 }) 2368 2369 return nil 2370 } 2371 2372 // initAuthDB interprets -auth-db-* flags and sets up fetching of AuthDB. 2373 func (s *Server) initAuthDB() error { 2374 // Check flags are compatible. 2375 switch { 2376 case s.Options.AuthDBPath != "" && s.Options.AuthServiceHost != "": 2377 return errors.Reason("-auth-db-path and -auth-service-host can't be used together").Err() 2378 case s.Options.AuthServiceHost == "" && (s.Options.AuthDBDump != "" || s.Options.AuthDBSigner != ""): 2379 return errors.Reason("-auth-db-dump and -auth-db-signer can be used only with -auth-service-host").Err() 2380 case s.Options.AuthDBDump != "" && !strings.HasPrefix(s.Options.AuthDBDump, "gs://"): 2381 return errors.Reason("-auth-db-dump value should start with gs://, got %q", s.Options.AuthDBDump).Err() 2382 case strings.Contains(s.Options.AuthServiceHost, "/"): 2383 return errors.Reason("-auth-service-host should be a plain hostname, got %q", s.Options.AuthServiceHost).Err() 2384 } 2385 2386 // Fill in defaults. 2387 if s.Options.AuthServiceHost != "" { 2388 if s.Options.AuthDBDump == "" { 2389 s.Options.AuthDBDump = fmt.Sprintf("gs://%s/auth-db", s.Options.AuthServiceHost) 2390 } 2391 if s.Options.AuthDBSigner == "" { 2392 if !strings.HasSuffix(s.Options.AuthServiceHost, ".appspot.com") { 2393 return errors.Reason("-auth-db-signer is required if -auth-service-host is not *.appspot.com").Err() 2394 } 2395 s.Options.AuthDBSigner = fmt.Sprintf("%s@appspot.gserviceaccount.com", 2396 strings.TrimSuffix(s.Options.AuthServiceHost, ".appspot.com")) 2397 } 2398 } 2399 2400 // Fetch the initial copy of AuthDB. Note that this happens before we start 2401 // the serving loop, to make sure incoming requests have some AuthDB to use. 
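	// Handlers consult this database indirectly through the auth package. A
	// small illustrative sketch (not from the original source; the group name
	// is made up), e.g. inside a request handler:
	//
	//	switch yes, err := auth.IsMember(ctx, "administrators"); {
	//	case err != nil:
	//		// Treat the lookup error as an internal error.
	//	case !yes:
	//		// Reject the caller.
	//	}
	//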
2402 if err := s.refreshAuthDB(s.Context); err != nil { 2403 return errors.Annotate(err, "failed to load the initial AuthDB version").Err() 2404 } 2405 2406 // Periodically refresh it in the background. 2407 s.RunInBackground("luci.authdb", func(c context.Context) { 2408 for { 2409 jitter := time.Duration(rand.Int63n(int64(10 * time.Second))) 2410 if r := <-clock.After(c, 30*time.Second+jitter); r.Err != nil { 2411 return // the context is canceled 2412 } 2413 if err := s.refreshAuthDB(c); err != nil { 2414 // Don't log the error if the server is shutting down. 2415 if !errors.Is(err, context.Canceled) { 2416 logging.WithError(err).Errorf(c, "Failed to reload AuthDB, using the cached one") 2417 } 2418 } 2419 } 2420 }) 2421 return nil 2422 } 2423 2424 // refreshAuthDB reloads AuthDB from the source and stores it in memory. 2425 func (s *Server) refreshAuthDB(c context.Context) error { 2426 cur, _ := s.authDB.Load().(authdb.DB) 2427 db, err := s.fetchAuthDB(c, cur) 2428 if err != nil { 2429 return err 2430 } 2431 s.authDB.Store(db) 2432 return nil 2433 } 2434 2435 // fetchAuthDB fetches the most recent copy of AuthDB from the external source. 2436 // 2437 // Used only if Options.AuthDBProvider is nil. 2438 // 2439 // 'cur' is the currently used AuthDB or nil if fetching it for the first time. 2440 // Returns 'cur' as is if it's already fresh. 2441 func (s *Server) fetchAuthDB(c context.Context, cur authdb.DB) (authdb.DB, error) { 2442 // Loading from a local file (useful in integration tests). 2443 if s.Options.AuthDBPath != "" { 2444 r, err := os.Open(s.Options.AuthDBPath) 2445 if err != nil { 2446 return nil, errors.Annotate(err, "failed to open AuthDB file").Err() 2447 } 2448 defer r.Close() 2449 db, err := authdb.SnapshotDBFromTextProto(r) 2450 if err != nil { 2451 return nil, errors.Annotate(err, "failed to load AuthDB file").Err() 2452 } 2453 return db, nil 2454 } 2455 2456 // Loading from a GCS dump (s.Options.AuthDB* are validated here already). 2457 if s.Options.AuthDBDump != "" { 2458 c, cancel := clock.WithTimeout(c, 5*time.Minute) 2459 defer cancel() 2460 fetcher := dump.Fetcher{ 2461 StorageDumpPath: s.Options.AuthDBDump[len("gs://"):], 2462 AuthServiceURL: "https://" + s.Options.AuthServiceHost, 2463 AuthServiceAccount: s.Options.AuthDBSigner, 2464 OAuthScopes: auth.CloudOAuthScopes, 2465 } 2466 curSnap, _ := cur.(*authdb.SnapshotDB) 2467 snap, err := fetcher.FetchAuthDB(c, curSnap) 2468 if err != nil { 2469 return nil, errors.Annotate(err, "fetching from GCS dump failed").Err() 2470 } 2471 return snap, nil 2472 } 2473 2474 // In dev mode default to "allow everything". 2475 if !s.Options.Prod { 2476 return authdb.DevServerDB{}, nil 2477 } 2478 2479 // In prod mode default to "fail on any non-trivial check". Some services may 2480 // not need to use AuthDB at all and configuring it for them is a hassle. If 2481 // they try to use it for something vital, they'll see the error. 2482 return authdb.UnconfiguredDB{ 2483 Error: errors.Reason("a source of AuthDB is not configured, see -auth-* server flags").Err(), 2484 }, nil 2485 } 2486 2487 // initTSMon initializes time series monitoring state. 2488 func (s *Server) initTSMon() error { 2489 // We keep tsmon always enabled (flushing to /dev/null if no -ts-mon-* flags 2490 // are set) so that tsmon's in-process store is populated, and metrics there 2491 // can be examined via /admin/tsmon. This is useful when developing/debugging 2492 // tsmon metrics. 
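	// For reference, a metric defined elsewhere in application code (an
	// illustrative sketch, not part of this file; assumes the
	// go.chromium.org/luci/common/tsmon/metric package and a made-up metric
	// name) ends up in that same in-process store:
	//
	//	// In some application package:
	//	var loginCount = metric.NewCounter("myapp/logins", "Number of logins.", nil)
	//
	//	// In a request handler:
	//	loginCount.Add(ctx, 1)
	//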
2493 var customMonitor monitor.Monitor 2494 if s.Options.TsMonAccount == "" || s.Options.TsMonServiceName == "" || s.Options.TsMonJobName == "" { 2495 logging.Infof(s.Context, "tsmon is in the debug mode: metrics are collected, but flushed to /dev/null (pass -ts-mon-* flags to start uploading metrics)") 2496 customMonitor = monitor.NewNilMonitor() 2497 } 2498 2499 interval := int(s.Options.TsMonFlushInterval.Seconds()) 2500 if interval == 0 { 2501 interval = int(defaultTsMonFlushInterval.Seconds()) 2502 } 2503 timeout := int(s.Options.TsMonFlushTimeout.Seconds()) 2504 if timeout == 0 { 2505 timeout = int(defaultTsMonFlushTimeout.Seconds()) 2506 } 2507 if timeout >= interval { 2508 return errors.Reason("-ts-mon-flush-timeout (%ds) must be shorter than -ts-mon-flush-interval (%ds)", timeout, interval).Err() 2509 } 2510 s.tsmon = &tsmon.State{ 2511 CustomMonitor: customMonitor, 2512 Settings: &tsmon.Settings{ 2513 Enabled: true, 2514 ProdXAccount: s.Options.TsMonAccount, 2515 FlushIntervalSec: interval, 2516 FlushTimeoutSec: timeout, 2517 ReportRuntimeStats: true, 2518 }, 2519 Target: func(c context.Context) target.Task { 2520 // TODO(vadimsh): We pretend to be a GAE app for now to be able to 2521 // reuse existing dashboards. Each pod pretends to be a separate GAE 2522 // version. That way we can stop worrying about TaskNumAllocator and just 2523 // use 0 (since there'll be only one task per "version"). This looks 2524 // chaotic for deployments with large number of pods. 2525 return target.Task{ 2526 DataCenter: "appengine", 2527 ServiceName: s.Options.TsMonServiceName, 2528 JobName: s.Options.TsMonJobName, 2529 HostName: s.Options.Hostname, 2530 } 2531 }, 2532 } 2533 if customMonitor != nil { 2534 tsmon.PortalPage.SetReadOnlySettings(s.tsmon.Settings, 2535 "Running in the debug mode. Pass all -ts-mon-* command line flags to start uploading metrics.") 2536 } else { 2537 tsmon.PortalPage.SetReadOnlySettings(s.tsmon.Settings, 2538 "Settings are controlled through -ts-mon-* command line flags.") 2539 } 2540 2541 // Enable this configuration in s.Context so all transports created during 2542 // the server startup have tsmon instrumentation. 2543 s.tsmon.Activate(s.Context) 2544 2545 // Report our image version as a metric, useful to monitor rollouts. 2546 tsmoncommon.RegisterCallbackIn(s.Context, func(ctx context.Context) { 2547 versionMetric.Set(ctx, s.Options.ImageVersion()) 2548 }) 2549 2550 // Periodically flush metrics. 2551 s.RunInBackground("luci.tsmon", s.tsmon.FlushPeriodically) 2552 return nil 2553 } 2554 2555 // otelResource returns an OTEL resource identifying this server instance. 2556 // 2557 // It is just a bunch of labels essentially reported to monitoring backends 2558 // together with traces. 2559 func (s *Server) otelResource(ctx context.Context) (*resource.Resource, error) { 2560 return resource.New( 2561 ctx, 2562 resource.WithTelemetrySDK(), 2563 resource.WithDetectors(gcp.NewDetector()), 2564 resource.WithAttributes( 2565 semconv.ServiceName(fmt.Sprintf("%s/%s", s.Options.TsMonServiceName, s.Options.TsMonJobName)), 2566 semconv.ServiceInstanceID(s.Options.Hostname), 2567 semconv.ContainerImageName(s.Options.ImageName()), 2568 semconv.ContainerImageTag(s.Options.ImageVersion()), 2569 ), 2570 ) 2571 } 2572 2573 // otelErrorHandler returns a top-level OTEL error catcher. 2574 // 2575 // It just logs errors (with some dedupping to avoid spam). 
2576 func (s *Server) otelErrorHandler(ctx context.Context) otel.ErrorHandlerFunc { 2577 // State for suppressing repeated ResourceExhausted error messages, otherwise 2578 // logs may get flooded with them. They are usually not super important, but 2579 // ignoring them completely is also not great. 2580 errorDedup := struct { 2581 lock sync.Mutex 2582 report time.Time 2583 count int 2584 }{} 2585 return func(err error) { 2586 if !strings.Contains(err.Error(), "ResourceExhausted") { 2587 logging.Warningf(ctx, "Error in Cloud Trace exporter: %s", err) 2588 return 2589 } 2590 2591 errorDedup.lock.Lock() 2592 defer errorDedup.lock.Unlock() 2593 2594 errorDedup.count++ 2595 2596 if errorDedup.report.IsZero() || time.Since(errorDedup.report) > 5*time.Minute { 2597 if errorDedup.report.IsZero() { 2598 logging.Warningf(ctx, "Error in Cloud Trace exporter: %s", err) 2599 } else { 2600 logging.Warningf(ctx, "Error in Cloud Trace exporter: %s (%d occurrences in %s since the last report)", err, errorDedup.count, time.Since(errorDedup.report)) 2601 } 2602 errorDedup.report = time.Now() 2603 errorDedup.count = 0 2604 } 2605 } 2606 } 2607 2608 // otelSampler prepares a sampler based on CLI flags and environment. 2609 func (s *Server) otelSampler(ctx context.Context) (trace.Sampler, error) { 2610 // On GCP Serverless let the GCP load balancer make decisions about 2611 // sampling. If it decides to sample a trace, it will let us know through 2612 // options of the parent span in X-Cloud-Trace-Context. We will collect only 2613 // traces from requests that GCP wants to sample itself. Traces without 2614 // a parent context are never sampled. This also means traces from random 2615 // background goroutines aren't sampled either (i.e. we don't need GateSampler 2616 // as used below). 2617 if s.Options.Serverless.IsGCP() { 2618 logging.Infof(ctx, "Setting up Cloud Trace exports to %q using GCP Serverless sampling strategy", s.Options.CloudProject) 2619 return trace.ParentBased(trace.NeverSample()), nil 2620 } 2621 2622 // Parse -trace-sampling spec to get the base sampler. 2623 sampling := s.Options.TraceSampling 2624 if sampling == "" { 2625 sampling = "0.1qps" 2626 } 2627 logging.Infof(ctx, "Setting up Cloud Trace exports to %q (%s)", s.Options.CloudProject, sampling) 2628 sampler, err := internal.BaseSampler(sampling) 2629 if err != nil { 2630 return nil, errors.Annotate(err, "bad -trace-sampling").Err() 2631 } 2632 2633 // Sample only if the context is an incoming request context. This is needed 2634 // to avoid various background goroutines spamming with top-level spans. This 2635 // usually happens if a library is oblivious of tracing, but uses an 2636 // instrumented HTTP or gRPC client it got from outside, and the passes 2637 // context.Background() (or some unrelated context) to it. The end result is 2638 // lots and lots of non-informative disconnected top-level spans. 2639 // 2640 // Also skip sampling health check requests, they end up being spammy as well. 2641 sampler = internal.GateSampler(sampler, func(ctx context.Context) bool { 2642 req, _ := ctx.Value(&incomingRequestKey).(*incomingRequest) 2643 return req != nil && !req.healthCheck 2644 }) 2645 2646 // Inherit the sampling decision from a parent span. Note this totally ignores 2647 // `sampler` if there's a parent span (local or remote). This is usually what 2648 // we want to get complete trace trees with well-defined root and no gaps. 
2649 return trace.ParentBased(sampler), nil 2650 } 2651 2652 // otelSpanExporter initializes a trace spans exporter. 2653 func (s *Server) otelSpanExporter(ctx context.Context) (trace.SpanExporter, error) { 2654 return texporter.New( 2655 texporter.WithContext(ctx), 2656 texporter.WithProjectID(s.Options.CloudProject), 2657 texporter.WithTraceClientOptions([]option.ClientOption{ 2658 option.WithTokenSource(s.cloudTS), 2659 }), 2660 ) 2661 } 2662 2663 // initTracing initializes Cloud Trace exporter via OpenTelemetry. 2664 func (s *Server) initTracing() error { 2665 // Initialize a transformer that knows how to extract span info from the 2666 // context and serialize it as a bunch of headers and vice-versa. It is 2667 // invoked by otelhttp and otelgrpc middleware and when creating instrumented 2668 // HTTP clients. Recognize X-Cloud-Trace-Context for compatibility with traces 2669 // created by GCLB. 2670 // 2671 // It is used to parse incoming headers even when tracing is disabled, so 2672 // initialize it unconditionally, just don't install as a global propagator. 2673 s.propagator = propagation.NewCompositeTextMapPropagator( 2674 gcppropagator.CloudTraceOneWayPropagator{}, 2675 propagation.TraceContext{}, 2676 ) 2677 2678 // If tracing is disabled, just don't initialize OpenTelemetry library. All 2679 // tracing machinery would still nominally "work", just do nothing in a 2680 // relatively efficient way. 2681 if !s.Options.shouldEnableTracing() { 2682 return nil 2683 } 2684 2685 // Annotate logs from OpenTelemetry so they can be filtered in Cloud Logging. 2686 ctx := logging.SetField(s.Context, "activity", "luci.trace") 2687 2688 // TODO(vadimsh): Install OpenTelemetry global logger using otel.SetLogger(). 2689 // This will require implementing a hefty logr.LogSink interface on top of 2690 // the LUCI logger. Not doing that results in garbled stderr when OTEL wants 2691 // to log something (unclear when it happens exactly, if at all). 2692 2693 res, err := s.otelResource(ctx) 2694 if err != nil { 2695 return errors.Annotate(err, "failed to init OpenTelemetry resource").Err() 2696 } 2697 sampler, err := s.otelSampler(ctx) 2698 if err != nil { 2699 return errors.Annotate(err, "failed to init OpenTelemetry sampler").Err() 2700 } 2701 exp, err := s.otelSpanExporter(ctx) 2702 if err != nil { 2703 return errors.Annotate(err, "failed to init OpenTelemetry span exporter").Err() 2704 } 2705 2706 tp := trace.NewTracerProvider( 2707 trace.WithResource(res), 2708 trace.WithSampler(sampler), 2709 trace.WithBatcher(exp, 2710 trace.WithMaxQueueSize(8192), // how much to buffer before dropping 2711 trace.WithBatchTimeout(30*time.Second), // how long to buffer before flushing 2712 trace.WithExportTimeout(time.Minute), // deadline for the export RPC call 2713 trace.WithMaxExportBatchSize(2048), // size of a single RPC 2714 ), 2715 ) 2716 2717 s.RegisterCleanup(func(ctx context.Context) { 2718 ctx = logging.SetField(ctx, "activity", "luci.trace") 2719 if err := tp.ForceFlush(ctx); err != nil { 2720 logging.Errorf(ctx, "Final trace flush failed: %s", err) 2721 } 2722 if err := tp.Shutdown(ctx); err != nil { 2723 logging.Errorf(ctx, "Error shutting down TracerProvider: %s", err) 2724 } 2725 }) 2726 2727 // Register all globals to make them be used by default. 2728 otel.SetErrorHandler(s.otelErrorHandler(ctx)) 2729 otel.SetTracerProvider(tp) 2730 otel.SetTextMapPropagator(s.propagator) 2731 2732 return nil 2733 } 2734 2735 // initProfiling initialized Cloud Profiler. 
func (s *Server) initProfiling() error {
	// Skip if not enough configuration is given.
	switch {
	case !s.Options.Prod:
		return nil // silently skip, no need for log spam in dev mode
	case s.Options.CloudProject == "":
		logging.Infof(s.Context, "Cloud Profiler is disabled: -cloud-project is not set")
		return nil
	case s.Options.ProfilingServiceID == "" && s.Options.TsMonJobName == "":
		logging.Infof(s.Context, "Cloud Profiler is disabled: neither -profiling-service-id nor -ts-mon-job-name are set")
		return nil
	}

	// Enable the profiler based on the given probability. Low probabilities are
	// useful to avoid hitting Cloud Profiler quotas when running services with
	// many replicas. Profiles are aggregated anyway; for a large enough number
	// of servers it doesn't matter if only a random subset of them is sampled.
	sample := rand.Float64()
	if sample < s.Options.ProfilingProbability {
		if s.Options.ProfilingProbability >= 1.0 {
			logging.Infof(s.Context, "Cloud Profiler is enabled")
		} else {
			logging.Infof(s.Context,
				"Cloud Profiler is enabled: rand %.2f < profiling-probability %.2f",
				sample, s.Options.ProfilingProbability)
		}
	} else {
		if s.Options.ProfilingProbability <= 0 {
			logging.Infof(s.Context, "Cloud Profiler is disabled")
		} else {
			logging.Infof(s.Context,
				"Cloud Profiler is disabled: rand %.2f >= profiling-probability %.2f",
				sample, s.Options.ProfilingProbability)
		}
		return nil
	}

	cfg := profiler.Config{
		ProjectID:      s.Options.CloudProject,
		Service:        s.getServiceID(),
		ServiceVersion: s.Options.ImageVersion(),
		Instance:       s.Options.Hostname,
		// Note: these two options may potentially have impact on performance, but
		// it is likely small enough not to bother.
		MutexProfiling: true,
		AllocForceGC:   true,
	}

	// Launch the agent that runs in the background and periodically collects and
	// uploads profiles. It fails to launch if Service or ServiceVersion do not
	// pass regexp validation. Make it non-fatal, but still log.
	if err := profiler.Start(cfg, option.WithTokenSource(s.cloudTS)); err != nil {
		logging.Errorf(s.Context, "Cloud Profiler is disabled: failed to start - %s", err)
		return nil
	}

	logging.Infof(s.Context, "Set up Cloud Profiler (service %q, version %q)", cfg.Service, cfg.ServiceVersion)
	return nil
}

// getServiceID gets the service ID from either ProfilingServiceID or TsMonJobName.
func (s *Server) getServiceID() string {
	// Prefer ProfilingServiceID if given, fall back to TsMonJobName. Replace
	// the forbidden '/' symbol.
	serviceID := s.Options.ProfilingServiceID
	if serviceID == "" {
		serviceID = s.Options.TsMonJobName
	}
	serviceID = strings.ReplaceAll(serviceID, "/", "-")
	return serviceID
}

// initMainPort initializes the server on the options.HTTPAddr port.
func (s *Server) initMainPort() error {
	var err error
	s.mainPort, err = s.AddPort(PortOptions{
		Name:       "main",
		ListenAddr: s.Options.HTTPAddr,
	})
	if err != nil {
		return err
	}
	s.Routes = s.mainPort.Routes

	// Install auth info handlers (under "/auth/api/v1/server/").
	auth.InstallHandlers(s.Routes, nil)

	// Prepare the pRPC server.
// initMainPort initializes the server on options.HTTPAddr port.
func (s *Server) initMainPort() error {
	var err error
	s.mainPort, err = s.AddPort(PortOptions{
		Name:       "main",
		ListenAddr: s.Options.HTTPAddr,
	})
	if err != nil {
		return err
	}
	s.Routes = s.mainPort.Routes

	// Install auth info handlers (under "/auth/api/v1/server/").
	auth.InstallHandlers(s.Routes, nil)

	// Prepare the pRPC server. Its configuration will be finished in Serve after
	// all interceptors and authentication methods are registered.
	s.prpc = &prpc.Server{
		// Allow compression when not running on GAE. On GAE compression for text
		// responses is done by GAE itself and doing it in our code would be
		// wasteful.
		EnableResponseCompression: s.Options.Serverless != module.GAE,
	}
	discovery.Enable(s.prpc)
	s.prpc.InstallHandlers(s.Routes, nil)

	return nil
}

// initGrpcPort initializes the listening gRPC port.
func (s *Server) initGrpcPort() error {
	if s.Options.GRPCAddr == "" || s.Options.GRPCAddr == "-" {
		return nil // the gRPC port is disabled
	}
	listener, err := s.createListener(s.Options.GRPCAddr)
	if err != nil {
		return errors.Annotate(err, `failed to bind the listening port for "grpc" at %q`, s.Options.GRPCAddr).Err()
	}
	s.grpcPort = &grpcPort{listener: listener}
	s.ports = append(s.ports, s.grpcPort)
	return nil
}

// initAdminPort initializes the server on options.AdminAddr port.
func (s *Server) initAdminPort() error {
	if s.Options.AdminAddr == "-" {
		return nil // the admin port is disabled
	}

	// Admin portal uses XSRF tokens that require a secret key. We generate this
	// key randomly during process startup (i.e. now). It means XSRF tokens in
	// admin HTML pages rendered by a server process are understood only by the
	// exact same process. This is OK for admin pages (they are not behind load
	// balancers and we don't care that a server restart invalidates all tokens).
	secret := make([]byte, 20)
	if _, err := cryptorand.Read(secret); err != nil {
		return err
	}
	store := secrets.NewDerivedStore(secrets.Secret{Active: secret})
	withAdminSecret := router.NewMiddlewareChain(func(c *router.Context, next router.Handler) {
		c.Request = c.Request.WithContext(secrets.Use(c.Request.Context(), store))
		next(c)
	})

	// Install endpoints accessible through the admin port only.
	adminPort, err := s.AddPort(PortOptions{
		Name:           "admin",
		ListenAddr:     s.Options.AdminAddr,
		DisableMetrics: true, // do not pollute HTTP metrics with admin-only routes
	})
	if err != nil {
		return err
	}
	routes := adminPort.Routes

	routes.GET("/", nil, func(c *router.Context) {
		http.Redirect(c.Writer, c.Request, "/admin/portal", http.StatusFound)
	})
	portal.InstallHandlers(routes, withAdminSecret, portal.AssumeTrustedPort)

	// Install pprof endpoints on the admin port. Note that they must not be
	// exposed via the main serving port, since they do no authentication and
	// may leak internal information. Also note that pprof handlers rely on
	// routing structure not supported by our router, so we do a bit of manual
	// routing.
	//
	// See also internal/pprof.go for more profiling goodies exposed through the
	// admin portal.
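	//
	// For illustration (assuming the admin port is reachable, e.g. via port
	// forwarding): named profiles such as /debug/pprof/heap or
	// /debug/pprof/goroutine fall through to pprof.Index, which serves them by
	// name, and standard tooling like
	//	go tool pprof http://<admin-addr>/debug/pprof/profile
	// can fetch a CPU profile through this route.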
	routes.GET("/debug/pprof/*path", nil, func(c *router.Context) {
		switch strings.TrimPrefix(c.Params.ByName("path"), "/") {
		case "cmdline":
			pprof.Cmdline(c.Writer, c.Request)
		case "profile":
			pprof.Profile(c.Writer, c.Request)
		case "symbol":
			pprof.Symbol(c.Writer, c.Request)
		case "trace":
			pprof.Trace(c.Writer, c.Request)
		default:
			pprof.Index(c.Writer, c.Request)
		}
	})
	return nil
}

// initErrorReporting initializes an Error Reporting client.
func (s *Server) initErrorReporting() error {
	if !s.Options.CloudErrorReporting || s.Options.CloudProject == "" {
		return nil
	}

	// Get token source to call Error Reporting API.
	var err error
	s.errRptClient, err = errorreporting.NewClient(s.Context, s.Options.CloudProject, errorreporting.Config{
		ServiceName:    s.getServiceID(),
		ServiceVersion: s.Options.ImageVersion(),
		OnError: func(err error) {
			// TODO(crbug/1204640): s/Warningf/Errorf once "Error Reporting" is itself
			// more reliable.
			logging.Warningf(s.Context, "Error Reporting could not log error: %s", err)
		},
	}, option.WithTokenSource(s.cloudTS))
	if err != nil {
		return err
	}

	s.RegisterCleanup(func(ctx context.Context) { s.errRptClient.Close() })
	return nil
}

// initWarmup schedules execution of global warmup callbacks.
//
// On GAE it also registers the /_ah/warmup route.
func (s *Server) initWarmup() error {
	// See https://cloud.google.com/appengine/docs/standard/go/configuring-warmup-requests.
	// All warmups should happen *before* the serving loop and /_ah/warmup should
	// just always return OK.
	if s.Options.Serverless == module.GAE {
		s.Routes.GET("/_ah/warmup", nil, func(*router.Context) {})
	}
	s.RegisterWarmup(func(ctx context.Context) { warmup.Warmup(ctx) })
	return nil
}

// signerImpl implements signing.Signer on top of *Server.
type signerImpl struct {
	srv       *Server
	iamClient *credentials.IamCredentialsClient
}

// SignBytes signs the blob with some active private key.
func (s *signerImpl) SignBytes(ctx context.Context, blob []byte) (keyName string, signature []byte, err error) {
	resp, err := s.iamClient.SignBlob(ctx, &credentialspb.SignBlobRequest{
		Name:    "projects/-/serviceAccounts/" + s.srv.runningAs,
		Payload: blob,
	})
	if err != nil {
		return "", nil, grpcutil.WrapIfTransient(err)
	}
	return resp.KeyId, resp.SignedBlob, nil
}

// Certificates returns a bundle with public certificates for all active keys.
func (s *signerImpl) Certificates(ctx context.Context) (*signing.PublicCertificates, error) {
	return signing.FetchCertificatesForServiceAccount(ctx, s.srv.runningAs)
}

// ServiceInfo returns information about the current service.
func (s *signerImpl) ServiceInfo(ctx context.Context) (*signing.ServiceInfo, error) {
	return &signing.ServiceInfo{
		AppID:              s.srv.Options.CloudProject,
		AppRuntime:         "go",
		AppRuntimeVersion:  runtime.Version(),
		AppVersion:         s.srv.Options.ImageVersion(),
		ServiceAccountName: s.srv.runningAs,
	}, nil
}

// actorTokensImpl implements auth.ActorTokensProvider using IAM Credentials.
type actorTokensImpl struct {
	iamClient *credentials.IamCredentialsClient
}

// GenerateAccessToken generates an access token for the given account.
func (a *actorTokensImpl) GenerateAccessToken(ctx context.Context, serviceAccount string, scopes, delegates []string) (*oauth2.Token, error) {
	resp, err := a.iamClient.GenerateAccessToken(ctx, &credentialspb.GenerateAccessTokenRequest{
		Name:      "projects/-/serviceAccounts/" + serviceAccount,
		Scope:     scopes,
		Delegates: delegatesList(delegates),
	})
	if err != nil {
		return nil, grpcutil.WrapIfTransient(err)
	}
	return &oauth2.Token{
		AccessToken: resp.AccessToken,
		TokenType:   "Bearer",
		Expiry:      resp.ExpireTime.AsTime(),
	}, nil
}

// GenerateIDToken generates an ID token for the given account.
func (a *actorTokensImpl) GenerateIDToken(ctx context.Context, serviceAccount, audience string, delegates []string) (string, error) {
	resp, err := a.iamClient.GenerateIdToken(ctx, &credentialspb.GenerateIdTokenRequest{
		Name:         "projects/-/serviceAccounts/" + serviceAccount,
		Audience:     audience,
		Delegates:    delegatesList(delegates),
		IncludeEmail: true,
	})
	if err != nil {
		return "", grpcutil.WrapIfTransient(err)
	}
	return resp.Token, nil
}

// delegatesList prepends `projects/-/serviceAccounts/` to emails.
func delegatesList(emails []string) []string {
	if len(emails) == 0 {
		return nil
	}
	out := make([]string, len(emails))
	for i, email := range emails {
		out[i] = "projects/-/serviceAccounts/" + email
	}
	return out
}

// networkAddrsForLog returns a string with IPv4 addresses of local network
// interfaces, if possible.
func networkAddrsForLog() string {
	addrs, err := net.InterfaceAddrs()
	if err != nil {
		return fmt.Sprintf("failed to enumerate network interfaces: %s", err)
	}
	var ips []string
	for _, address := range addrs {
		if ipnet, ok := address.(*net.IPNet); ok && !ipnet.IP.IsLoopback() {
			if ipv4 := ipnet.IP.To4(); ipv4 != nil {
				ips = append(ips, ipv4.String())
			}
		}
	}
	if len(ips) == 0 {
		return "<no IPv4 interfaces>"
	}
	return strings.Join(ips, ", ")
}

// endUserIP extracts end-user IP address from X-Forwarded-For header.
func endUserIP(r auth.RequestMetadata) string {
	// X-Forwarded-For header is set by Cloud Load Balancer and GCP Serverless
	// load balancer and has format:
	//   [<untrusted part>,]<IP that connected to LB>,<unimportant>[,<more>].
	//
	// <untrusted part> may be present if the original request from the Internet
	// comes with X-Forwarded-For header. We can't trust IPs specified there. We
	// assume GCP load balancers sanitize the format of this field though.
	//
	// <IP that connected to LB> is what we are after.
	//
	// <unimportant> is "global forwarding rule external IP" for GKE or
	// the constant "169.254.1.1" for GCP Serverless. We don't care about these.
	//
	// <more> is present only if we proxy the request through more layers of
	// load balancers *while it is already inside GKE cluster*. We assume we don't
	// do that (if we ever do, Options{...} should be extended with a setting that
	// specifies how many layers of load balancers to skip to get to the original
	// IP). On GCP Serverless <more> is always empty.
	//
	// See https://cloud.google.com/load-balancing/docs/https for more info.
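	//
	// Illustrative example (hypothetical addresses): for a header value of
	// "203.0.113.7, 198.51.100.2, 169.254.1.1" the code below returns
	// "198.51.100.2", the second-to-last element, i.e. the IP that connected
	// to the load balancer.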
	forwardedFor := strings.Split(r.Header("X-Forwarded-For"), ",")
	if len(forwardedFor) >= 2 {
		return strings.TrimSpace(forwardedFor[len(forwardedFor)-2])
	}

	// Fall back to the peer IP if X-Forwarded-For is not set. Happens when
	// connecting to the server's port directly from within the cluster.
	ip, _, err := net.SplitHostPort(r.RemoteAddr())
	if err != nil {
		return "0.0.0.0"
	}
	return ip
}

// isHealthCheckerUA returns true for known user agents of health probers.
func isHealthCheckerUA(ua string) bool {
	switch {
	case strings.HasPrefix(ua, "kube-probe/"): // Kubernetes
		return true
	case strings.HasPrefix(ua, "GoogleHC"): // Cloud Load Balancer
		return true
	default:
		return false
	}
}

// resolveDependencies sorts modules based on their dependencies.
//
// It also detects unfulfilled required dependencies.
func resolveDependencies(mods []module.Module) ([]module.Module, error) {
	// Build a map: module.Name => module.Module.
	modules := make(map[module.Name]module.Module, len(mods))
	for _, m := range mods {
		if _, ok := modules[m.Name()]; ok {
			return nil, errors.Reason("duplicate module %q", m.Name()).Err()
		}
		modules[m.Name()] = m
	}

	// Ensure all required dependencies exist, throw away missing optional
	// dependencies. The result is a directed graph that can be topo-sorted.
	graph := map[module.Name][]module.Name{}
	for _, m := range mods {
		for _, d := range m.Dependencies() {
			name := d.Dependency()
			if _, exists := modules[name]; !exists {
				if !d.Required() {
					continue
				}
				return nil, errors.Reason("module %q requires module %q which is not provided", m.Name(), name).Err()
			}
			graph[m.Name()] = append(graph[m.Name()], name)
		}
	}

	sorted := make([]module.Module, 0, len(graph))
	visited := make(map[module.Name]bool, len(graph))

	var visit func(n module.Name)
	visit = func(n module.Name) {
		if !visited[n] {
			visited[n] = true
			for _, dep := range graph[n] {
				visit(dep)
			}
			sorted = append(sorted, modules[n])
		}
	}

	for _, m := range mods {
		visit(m.Name())
	}
	return sorted, nil
}
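
// Illustrative example (hypothetical module names): if module "c" requires "b"
// and "b" requires "a", resolveDependencies on [c, b, a] returns [a, b, c], so
// every module appears after all of its dependencies. A missing required
// dependency is an error, while missing optional dependencies are skipped.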