go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/server/tq/dispatcher.go (about)

     1  // Copyright 2020 The LUCI Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //      http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package tq
    16  
    17  import (
    18  	"context"
    19  	"crypto/sha256"
    20  	"encoding/hex"
    21  	"encoding/json"
    22  	"fmt"
    23  	"io"
    24  	"net/http"
    25  	"regexp"
    26  	"strconv"
    27  	"strings"
    28  	"sync"
    29  	"sync/atomic"
    30  	"time"
    31  
    32  	taskspb "cloud.google.com/go/cloudtasks/apiv2/cloudtaskspb"
    33  	"cloud.google.com/go/pubsub/apiv1/pubsubpb"
    34  	"github.com/GoogleCloudPlatform/opentelemetry-operations-go/propagator"
    35  	"go.opentelemetry.io/otel"
    36  	"go.opentelemetry.io/otel/attribute"
    37  	"go.opentelemetry.io/otel/codes"
    38  	"go.opentelemetry.io/otel/propagation"
    39  	"go.opentelemetry.io/otel/trace"
    40  	"google.golang.org/protobuf/encoding/protojson"
    41  	"google.golang.org/protobuf/proto"
    42  	"google.golang.org/protobuf/reflect/protoreflect"
    43  	"google.golang.org/protobuf/reflect/protoregistry"
    44  	"google.golang.org/protobuf/types/known/timestamppb"
    45  
    46  	"go.chromium.org/luci/common/clock"
    47  	"go.chromium.org/luci/common/data/rand/cryptorand"
    48  	"go.chromium.org/luci/common/errors"
    49  	"go.chromium.org/luci/common/logging"
    50  	"go.chromium.org/luci/common/retry/transient"
    51  
    52  	srvinternal "go.chromium.org/luci/server/internal"
    53  	"go.chromium.org/luci/server/router"
    54  	"go.chromium.org/luci/server/tq/internal"
    55  	"go.chromium.org/luci/server/tq/internal/db"
    56  	"go.chromium.org/luci/server/tq/internal/metrics"
    57  	"go.chromium.org/luci/server/tq/internal/reminder"
    58  )
    59  
const (
	// TraceContextHeader is the name of a header that contains the trace context
	// of a span that produced the task.
	//
	// This header is read only by Dispatcher itself and exists mostly for FYI
	// purposes to help in debugging issues.
	TraceContextHeader = "X-Luci-Tq-Trace-Context"

	// ExpectedETAHeader is the name of a header that indicates when the task was
	// originally expected to run.
	//
	// One use of this header is for measuring latency of task completion.
	ExpectedETAHeader = "X-Luci-Tq-Expected-ETA"
)
    74  
// Dispatcher is a registry of task classes that knows how to serialize and
// route them.
//
// There's rarely a need to manually create instances of Dispatcher outside of
// Dispatcher's own tests. You should generally use the global Default
// dispatcher which is configured by the tq server module. Methods of the
// default dispatcher (such as RegisterTaskClass and AddTask) are also available
// as top-level functions, prefer to use them.
//
// The dispatcher needs a way to submit tasks to Cloud Tasks or Cloud PubSub.
// This is the job of Submitter. It lives in the context, so that it can be
// mocked in tests. In production contexts (setup when using the tq server
// module), the submitter is initialized to be CloudSubmitter. Tests will need
// to provide their own submitter (usually via TestingContext).
//
// TODO(vadimsh): Support consuming PubSub tasks, not just producing them.
type Dispatcher struct {
	// Sweeper knows how to sweep transactional task reminders.
	//
	// If not set, Sweep calls will fail.
	Sweeper Sweeper

	// Namespace is a namespace for tasks that use DeduplicationKey.
	//
	// This is needed if two otherwise independent deployments share a single
	// Cloud Tasks instance.
	//
	// Used only for Cloud Tasks tasks. Doesn't affect PubSub tasks.
	//
	// Must be valid per ValidateNamespace. Default is "".
	Namespace string

	// GAE is true when running on Appengine.
	//
	// It alters how tasks are submitted and how incoming HTTP requests are
	// authenticated.
	GAE bool

	// DisableAuth can be used to disable authentication on HTTP endpoints.
	//
	// This is useful when running in development mode on localhost or in tests.
	DisableAuth bool

	// CloudProject is an ID of a project to use to construct full resource
	// names.
	//
	// If not set, "default" will be used, which is pretty useless outside of
	// tests.
	CloudProject string

	// CloudRegion is an ID of a region to use to construct full resource names.
	//
	// If not set, "default" will be used, which is pretty useless outside of
	// tests.
	CloudRegion string

	// DefaultRoutingPrefix is a URL prefix for produced Cloud Tasks.
	//
	// Used only for Cloud Tasks tasks whose TaskClass doesn't provide some custom
	// RoutingPrefix. Doesn't affect PubSub tasks.
	//
	// Default is "/internal/tasks/t/". It means generated Cloud Tasks by default
	// will have target URL "/internal/tasks/t/<generated-per-task-suffix>".
	//
	// A non-default value may be valuable if you host multiple dispatchers in
	// a single process. This is a niche use case.
	DefaultRoutingPrefix string

	// DefaultTargetHost is a hostname to dispatch Cloud Tasks to by default.
	//
	// Individual Cloud Tasks task classes may override it with their own specific
	// host. Doesn't affect PubSub tasks.
	//
	// On GAE defaults to the GAE application itself. Elsewhere defaults to
	// "127.0.0.1", which is pretty useless outside of tests.
	DefaultTargetHost string

	// PushAs is a service account email to be used for generating OIDC tokens.
	//
	// Used only for Cloud Tasks tasks. Doesn't affect PubSub tasks.
	//
	// The service account must be within the same project. The server account
	// must have "iam.serviceAccounts.actAs" permission for PushAs account.
	//
	// Optional on GAE when submitting tasks targeting GAE. Elsewhere defaults to
	// "default@example.com", which is pretty useless outside of tests.
	PushAs string

	// AuthorizedPushers is a list of service account emails to accept pushes from
	// in addition to PushAs.
	//
	// This is handy when migrating from one PushAs account to another, or when
	// submitting tasks from one service, but handling them in another.
	//
	// Optional.
	AuthorizedPushers []string

	// SweepInitiationLaunchers is a list of service account emails authorized to
	// launch sweeps via the exposed HTTP endpoint.
	SweepInitiationLaunchers []string

	// mu guards the task class registry maps below.
	mu       sync.RWMutex
	clsByID  map[string]*taskClassImpl                   // registered classes keyed by TaskClass.ID
	clsByTyp map[protoreflect.MessageType]*taskClassImpl // registered classes keyed by Prototype's message type
}
   179  
// Sweeper knows how to sweep transactional task reminders.
//
// Its only method is unexported, so it cannot be implemented directly by types
// declared outside of this package.
type Sweeper interface {
	// sweep either performs the full sweep itself or schedules a task to do it.
	sweep(ctx context.Context, s Submitter, reminderKeySpaceBytes int) error
}
   185  
// TaskKind describes how a task class interoperates with transactions.
//
// The zero value is not a valid kind: RegisterTaskClass panics if
// TaskClass.Kind is left as 0.
type TaskKind int

const (
	// NonTransactional is a task kind for tasks that must be enqueued outside
	// of a transaction.
	NonTransactional TaskKind = 1

	// Transactional is a task kind for tasks that must be enqueued only from
	// a transaction.
	Transactional TaskKind = 2

	// FollowsContext is a task kind for tasks that are enqueued transactionally
	// if the context is transactional or non-transactionally otherwise.
	FollowsContext TaskKind = 3
)
   202  
// TaskClass defines how to treat tasks of a specific proto message type.
//
// It assigns some stable ID to a proto message kind and also defines how tasks
// of this kind should be submitted and routed.
//
// There are two backends for tasks: Cloud Tasks and Cloud PubSub. Which one to
// use for a particular task class is defined via mutually exclusive Queue and
// Topic fields.
//
// Refer to Google Cloud documentation for all semantic differences between
// Cloud Tasks and Cloud PubSub. One important difference is that Cloud PubSub
// tasks can't be deduplicated and thus the handler must expect to receive
// duplicates.
type TaskClass struct {
	// ID is a unique identifier of this class of tasks.
	//
	// Must match `[a-zA-Z0-9_\-.]{1,100}`.
	//
	// It is used to decide how to deserialize and route the task. Changing IDs of
	// existing task classes is a disruptive operation, make sure the queue is
	// drained first. The dispatcher will reject Cloud Tasks with unrecognized
	// class IDs with HTTP 404 error (causing Cloud Tasks to retry them later).
	//
	// Required.
	ID string

	// Prototype identifies a proto message type of a task payload.
	//
	// Used for its type information only. In particular it is used by AddTask
	// to discover what TaskClass matches the added task. There should be
	// one-to-one correspondence between proto message types and task classes.
	//
	// It is safe to arbitrarily change this type as long as JSONPB encoding of
	// the previous type can be decoded using the new type. The dispatcher will
	// reject Cloud Tasks with bodies it can't deserialize with HTTP 400 error
	// (causing Cloud Tasks to retry them later).
	//
	// Required.
	Prototype proto.Message

	// Kind indicates whether the task requires a transaction to be enqueued.
	//
	// Note that using transactional tasks requires setting up a sweeper first
	// and importing a module that implements transactions support for the
	// database you are using. See "Transactional tasks" section above.
	//
	// Required. Pick one of NonTransactional, Transactional or FollowsContext.
	Kind TaskKind

	// Queue is a name of Cloud Tasks queue to use for the tasks.
	//
	// If set, indicates the task should be submitted through Cloud Tasks API.
	// The queue must exist already in this case. Can't be set together with
	// Topic.
	//
	// It can either be a short name like "default" or a full name like
	// "projects/<project>/locations/<region>/queues/<name>". If it is a full
	// name, it must have the above format or RegisterTaskClass would panic.
	//
	// If it is a short queue name, the full queue name will be constructed using
	// dispatcher's CloudProject and CloudRegion if they are set.
	Queue string

	// Topic is a name of PubSub topic to use for the tasks.
	//
	// If set, indicates the task should be submitted through Cloud PubSub API.
	// The topic must exist already in this case. Can't be set together with
	// Queue.
	//
	// It can either be a short name like "tasks" or a full name like
	// "projects/<project>/topics/<name>". If it is a full name, it must have the
	// above format or RegisterTaskClass would panic.
	Topic string

	// RoutingPrefix is a URL prefix for produced Cloud Tasks.
	//
	// Can only be used for Cloud Tasks task (i.e. only if Queue is also set).
	// Must start with "/" if set, or RegisterTaskClass would panic.
	//
	// Default is dispatcher's DefaultRoutingPrefix which itself defaults to
	// "/internal/tasks/t/". It means generated Cloud Tasks by default will have
	// target URL "/internal/tasks/t/<generated-per-task-suffix>".
	//
	// A non-default value can be used to route Cloud Tasks tasks of a particular
	// class to particular processes, assuming the load balancer is configured
	// accordingly.
	RoutingPrefix string

	// TargetHost is a hostname to dispatch Cloud Tasks to.
	//
	// Can only be used for Cloud Tasks task (i.e. only if Queue is also set).
	//
	// If unset, will use dispatcher's DefaultTargetHost.
	TargetHost string

	// Quiet, if set, instructs the dispatcher not to log bodies of tasks.
	Quiet bool

	// QuietOnError, if set, instructs the dispatcher not to log errors returned
	// by the task handler.
	//
	// This is useful if task handler wants to do its own custom error logging.
	QuietOnError bool

	// Custom, if given, will be called to generate a custom payload from the
	// task's proto payload.
	//
	// Useful for interoperability with existing code that doesn't use dispatcher
	// or if the tasks are meant to be consumed in some custom way. You'll need to
	// setup the consumer manually, the Dispatcher doesn't know how to handle
	// tasks with custom payload.
	//
	// For Cloud Tasks tasks it is possible to customize HTTP method, relative
	// URI, headers and the request body this way. Other properties of the task
	// (such as the target host, the queue, the task name, authentication headers)
	// are not customizable.
	//
	// For PubSub tasks it is possible to customize only task's body and
	// attributes (via CustomPayload.Meta). Other fields in CustomPayload are
	// ignored.
	//
	// Receives the exact same context as passed to AddTask. If it returns a nil
	// result, the task will be submitted as usual.
	Custom func(ctx context.Context, m proto.Message) (*CustomPayload, error)

	// Handler will be called by the dispatcher to execute the tasks.
	//
	// The handler will receive the task's payload as a proto message of the exact
	// same type as the type of Prototype. See Handler doc for more info.
	//
	// Populating this field is equivalent to calling AttachHandler after
	// registering the class. It may be left nil if the current process just wants
	// to submit tasks, but not handle them. Some other process would need to
	// attach the handler then to be able to process tasks.
	//
	// The dispatcher will permanently fail tasks if it can't find a handler for
	// them.
	Handler Handler
}
   341  
// CustomPayload is returned by TaskClass's Custom, see its doc.
type CustomPayload struct {
	Method      string            // e.g. "GET" or "POST", Cloud Tasks only
	RelativeURI string            // a URI relative to the task's target host, Cloud Tasks only
	Meta        map[string]string // HTTP headers or message attributes to attach
	Body        []byte            // serialized body of the request
}
   349  
// TaskClassRef represents a TaskClass registered in a Dispatcher.
type TaskClassRef interface {
	// AttachHandler sets a handler which will be called by the dispatcher to
	// execute the tasks.
	//
	// The handler will receive the task's payload as a proto message of the exact
	// same type as the type of TaskClass's Prototype. See Handler doc for more
	// info.
	//
	// Panics if the class already has a handler attached.
	AttachHandler(h Handler)

	// Definition returns the original task class definition.
	Definition() TaskClass
}
   365  
// Task contains task body and metadata.
type Task struct {
	// Payload is the task's payload as well as an indicator of its class.
	//
	// Its type will be used to find a matching registered TaskClass which defines
	// how to route and handle the task.
	Payload proto.Message

	// DeduplicationKey is an optional unique key used to derive the name of the
	// task.
	//
	// If a task of a given class with a given key has already been enqueued
	// recently (within ~1h), this task will be silently ignored.
	//
	// Because there is an extra lookup cost to identify duplicate task names,
	// enqueues of named tasks have significantly increased latency.
	//
	// Can be used only with Cloud Tasks tasks, since PubSub doesn't support
	// deduplication during enqueuing.
	//
	// Named tasks can only be used outside of transactions.
	DeduplicationKey string

	// Title is an optional string that identifies the task in server logs.
	//
	// For Cloud Tasks it will also show up as a suffix in task handler URL. It
	// exists exclusively to simplify reading server logs. It serves no other
	// purpose! In particular, it is *not* a task name.
	//
	// Handlers won't ever see it. Pass all information through the payload.
	Title string

	// Delay specifies the duration the Cloud Tasks service must wait before
	// attempting to execute the task.
	//
	// Can be used only with Cloud Tasks tasks. Either Delay or ETA may be set,
	// but not both.
	Delay time.Duration

	// ETA specifies the earliest time a task may be executed.
	//
	// Can be used only with Cloud Tasks tasks. Either Delay or ETA may be set,
	// but not both.
	ETA time.Time
}
   410  
var (
	// Fatal is an error tag used to indicate that the handler wants the task to
	// be dropped due to an unrecoverable failure.
	//
	// See Handler doc for more details.
	Fatal = errors.BoolTag{Key: errors.NewTagKey("the task should be dropped due to fatal failure")}

	// Ignore is an error tag used to indicate that the handler wants the task to
	// be dropped as no longer needed.
	//
	// See Handler doc for more details.
	Ignore = errors.BoolTag{Key: errors.NewTagKey("the task should be dropped as no longer needed")}
)

// Tags used to override the HTTP status of some errors.
var (
	httpStatusKey = errors.NewTagKey("http status override")
	httpStatus404 = errors.TagValue{Key: httpStatusKey, Value: 404}
	httpStatus400 = errors.TagValue{Key: httpStatusKey, Value: 400}
)

// quietOnError is an error tag used to implement TaskClass.QuietOnError.
var quietOnError = errors.BoolTag{Key: errors.NewTagKey("QuietOnError")}
   434  
// Handler is called to handle one enqueued task.
//
// If Handler returns an error tagged with the Ignore tag, the task will be
// dropped with HTTP 204 reply to Cloud Tasks. This is useful when the task is
// no longer needed yet it's desirable to distinguish such a case from the
// normal case for monitoring purposes (e.g. in emitted logs or tsmon metrics).
//
// If Handler returns an error tagged with the Fatal tag, the task will be
// dropped with HTTP 202 reply to Cloud Tasks. This should be rarely used.
//
// Otherwise, the task will be retried later (per the queue configuration) with
// HTTP 429 reply.
//
// Errors tagged with transient.Tag result in HTTP 500 replies. They also
// trigger a retry.
type Handler func(ctx context.Context, payload proto.Message) error
   451  
// ExecutionInfo is parsed from the incoming task's metadata.
//
// It is accessible from within task handlers via TaskExecutionInfo(ctx).
type ExecutionInfo struct {
	// ExecutionCount is 0 on a first delivery attempt and increased by 1 for each
	// failed attempt.
	ExecutionCount int

	// TaskID is the ID of the task in the underlying backend service.
	//
	// For Cloud Tasks, it is `X-CloudTasks-TaskName`.
	// For PubSub, it is `messageID`.
	TaskID string

	taskRetryReason       string    // X-CloudTasks-TaskRetryReason
	taskPreviousResponse  string    // X-CloudTasks-TaskPreviousResponse
	submitterTraceContext string    // see TraceContextHeader
	expectedETA           time.Time // see ExpectedETAHeader
}
   471  
   472  var executionInfoKey = "go.chromium.org/luci/server/tq.ExecutionInfo"
   473  
   474  // TaskExecutionInfo returns information about the currently executing task.
   475  //
   476  // Returns nil if called not from a task handler.
   477  func TaskExecutionInfo(ctx context.Context) *ExecutionInfo {
   478  	info, _ := ctx.Value(&executionInfoKey).(*ExecutionInfo)
   479  	return info
   480  }
   481  
   482  // ValidateNamespace returns an error if `n` is not a valid namespace name.
   483  //
   484  // An empty string is a valid namespace (denoting the default namespace). Other
   485  // valid namespaces must start with an ASCII letter or '_', contain only
   486  // ASCII letters, digits or '_', and be less than 50 chars in length.
   487  func ValidateNamespace(n string) error {
   488  	if n != "" && !namespaceRe.MatchString(n) {
   489  		return errors.New("must start with a letter or '_' and contain only letters, numbers and '_'")
   490  	}
   491  	return nil
   492  }
   493  
   494  // RegisterTaskClass tells the dispatcher how to route and handle tasks of some
   495  // particular type.
   496  //
   497  // Intended to be called during process startup. Panics if there's already
   498  // a registered task class with the same ID or Prototype.
   499  func (d *Dispatcher) RegisterTaskClass(cls TaskClass) TaskClassRef {
   500  	if !taskClassIDRe.MatchString(cls.ID) {
   501  		panic(fmt.Sprintf("bad TaskClass ID %q", cls.ID))
   502  	}
   503  	if cls.Prototype == nil {
   504  		panic("TaskClass Prototype must be set")
   505  	}
   506  	if cls.RoutingPrefix != "" && !strings.HasPrefix(cls.RoutingPrefix, "/") {
   507  		panic("TaskClass RoutingPrefix must start with /")
   508  	}
   509  	if cls.Kind == 0 {
   510  		panic("TaskClass Kind is required")
   511  	}
   512  
   513  	var backend taskBackend
   514  	switch {
   515  	case cls.Queue == "" && cls.Topic == "":
   516  		panic("TaskClass must have either Queue or Topic set")
   517  	case cls.Queue != "" && cls.Topic != "":
   518  		panic("TaskClass must have either Queue or Topic set, not both")
   519  	case cls.Queue != "":
   520  		backend = backendCloudTasks
   521  		if strings.ContainsRune(cls.Queue, '/') && !isValidQueue(cls.Queue) {
   522  			panic(fmt.Sprintf("not a valid full queue name %q", cls.Queue))
   523  		}
   524  	case cls.Topic != "":
   525  		backend = backendPubSub
   526  		if strings.ContainsRune(cls.Topic, '/') && !isValidTopic(cls.Topic) {
   527  			panic(fmt.Sprintf("not a valid full topic name %q", cls.Topic))
   528  		}
   529  		if cls.RoutingPrefix != "" {
   530  			panic("PubSub tasks do not support RoutingPrefix")
   531  		}
   532  		if cls.TargetHost != "" {
   533  			panic("PubSub tasks do not support TargetHost")
   534  		}
   535  	}
   536  
   537  	typ := cls.Prototype.ProtoReflect().Type()
   538  
   539  	d.mu.Lock()
   540  	defer d.mu.Unlock()
   541  
   542  	if d.clsByID == nil {
   543  		d.clsByID = make(map[string]*taskClassImpl, 1)
   544  	}
   545  	if d.clsByTyp == nil {
   546  		d.clsByTyp = make(map[protoreflect.MessageType]*taskClassImpl, 1)
   547  	}
   548  
   549  	if _, ok := d.clsByID[cls.ID]; ok {
   550  		panic(fmt.Sprintf("TaskClass with ID %q is already registered", cls.ID))
   551  	}
   552  	if _, ok := d.clsByTyp[typ]; ok {
   553  		panic(fmt.Sprintf("TaskClass with Prototype %q is already registered", proto.MessageName(cls.Prototype)))
   554  	}
   555  
   556  	impl := &taskClassImpl{
   557  		TaskClass: cls,
   558  		disp:      d,
   559  		protoType: typ,
   560  		backend:   backend,
   561  	}
   562  	d.clsByID[cls.ID] = impl
   563  	d.clsByTyp[typ] = impl
   564  	return impl
   565  }
   566  
   567  // TaskClassRef returns a task class reference given its ID or nil if no such
   568  // task class is registered.
   569  func (d *Dispatcher) TaskClassRef(id string) TaskClassRef {
   570  	impl, _, _ := d.classByID(id)
   571  	if impl == nil {
   572  		return nil
   573  	}
   574  	return impl
   575  }
   576  
   577  // AddTask submits a task for later execution.
   578  //
   579  // The task payload type should match some registered TaskClass. Its ID will
   580  // be used to identify the task class in the serialized Cloud Tasks task body.
   581  //
   582  // At some later time, in some other process, the dispatcher will invoke
   583  // a handler attached to the corresponding TaskClass, based on its ID extracted
   584  // from the task body.
   585  //
   586  // If the given context is transactional, inherits the transaction if allowed
   587  // according to the TaskClass's Kind. A transactional task will eventually be
   588  // submitted to Cloud Tasks if and only if the transaction successfully commits.
   589  // This requires a sweeper instance to be running somewhere, see ModuleOptions.
   590  // Note that a failure to submit the task to Cloud Tasks will not abort
   591  // the transaction.
   592  //
   593  // If the task has a DeduplicationKey and there already was a recent task with
   594  // the same TaskClass ID and DeduplicationKey, silently ignores the added task.
   595  // This works only outside of transactions. Using DeduplicationKey with
   596  // transactional tasks results in an error.
   597  //
   598  // Annotates retriable errors with transient.Tag.
   599  func (d *Dispatcher) AddTask(ctx context.Context, task *Task) (err error) {
   600  	sub, err := currentSubmitter(ctx)
   601  	if err != nil {
   602  		return err
   603  	}
   604  
   605  	// Start a span annotated with the task's class.
   606  	cls, _, err := d.classByMsg(task.Payload)
   607  	if err != nil {
   608  		return err
   609  	}
   610  	ctx, span := startSpan(ctx, "go.chromium.org/luci/server/tq.AddTask", map[string]string{
   611  		"cr.dev.class": cls.ID,
   612  		"cr.dev.title": task.Title,
   613  	})
   614  	defer func() {
   615  		if err != nil {
   616  			span.RecordError(err)
   617  			span.SetStatus(codes.Error, err.Error())
   618  		}
   619  		span.End()
   620  	}()
   621  
   622  	// Prepare a raw request. We'll either submit it right away (for non-tx
   623  	// tasks), or attach it to a reminder and store in the DB for later handling.
   624  	payload, err := d.prepPayload(ctx, cls, task)
   625  	if err != nil {
   626  		return err
   627  	}
   628  
   629  	// Examine the context to see if we are inside a transaction.
   630  	txndb := db.TxnDB(ctx)
   631  	switch cls.Kind {
   632  	case FollowsContext:
   633  		// do nothing, will use `txndb` if it is non-nil
   634  	case Transactional:
   635  		if txndb == nil {
   636  			if !db.Configured() {
   637  				return errors.Reason("enqueuing of tasks %q requires transactions support, "+
   638  					"see https://pkg.go.dev/go.chromium.org/luci/server/tq#hdr-Transactional_tasks", cls.ID).Err()
   639  			}
   640  			return errors.Reason("enqueuing of tasks %q must be done from inside a transaction", cls.ID).Err()
   641  		}
   642  	case NonTransactional:
   643  		if txndb != nil {
   644  			return errors.Reason("enqueuing of tasks %q must be done outside of a transaction", cls.ID).Err()
   645  		}
   646  	default:
   647  		panic(fmt.Sprintf("unrecognized TaskKind %v", cls.Kind))
   648  	}
   649  
   650  	// If not inside a transaction, submit the task right away.
   651  	if txndb == nil {
   652  		return internal.Submit(ctx, sub, payload, internal.TxnPathNone)
   653  	}
   654  
   655  	// Named transactional tasks are not supported.
   656  	if task.DeduplicationKey != "" {
   657  		return errors.Reason("when enqueuing %q: can't use DeduplicationKey for a transactional task", cls.ID).Err()
   658  	}
   659  
   660  	// Otherwise transactionally commit a reminder and schedule a best-effort
   661  	// post-transaction enqueuing of the actual task. If it fails, the sweeper
   662  	// will eventually discover the reminder and enqueue the task. Note that this
   663  	// modifies `payload` with the reminder's ID.
   664  	r, err := d.attachToReminder(ctx, payload)
   665  	if err != nil {
   666  		return errors.Annotate(err, "failed to prepare a reminder").Err()
   667  	}
   668  	span.SetAttributes(attribute.String("cr.dev.reminder", r.ID))
   669  	if err := txndb.SaveReminder(ctx, r); err != nil {
   670  		return errors.Annotate(err, "failed to store a transactional enqueue reminder").Err()
   671  	}
   672  
   673  	once := int32(0)
   674  	txndb.Defer(ctx, func(ctx context.Context) {
   675  		if count := atomic.AddInt32(&once, 1); count > 1 {
   676  			panic("transaction defer has already been called")
   677  		}
   678  
   679  		// `ctx` here is an outer non-transactional context.
   680  		var err error
   681  		ctx, span := startSpan(ctx, "go.chromium.org/luci/server/tq.PostTxn", map[string]string{
   682  			"cr.dev.class":    cls.ID,
   683  			"cr.dev.title":    task.Title,
   684  			"cr.dev.reminder": r.ID,
   685  		})
   686  		defer func() {
   687  			if err != nil {
   688  				span.RecordError(err)
   689  				span.SetStatus(codes.Error, err.Error())
   690  			}
   691  			span.End()
   692  		}()
   693  
   694  		// Attempt to submit the task right away if the reminder is still fresh.
   695  		err = internal.ProcessReminderPostTxn(ctx, sub, txndb, r)
   696  	})
   697  
   698  	return nil
   699  }
   700  
   701  // Sweep initiates a sweep of transactional tasks reminders.
   702  //
   703  // It must be called periodically (e.g. once per minute) somewhere in the fleet.
   704  func (d *Dispatcher) Sweep(ctx context.Context) error {
   705  	if d.Sweeper == nil {
   706  		return errors.New("can't sweep: the Sweeper is not set")
   707  	}
   708  	sub, err := currentSubmitter(ctx)
   709  	if err != nil {
   710  		return err
   711  	}
   712  	return d.Sweeper.sweep(ctx, sub, reminderKeySpaceBytes)
   713  }
   714  
   715  // InstallTasksRoutes installs tasks HTTP routes under the given prefix.
   716  //
   717  // The exposed HTTP endpoints are called by Cloud Tasks service when it is time
   718  // to execute a task.
   719  func (d *Dispatcher) InstallTasksRoutes(r *router.Router, prefix string) {
   720  	if prefix == "" {
   721  		prefix = "/internal/tasks/"
   722  	} else if !strings.HasPrefix(prefix, "/") {
   723  		panic("the prefix should start with /")
   724  	}
   725  
   726  	var mw router.MiddlewareChain
   727  	if !d.DisableAuth {
   728  		// Tasks are primarily submitted as `PushAs`, but we also accept all
   729  		// `AuthorizedPushers`.
   730  		pushers := append([]string{d.PushAs}, d.AuthorizedPushers...)
   731  		// On GAE X-Appengine-* headers can be trusted. Check we are being called
   732  		// by Cloud Tasks. We don't care by which queue exactly though. It is
   733  		// easier to move tasks between queues that way.
   734  		header := ""
   735  		if d.GAE {
   736  			header = "X-Appengine-Queuename"
   737  		}
   738  		mw = srvinternal.CloudAuthMiddleware(pushers, header, func(c *router.Context) {
   739  			metrics.ServerRejectedCount.Add(c.Request.Context(), 1, "auth")
   740  		})
   741  	}
   742  
   743  	// We don't really care about the exact format of URLs. At the same time
   744  	// accepting all requests under InternalRoutingPrefix is necessary for
   745  	// compatibility with "appengine/tq" which used totally different URL format.
   746  	prefix = strings.TrimRight(prefix, "/") + "/*path"
   747  	r.POST(prefix, mw, func(c *router.Context) {
   748  		body, err := io.ReadAll(c.Request.Body)
   749  		if err != nil {
   750  			httpReply(c, 500, "Failed to read the request", err)
   751  		} else {
   752  			replyWithErr(c, d.handlePush(c.Request.Context(), body, parseHeaders(c.Request.Header)))
   753  		}
   754  	})
   755  }
   756  
   757  // InstallSweepRoute installs a route that initiates a sweep.
   758  //
   759  // It may be called periodically (e.g. by Cloud Scheduler) to launch sweeps.
   760  func (d *Dispatcher) InstallSweepRoute(r *router.Router, path string) {
   761  	var mw router.MiddlewareChain
   762  	if !d.DisableAuth {
   763  		// On GAE X-Appengine-* headers can be trusted. Check we are being called
   764  		// by Cloud Scheduler.
   765  		header := ""
   766  		if d.GAE {
   767  			header = "X-Appengine-Cron"
   768  		}
   769  		mw = srvinternal.CloudAuthMiddleware(d.SweepInitiationLaunchers, header, nil)
   770  	}
   771  
   772  	r.GET(path, mw, func(c *router.Context) {
   773  		err := d.Sweep(c.Request.Context())
   774  		if err != nil && !transient.Tag.In(err) {
   775  			err = Fatal.Apply(err)
   776  		}
   777  		replyWithErr(c, err)
   778  	})
   779  }
   780  
   781  // ReportMetrics writes gauge metrics to tsmon.
   782  //
   783  // This should be called before tsmon flush. By reporting them only here, we
   784  // can avoid hitting tsmon state every time some gauge value changes (which
   785  // can happen very often).
   786  func (d *Dispatcher) ReportMetrics(ctx context.Context) {
   787  	d.mu.RLock()
   788  	defer d.mu.RUnlock()
   789  	for id, cls := range d.clsByID {
   790  		metrics.ServerRunning.Set(ctx, int64(atomic.LoadInt32(&cls.running)), id)
   791  	}
   792  }
   793  
   794  ////////////////////////////////////////////////////////////////////////////////
   795  
var (
	// namespaceRe is used to validate Dispatcher.Namespace.
	//
	// It matches 1 to 50 characters from [0-9a-zA-Z_], the first of which is
	// not a digit.
	namespaceRe = regexp.MustCompile(`^[a-zA-Z_][0-9a-zA-Z_]{0,49}$`)
	// taskClassIDRe is used to validate TaskClass.ID.
	//
	// It matches 1 to 100 characters, each being a letter, a digit, '_', '-'
	// or '.'.
	taskClassIDRe = regexp.MustCompile(`^[a-zA-Z0-9_\-.]{1,100}$`)
	// tracer is used to report tracing spans.
	tracer = otel.Tracer("go.chromium.org/luci/server/tq")
)
   804  
const (
	// reminderKeySpaceBytes defines the space of the Reminder IDs.
	//
	// Because Reminder.ID is hex-encoded, actual length is doubled.
	//
	// 16 is chosen because it is big enough to avoid collisions in practice yet
	// small enough for easier human-debugging of key ranges in queries.
	reminderKeySpaceBytes = 16

	// happyPathMaxDuration caps how long the happy path will be waited for.
	//
	// attachToReminder uses it as the upper bound for a reminder's FreshUntil.
	happyPathMaxDuration = time.Minute
)
   817  
   818  // defaultHeaders returns headers to add to all submitted tasks.
   819  func defaultHeaders() map[string]string {
   820  	return map[string]string{"Content-Type": "application/json"}
   821  }
   822  
   823  // startSpan starts a new span and puts `meta` into its attributes and into
   824  // logger fields.
   825  func startSpan(ctx context.Context, title string, meta map[string]string) (context.Context, trace.Span) {
   826  	attrs := make([]attribute.KeyValue, 0, len(meta))
   827  	fields := make(logging.Fields, len(meta))
   828  	for k, v := range meta {
   829  		attrs = append(attrs, attribute.String(k, v))
   830  		fields[k] = v
   831  	}
   832  	return tracer.Start(logging.SetFields(ctx, fields), title, trace.WithAttributes(attrs...))
   833  }
   834  
   835  // prepPayload converts a task into a reminder.Payload.
   836  func (d *Dispatcher) prepPayload(ctx context.Context, cls *taskClassImpl, t *Task) (*reminder.Payload, error) {
   837  	payload := &reminder.Payload{
   838  		TaskClass: cls.ID,
   839  		Created:   clock.Now(ctx),
   840  		Raw:       t.Payload, // used on a happy path only (essentially only in tests)
   841  	}
   842  	var err error
   843  	switch cls.backend {
   844  	case backendCloudTasks:
   845  		payload.CreateTaskRequest, err = d.prepCloudTasksRequest(ctx, cls, t)
   846  	case backendPubSub:
   847  		payload.PublishRequest, err = d.prepPubSubRequest(ctx, cls, t)
   848  	default:
   849  		panic("impossible")
   850  	}
   851  	return payload, err
   852  }
   853  
// prepCloudTasksRequest prepares Cloud Tasks request based on a *Task.
//
// The request targets the queue named by cls.Queue (expanded via queueID). If
// the task has a DeduplicationKey, the task gets an explicit name derived via
// cls.taskName, otherwise the name is left empty. ETA and Delay are mutually
// exclusive and, if set, define the schedule time of the task.
//
// The HTTP method, URI, headers and body come from cls.Custom if it is set and
// returns a non-nil payload. Otherwise the task is serialized via
// cls.serialize and POSTed to the URI produced by taskTarget. In both cases
// the tracing and the expected ETA headers are added on top.
//
// Returns an error if both ETA and Delay are set, if the queue, the target or
// the payload can't be constructed, or if the resulting HTTP method or
// relative URI are malformed.
func (d *Dispatcher) prepCloudTasksRequest(ctx context.Context, cls *taskClassImpl, t *Task) (*taskspb.CreateTaskRequest, error) {
	queueID, err := d.queueID(cls.Queue)
	if err != nil {
		return nil, err
	}

	// Only tasks with a deduplication key get an explicit name.
	taskID := ""
	if t.DeduplicationKey != "" {
		taskID = queueID + "/tasks/" + cls.taskName(t, d.Namespace)
	}

	// scheduleTime stays nil if neither ETA nor a positive Delay is given.
	var scheduleTime *timestamppb.Timestamp
	switch {
	case !t.ETA.IsZero():
		if t.Delay != 0 {
			return nil, errors.New("bad task: either ETA or Delay should be given, not both")
		}
		scheduleTime = timestamppb.New(t.ETA)
	case t.Delay > 0:
		scheduleTime = timestamppb.New(clock.Now(ctx).Add(t.Delay))
	}

	// E.g. ("example.com", "/internal/tasks/t/<class>[/<title>]").
	// Note: relativeURI is discarded when using custom payload.
	host, relativeURI, err := d.taskTarget(cls, t)
	if err != nil {
		return nil, err
	}

	// A custom serializer may return nil to ask for the default payload.
	var payload *CustomPayload
	if cls.Custom != nil {
		if payload, err = cls.Custom(ctx, t.Payload); err != nil {
			return nil, err
		}
	}
	if payload == nil {
		// This is not really a "custom" payload, we are just reusing the struct.
		payload = &CustomPayload{
			Method:      "POST",
			RelativeURI: relativeURI,
			Meta:        defaultHeaders(),
		}
		if payload.Body, err = cls.serialize(t); err != nil {
			return nil, err
		}
	} else {
		// We'll likely be mutating the headers below, make a copy.
		meta := make(map[string]string, len(payload.Meta))
		for k, v := range payload.Meta {
			meta[k] = v
		}
		payload.Meta = meta
	}

	// Inject tracing headers.
	if traceCtx := traceContext(ctx); traceCtx != "" {
		payload.Meta[TraceContextHeader] = traceCtx
	}

	// Inject magic header with ETA. Tasks without an explicit schedule time are
	// expected to run right away, so the current time is used for them.
	if scheduleTime == nil {
		payload.Meta[ExpectedETAHeader] = makeETAHeader(clock.Now(ctx))
	} else {
		payload.Meta[ExpectedETAHeader] = makeETAHeader(scheduleTime.AsTime())
	}

	// A method name missing from HttpMethod_value maps to 0 and is rejected.
	method := taskspb.HttpMethod(taskspb.HttpMethod_value[payload.Method])
	if method == 0 {
		return nil, errors.Reason("bad HTTP method %q", payload.Method).Err()
	}
	if !strings.HasPrefix(payload.RelativeURI, "/") {
		return nil, errors.Reason("bad relative URI %q", payload.RelativeURI).Err()
	}

	// We need to populate one of Task.MessageType oneof alternatives. It has
	// unexported type, so we have to instantiate the message now and then mutate
	// it.
	req := &taskspb.CreateTaskRequest{
		Parent: queueID,
		Task: &taskspb.Task{
			Name:         taskID,
			ScheduleTime: scheduleTime,
			// TODO(vadimsh): Make DispatchDeadline configurable?
		},
	}

	// On GAE we by default push to the GAE itself.
	if host == "" && d.GAE {
		req.Task.MessageType = &taskspb.Task_AppEngineHttpRequest{
			AppEngineHttpRequest: &taskspb.AppEngineHttpRequest{
				HttpMethod:  method,
				RelativeUri: payload.RelativeURI,
				Headers:     payload.Meta,
				Body:        payload.Body,
			},
		}
		return req, nil
	}

	// Elsewhere pick up some defaults mostly used only in tests.
	if host == "" {
		host = "127.0.0.1"
	}
	pushAs := d.PushAs
	if d.PushAs == "" {
		pushAs = "default@example.com"
	}

	// The push is authenticated with an OIDC token of the `pushAs` account.
	req.Task.MessageType = &taskspb.Task_HttpRequest{
		HttpRequest: &taskspb.HttpRequest{
			HttpMethod: method,
			Url:        "https://" + host + payload.RelativeURI,
			Headers:    payload.Meta,
			Body:       payload.Body,
			AuthorizationHeader: &taskspb.HttpRequest_OidcToken{
				OidcToken: &taskspb.OidcToken{
					ServiceAccountEmail: pushAs,
				},
			},
		},
	}
	return req, nil
}
   978  
   979  // makeETAHeader converts the given time into a decimal string representing
   980  // the number of seconds since the Unix epoch with microsecond resolution.
   981  func makeETAHeader(t time.Time) string {
   982  	mics := t.UnixNano() / 1000
   983  	return fmt.Sprintf("%d.%06d", mics/1e6, mics%1e6)
   984  }
   985  
   986  // queueID expands `id` into a full queue name if necessary.
   987  func (d *Dispatcher) queueID(id string) (string, error) {
   988  	if strings.HasPrefix(id, "projects/") {
   989  		return id, nil // already full name
   990  	}
   991  	project := d.CloudProject
   992  	if project == "" {
   993  		project = "default"
   994  	}
   995  	region := d.CloudRegion
   996  	if region == "" {
   997  		region = "default"
   998  	}
   999  	return fmt.Sprintf("projects/%s/locations/%s/queues/%s", project, region, id), nil
  1000  }
  1001  
  1002  // taskTarget constructs a target URL for a task.
  1003  //
  1004  // `host` will be "" if no explicit host is configured anywhere. On GAE this
  1005  // means "send the task back to the GAE app". On non-GAE this indicates to use
  1006  // default "127.0.0.1" which is really usable only in tests.
  1007  func (d *Dispatcher) taskTarget(cls *taskClassImpl, t *Task) (host string, relativeURI string, err error) {
  1008  	if cls.TargetHost != "" {
  1009  		host = cls.TargetHost
  1010  	} else {
  1011  		host = d.DefaultTargetHost
  1012  	}
  1013  
  1014  	pfx := cls.RoutingPrefix
  1015  	if pfx == "" {
  1016  		pfx = d.DefaultRoutingPrefix
  1017  	}
  1018  	if pfx == "" {
  1019  		pfx = "/internal/tasks/t/"
  1020  	}
  1021  
  1022  	if !strings.HasPrefix(pfx, "/") {
  1023  		return "", "", errors.Reason("bad routing prefix %q: must start with /", pfx).Err()
  1024  	}
  1025  	if !strings.HasSuffix(pfx, "/") {
  1026  		pfx += "/"
  1027  	}
  1028  
  1029  	relativeURI = pfx + cls.ID
  1030  	switch {
  1031  	case t.Title == "":
  1032  		return
  1033  	case strings.ContainsRune(t.Title, ' '):
  1034  		return "", "", errors.Reason("bad task title %q: must not contain spaces", t.Title).Err()
  1035  	case len(relativeURI)+1+len(t.Title) > 2083:
  1036  		return "", "", errors.Reason("bad task title %q: too long;"+
  1037  			" must not exceed 2083 characters when combined with %q", t.Title, relativeURI).Err()
  1038  	default:
  1039  		relativeURI += "/" + t.Title
  1040  		return
  1041  	}
  1042  }
  1043  
  1044  // prepPubSubRequest prepares Cloud PubSub request based on a *Task.
  1045  func (d *Dispatcher) prepPubSubRequest(ctx context.Context, cls *taskClassImpl, t *Task) (*pubsubpb.PublishRequest, error) {
  1046  	if t.DeduplicationKey != "" {
  1047  		return nil, errors.New("can't use DeduplicationKey with PubSub tasks")
  1048  	}
  1049  	if t.Delay != 0 || !t.ETA.IsZero() {
  1050  		return nil, errors.New("can't use Delay or ETA with PubSub tasks")
  1051  	}
  1052  
  1053  	topicID, err := d.topicID(cls.Topic)
  1054  	if err != nil {
  1055  		return nil, err
  1056  	}
  1057  
  1058  	var payload *CustomPayload
  1059  	if cls.Custom != nil {
  1060  		if payload, err = cls.Custom(ctx, t.Payload); err != nil {
  1061  			return nil, err
  1062  		}
  1063  	}
  1064  	if payload == nil {
  1065  		// This is not really a "custom" payload, we are just reusing the struct.
  1066  		payload = &CustomPayload{}
  1067  		if payload.Body, err = cls.serialize(t); err != nil {
  1068  			return nil, err
  1069  		}
  1070  	}
  1071  
  1072  	msg := &pubsubpb.PubsubMessage{
  1073  		Data:       payload.Body,
  1074  		Attributes: make(map[string]string, len(payload.Meta)+1),
  1075  	}
  1076  	for k, v := range payload.Meta {
  1077  		msg.Attributes[k] = v
  1078  	}
  1079  	if traceCtx := traceContext(ctx); traceCtx != "" {
  1080  		msg.Attributes[TraceContextHeader] = traceCtx
  1081  	}
  1082  
  1083  	return &pubsubpb.PublishRequest{
  1084  		Topic:    topicID,
  1085  		Messages: []*pubsubpb.PubsubMessage{msg},
  1086  	}, nil
  1087  }
  1088  
  1089  // topicID expands `id` into a full topic name if necessary.
  1090  func (d *Dispatcher) topicID(id string) (string, error) {
  1091  	if strings.HasPrefix(id, "projects/") {
  1092  		return id, nil // already full name
  1093  	}
  1094  	project := d.CloudProject
  1095  	if project == "" {
  1096  		project = "default"
  1097  	}
  1098  	return fmt.Sprintf("projects/%s/topics/%s", project, id), nil
  1099  }
  1100  
  1101  // attachToReminder makes a reminder and attaches the payload to it, thus
  1102  // mutating the payload with reminder's ID.
  1103  //
  1104  // Returns the constructed reminder. It will eventually be stored in the
  1105  // database to remind the sweeper to submit the task if best-effort
  1106  // post-transactional submit fails.
  1107  func (d *Dispatcher) attachToReminder(ctx context.Context, payload *reminder.Payload) (*reminder.Reminder, error) {
  1108  	buf := make([]byte, reminderKeySpaceBytes)
  1109  	if _, err := io.ReadFull(cryptorand.Get(ctx), buf); err != nil {
  1110  		return nil, errors.Annotate(err, "failed to get random bytes").Tag(transient.Tag).Err()
  1111  	}
  1112  
  1113  	// Note: length of the generated ID here is different from the length of IDs
  1114  	// we generate when using DeduplicationKey, so there'll be no collisions
  1115  	// between two different sorts of named tasks.
  1116  	r := &reminder.Reminder{ID: hex.EncodeToString(buf)}
  1117  
  1118  	// Bound FreshUntil to at most current context deadline.
  1119  	r.FreshUntil = clock.Now(ctx).Add(happyPathMaxDuration)
  1120  	if deadline, ok := ctx.Deadline(); ok && r.FreshUntil.After(deadline) {
  1121  		// TODO(tandrii): allow propagating custom deadline for the async happy
  1122  		// path which won't bind the context's deadline.
  1123  		r.FreshUntil = deadline
  1124  	}
  1125  	r.FreshUntil = r.FreshUntil.UTC().Truncate(reminder.FreshUntilPrecision)
  1126  
  1127  	return r, r.AttachPayload(payload)
  1128  }
  1129  
  1130  // isValidQueue is true if q looks like "projects/.../locations/.../queues/...".
  1131  func isValidQueue(q string) bool {
  1132  	chunks := strings.Split(q, "/")
  1133  	return len(chunks) == 6 &&
  1134  		chunks[0] == "projects" &&
  1135  		chunks[1] != "" &&
  1136  		chunks[2] == "locations" &&
  1137  		chunks[3] != "" &&
  1138  		chunks[4] == "queues" &&
  1139  		chunks[5] != ""
  1140  }
  1141  
  1142  // isValidTopic is true if t looks like "projects/.../topics/...".
  1143  func isValidTopic(t string) bool {
  1144  	chunks := strings.Split(t, "/")
  1145  	return len(chunks) == 4 &&
  1146  		chunks[0] == "projects" &&
  1147  		chunks[1] != "" &&
  1148  		chunks[2] == "topics" &&
  1149  		chunks[3] != ""
  1150  }
  1151  
// handlePush handles one incoming task.
//
// `body` is the raw request body, expected to be a JSON-serialized envelope,
// see taskClassImpl.serialize(). `info` carries details parsed from request
// headers. A pointer to it is put into the context passed to the handler.
//
// Requests that can't be routed to a handler (not a JSON body, unknown task
// class, no handler attached, malformed payload) are counted in
// metrics.ServerRejectedCount and result in an error tagged with an HTTP
// status. Outcomes of handler invocations are reported to ServerHandledCount,
// ServerDurationMS and, if the expected ETA is known, ServerTaskLatency.
//
// Returns errors annotated in the same style as errors from Handler, see its
// doc.
func (d *Dispatcher) handlePush(ctx context.Context, body []byte, info ExecutionInfo) error {
	// See taskClassImpl.serialize().
	env := envelope{}
	if err := json.Unmarshal(body, &env); err != nil {
		metrics.ServerRejectedCount.Add(ctx, 1, "bad_request")
		return errors.Annotate(err, "not a valid JSON body").Tag(httpStatus400).Err()
	}

	// Find the matching registered task class. Newer tasks always have `class`
	// set. Older ones have `type` instead.
	var cls *taskClassImpl
	var h Handler
	var err error
	if env.Class != "" {
		cls, h, err = d.classByID(env.Class)
	} else if env.Type != "" {
		cls, h, err = d.classByTyp(env.Type)
	} else {
		err = errors.Reason("malformed task body, no class").Tag(httpStatus400).Err()
	}
	if err != nil {
		// The class is unknown here, so cls.Quiet can't be consulted: log the
		// body unconditionally.
		logging.Debugf(ctx, "TQ: %s", body)
		metrics.ServerRejectedCount.Add(ctx, 1, "unknown_class")
		return err
	}

	if !cls.Quiet {
		logging.Debugf(ctx, "TQ: %s", body)
		if info.submitterTraceContext != "" {
			logging.Debugf(ctx, "TQ: submitted at %s", info.submitterTraceContext)
		}
		if info.ExecutionCount != 0 {
			logging.Debugf(ctx, "TQ: this is a retry: %d previous attempt(s) already failed", info.ExecutionCount)
			if info.taskRetryReason != "" || info.taskPreviousResponse != "" {
				logging.Debugf(ctx, "TQ: the previous attempt failed with %s: %s", info.taskPreviousResponse, info.taskRetryReason)
			}
		}
	}

	if h == nil {
		metrics.ServerRejectedCount.Add(ctx, 1, "no_handler")
		return errors.Reason("task class %q exists, but has no handler attached", cls.ID).Tag(httpStatus404).Err()
	}

	msg, err := cls.deserialize(&env)
	if err != nil {
		metrics.ServerRejectedCount.Add(ctx, 1, "bad_payload")
		return errors.Annotate(err, "malformed body of task class %q", cls.ID).Tag(httpStatus400).Err()
	}

	// Track the number of handlers running right now, see ReportMetrics.
	atomic.AddInt32(&cls.running, 1)
	defer atomic.AddInt32(&cls.running, -1)

	ctx = context.WithValue(ctx, &executionInfoKey, &info)

	start := clock.Now(ctx)
	err = h(ctx, msg)
	dur := clock.Now(ctx).Sub(start)

	// Classify the outcome for metrics. The first matching tag wins.
	result := "OK"
	switch {
	case Fatal.In(err):
		result = "fatal"
	case Ignore.In(err):
		result = "ignore"
	case transient.Tag.In(err):
		result = "transient"
	case err != nil:
		result = "retry"
	}

	// Cap the retry count used as a metric field.
	retry := info.ExecutionCount
	if retry > metrics.MaxRetryFieldValue {
		retry = metrics.MaxRetryFieldValue
	}

	metrics.ServerHandledCount.Add(ctx, 1, cls.ID, result, retry)
	metrics.ServerDurationMS.Add(ctx, float64(dur.Milliseconds()), cls.ID, result)
	if !info.expectedETA.IsZero() {
		// Latency is measured from the expected ETA to the moment the handler
		// has finished. Negative values are clamped to zero.
		latency := clock.Since(ctx, info.expectedETA).Milliseconds()
		if latency < 0 {
			latency = 0
		}
		metrics.ServerTaskLatency.Add(ctx, float64(latency), cls.ID, result, retry)
	}

	// httpReply doesn't log errors tagged with quietOnError.
	if err != nil && cls.QuietOnError {
		err = quietOnError.Apply(err)
	}
	return err
}
  1247  
  1248  // classByID returns a task class given its ID or an error if no such class.
  1249  //
  1250  // Reads cls.Handler while under the lock as well, since it may be concurrently
  1251  // modified by AttachHandler.
  1252  func (d *Dispatcher) classByID(id string) (*taskClassImpl, Handler, error) {
  1253  	d.mu.RLock()
  1254  	defer d.mu.RUnlock()
  1255  	if cls := d.clsByID[id]; cls != nil {
  1256  		return cls, cls.Handler, nil
  1257  	}
  1258  	return nil, nil, errors.Reason("no task class with ID %q is registered", id).Tag(httpStatus404).Err()
  1259  }
  1260  
  1261  // classByMsg returns a task class given proto message or an error if no
  1262  // such class.
  1263  //
  1264  // Reads cls.Handler while under the lock as well, since it may be concurrently
  1265  // modified by AttachHandler.
  1266  func (d *Dispatcher) classByMsg(msg proto.Message) (*taskClassImpl, Handler, error) {
  1267  	typ := msg.ProtoReflect().Type()
  1268  	d.mu.RLock()
  1269  	defer d.mu.RUnlock()
  1270  	if cls := d.clsByTyp[typ]; cls != nil {
  1271  		return cls, cls.Handler, nil
  1272  	}
  1273  	return nil, nil, errors.Reason("no task class matching type %q is registered", typ.Descriptor().FullName()).Tag(httpStatus404).Err()
  1274  }
  1275  
  1276  // classByTyp returns a task class given proto message name or an error if no
  1277  // such class.
  1278  //
  1279  // Reads cls.Handler while under the lock as well, since it may be concurrently
  1280  // modified by AttachHandler.
  1281  func (d *Dispatcher) classByTyp(typ string) (*taskClassImpl, Handler, error) {
  1282  	msgTyp, _ := protoregistry.GlobalTypes.FindMessageByName(protoreflect.FullName(typ))
  1283  	if msgTyp == nil {
  1284  		return nil, nil, errors.Reason("no proto message %q is registered", typ).Tag(httpStatus404).Err()
  1285  	}
  1286  	d.mu.RLock()
  1287  	defer d.mu.RUnlock()
  1288  	if cls := d.clsByTyp[msgTyp]; cls != nil {
  1289  		return cls, cls.Handler, nil
  1290  	}
  1291  	return nil, nil, errors.Reason("no task class matching type %q is registered", typ).Tag(httpStatus404).Err()
  1292  }
  1293  
  1294  ////////////////////////////////////////////////////////////////////////////////
  1295  
// taskBackend identifies which service tasks of a class are submitted to.
//
// prepPayload switches on it to decide what kind of request to prepare.
type taskBackend int

// Known backends: Cloud Tasks (tasks become CreateTaskRequest) and PubSub
// (tasks become PublishRequest).
const (
	backendCloudTasks taskBackend = 1
	backendPubSub     taskBackend = 2
)
  1302  
// taskClassImpl knows how to prepare and handle tasks of a particular class.
//
// Besides the embedded TaskClass definition it holds:
//   - disp: the Dispatcher whose mutex guards the Handler field, see
//     AttachHandler.
//   - protoType: the message type deserialize uses to instantiate payloads.
//   - backend: which kind of request prepPayload prepares for this class.
//   - running: a counter updated atomically by handlePush around the handler
//     call and read by ReportMetrics.
type taskClassImpl struct {
	TaskClass
	disp      *Dispatcher
	protoType protoreflect.MessageType
	backend   taskBackend
	running   int32
}
  1311  
// envelope is what we put into all Cloud Tasks.
//
// It is produced by taskClassImpl.serialize (which is also used for PubSub
// message data when there's no custom payload) and consumed by handlePush,
// which routes by Class if it is set and falls back to Type otherwise.
type envelope struct {
	Class string           `json:"class,omitempty"` // ID of TaskClass
	Type  string           `json:"type,omitempty"`  // for compatibility with appengine/tq
	Body  *json.RawMessage `json:"body"`            // JSONPB-serialized Task.Payload
}
  1318  
  1319  // AttachHandler implements TaskClassRef interface.
  1320  func (cls *taskClassImpl) AttachHandler(h Handler) {
  1321  	cls.disp.mu.Lock()
  1322  	defer cls.disp.mu.Unlock()
  1323  	if h == nil {
  1324  		panic("The handler must not be nil")
  1325  	}
  1326  	if cls.Handler != nil {
  1327  		panic("The task class has a handler attached already")
  1328  	}
  1329  	cls.Handler = h
  1330  }
  1331  
  1332  // Definition implements TaskClassRef interface.
  1333  func (cls *taskClassImpl) Definition() TaskClass {
  1334  	return cls.TaskClass
  1335  }
  1336  
  1337  // taskName returns a short ID for the task to use to dedup it.
  1338  func (cls *taskClassImpl) taskName(t *Task, namespace string) string {
  1339  	h := sha256.New()
  1340  	h.Write([]byte(namespace))
  1341  	h.Write([]byte{0})
  1342  	h.Write([]byte(cls.ID))
  1343  	h.Write([]byte{0})
  1344  	h.Write([]byte(t.DeduplicationKey))
  1345  	return hex.EncodeToString(h.Sum(nil))
  1346  }
  1347  
  1348  // serialize serializes the task body into JSONPB.
  1349  func (cls *taskClassImpl) serialize(t *Task) ([]byte, error) {
  1350  	opts := protojson.MarshalOptions{
  1351  		Indent:         "\t",
  1352  		UseEnumNumbers: true,
  1353  	}
  1354  	blob, err := opts.Marshal(t.Payload)
  1355  	if err != nil {
  1356  		return nil, errors.Annotate(err, "failed to serialize %q", proto.MessageName(t.Payload)).Err()
  1357  	}
  1358  	raw := json.RawMessage(blob)
  1359  	return json.MarshalIndent(envelope{
  1360  		Class: cls.ID,
  1361  		Type:  string(proto.MessageName(t.Payload)),
  1362  		Body:  &raw,
  1363  	}, "", "\t")
  1364  }
  1365  
  1366  // deserialize instantiates a proto message based on its serialized body.
  1367  func (cls *taskClassImpl) deserialize(env *envelope) (proto.Message, error) {
  1368  	if env.Body == nil {
  1369  		return nil, errors.Reason("no body").Err()
  1370  	}
  1371  	opts := protojson.UnmarshalOptions{
  1372  		DiscardUnknown: true,
  1373  	}
  1374  	msg := cls.protoType.New().Interface()
  1375  	if err := opts.Unmarshal(*env.Body, msg); err != nil {
  1376  		return nil, err
  1377  	}
  1378  	return msg, nil
  1379  }
  1380  
  1381  ////////////////////////////////////////////////////////////////////////////////
  1382  
  1383  // traceContext returns a tracing context for TraceContextHeader header or "".
  1384  //
  1385  // We use Cloud Trace propagation format.
  1386  func traceContext(ctx context.Context) string {
  1387  	span := trace.SpanContextFromContext(ctx)
  1388  	if !span.IsValid() {
  1389  		return ""
  1390  	}
  1391  	headers := make(propagation.MapCarrier, 1)
  1392  	(propagator.CloudTraceFormatPropagator{}).Inject(ctx, headers)
  1393  	return headers[propagator.TraceContextHeaderName]
  1394  }
  1395  
  1396  // parseHeaders examines headers of the incoming Cloud Tasks push.
  1397  func parseHeaders(h http.Header) ExecutionInfo {
  1398  	magicHeader := func(key string) string {
  1399  		if val := h.Get("X-AppEngine-" + key); val != "" {
  1400  			return val
  1401  		}
  1402  		return h.Get("X-CloudTasks-" + key)
  1403  	}
  1404  
  1405  	var execCount int64
  1406  	if count := magicHeader("TaskExecutionCount"); count != "" {
  1407  		execCount, _ = strconv.ParseInt(count, 10, 32)
  1408  	}
  1409  
  1410  	var eta time.Time
  1411  	if s := h.Get(ExpectedETAHeader); s != "" {
  1412  		// Expected format is "<seconds(int64)>.<microseconds(int32)>".
  1413  		parts := strings.Split(s, ".")
  1414  		if len(parts) == 2 {
  1415  			secs, errS := strconv.ParseInt(parts[0], 10, 64)
  1416  			micros, errM := strconv.ParseInt(parts[1], 10, 32)
  1417  			if errS == nil && errM == nil {
  1418  				eta = time.Unix(secs, micros*1000)
  1419  			}
  1420  		}
  1421  	}
  1422  
  1423  	return ExecutionInfo{
  1424  		ExecutionCount:        int(execCount),
  1425  		TaskID:                magicHeader("TaskName"),
  1426  		taskRetryReason:       magicHeader("TaskRetryReason"),
  1427  		taskPreviousResponse:  magicHeader("TaskPreviousResponse"),
  1428  		submitterTraceContext: h.Get(TraceContextHeader),
  1429  		expectedETA:           eta,
  1430  	}
  1431  }
  1432  
  1433  // httpReply writes and logs HTTP response.
  1434  //
  1435  // `msg` is sent to the caller as is. `err` is logged, but not sent.
  1436  func httpReply(c *router.Context, code int, msg string, err error) {
  1437  	if err != nil && !quietOnError.In(err) {
  1438  		if Ignore.In(err) {
  1439  			logging.Warningf(c.Request.Context(), "server/tq task %s: %s", msg, err)
  1440  		} else {
  1441  			logging.Errorf(c.Request.Context(), "server/tq task %s: %s", msg, err)
  1442  		}
  1443  	}
  1444  	if code == http.StatusNoContent {
  1445  		msg = ""
  1446  	}
  1447  	http.Error(c.Writer, msg, code)
  1448  }
  1449  
  1450  // replyWithErr calls httpReply deriving status code from `err`.
  1451  func replyWithErr(c *router.Context, err error) {
  1452  	switch {
  1453  	case err == nil:
  1454  		httpReply(c, http.StatusOK /* 200 */, "OK", nil)
  1455  	case Fatal.In(err):
  1456  		httpReply(c, http.StatusAccepted /* 202 */, "fatal error", err)
  1457  	case Ignore.In(err):
  1458  		httpReply(c, http.StatusNoContent /* 204 */, "ignored error", err)
  1459  	case transient.Tag.In(err):
  1460  		httpReply(c, http.StatusInternalServerError /* 500 */, "transient error", err)
  1461  	default:
  1462  		status := http.StatusTooManyRequests
  1463  		if code, ok := errors.TagValueIn(httpStatusKey, err); ok {
  1464  			status = code.(int)
  1465  		}
  1466  		httpReply(c, status, "error", err)
  1467  	}
  1468  }