go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/scheduler/appengine/task/task.go (about)

     1  // Copyright 2015 The LUCI Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //      http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
// Package task defines the interface between the Scheduler engine and
// implementations of particular kinds of tasks (such as URL fetch tasks,
// Swarming tasks, DM tasks, etc).
//
// Its subpackages contain concrete implementations of the Manager interface.
    19  package task
    20  
    21  import (
    22  	"context"
    23  	"net/http"
    24  	"time"
    25  
    26  	"github.com/golang/protobuf/proto"
    27  	"google.golang.org/api/pubsub/v1"
    28  	"google.golang.org/protobuf/types/known/structpb"
    29  
    30  	"go.chromium.org/luci/auth/identity"
    31  	"go.chromium.org/luci/config/validation"
    32  	"go.chromium.org/luci/server/auth"
    33  
    34  	"go.chromium.org/luci/scheduler/appengine/internal"
    35  )
    36  
    37  // Status is status of a single job invocation.
    38  type Status string
    39  
    40  const (
    41  	// StatusStarting means the task is about to start.
    42  	StatusStarting Status = "STARTING"
    43  	// StatusRetrying means the task was starting, but the launch failed in some
    44  	// transient way. The start attempt is retried in this case a bunch of times,
    45  	// until eventually the task moves into either StatusRunning or one of the
    46  	// final states. The only possible transition into StatusRetrying is from
    47  	// StatusStarting. A running task can only succeed or fail.
    48  	StatusRetrying Status = "RETRYING"
    49  	// StatusRunning means the task has started and is running now.
    50  	StatusRunning Status = "RUNNING"
    51  	// StatusSucceeded means the task finished with success.
    52  	StatusSucceeded Status = "SUCCEEDED"
    53  	// StatusFailed means the task finished with error or failed to start.
    54  	StatusFailed Status = "FAILED"
    55  	// StatusOverrun means the task should have been started, but previous one is
    56  	// still running.
    57  	StatusOverrun Status = "OVERRUN"
    58  	// StatusAborted means the task was forcefully aborted (manually or due to
    59  	// hard deadline).
    60  	StatusAborted Status = "ABORTED"
    61  )
    62  
    63  // Initial returns true if Status is Starting or Retrying.
    64  //
    65  // These statuses indicate an invocation before LaunchTask (perhaps, a retry of
    66  // it) is finished with the invocation.
    67  func (s Status) Initial() bool {
    68  	return s == StatusStarting || s == StatusRetrying
    69  }
    70  
    71  // Final returns true if Status represents some final status.
    72  func (s Status) Final() bool {
    73  	switch s {
    74  	case StatusSucceeded, StatusFailed, StatusOverrun, StatusAborted:
    75  		return true
    76  	default:
    77  		return false
    78  	}
    79  }
    80  
// Traits describes properties that influence how the scheduler engine manages
// tasks handled by a Manager.
type Traits struct {
	// Multistage is true if the Manager uses the Starting -> Running -> Finished
	// state chain for all invocations (instead of just Starting -> Finished).
	//
	// This is the case for "heavy" tasks that can run for an undetermined amount
	// of time (e.g. Swarming and Buildbucket tasks). By switching the invocation
	// state to Running, the Manager acknowledges that it takes responsibility for
	// eventually moving the invocation to the Finished state (perhaps in response
	// to a PubSub notification or a timer tick). In other words, once an
	// invocation is in the Running state, the scheduler engine will not
	// automatically keep track of its health (that is the responsibility of the
	// Manager now).
	//
	// For smaller tasks (that finish in seconds, e.g. gitiles poller tasks) it is
	// simpler and more efficient to do everything in LaunchTask and then move the
	// invocation to the Finished state. By doing so, the Manager avoids
	// implementing health checks, piggybacking on the LaunchTask retries that are
	// automatically performed by the scheduler engine.
	//
	// Currently this trait only influences the UI. Invocations with
	// Multistage == false don't show up as "Starting" in the UI (they are
	// displayed as "Running" instead, since that makes more sense from the
	// end-user perspective).
	Multistage bool
}
   107  
// Manager knows how to work with a particular kind of tasks (e.g. URL fetch
// tasks, Swarming tasks, etc): how to deserialize, validate and execute them.
//
// Manager uses Controller to talk back to the scheduler engine.
type Manager interface {
	// Name returns the task manager name. It identifies the corresponding kind
	// of tasks and is used in various resource names (e.g. PubSub topic names).
	Name() string

	// ProtoMessageType returns a pointer to the protobuf message struct that
	// describes the config for the task kind, e.g. &UrlFetchTask{}. It is used
	// only for its type signature.
	ProtoMessageType() proto.Message

	// Traits returns properties that influence how the scheduler engine manages
	// tasks handled by this Manager.
	//
	// See the Traits struct for more details.
	Traits() Traits

	// ValidateProtoMessage verifies that the task definition proto message makes
	// sense. msg must have the same underlying type as the ProtoMessageType()
	// return value.
	//
	// realmID is the full realm name (as "<project>:<realm>") of the job whose
	// definition is being validated. It is never empty, but may be a @legacy
	// realm.
	//
	// Errors are returned via validation.Context.
	ValidateProtoMessage(c *validation.Context, msg proto.Message, realmID string)

	// LaunchTask starts (or starts and finishes in one go) the task.
	//
	// Manager's responsibilities:
	//  * To move the task to some state other than StatusStarting
	//    (by changing ctl.State().Status). If at some point the task has moved
	//    to StatusRunning, the manager MUST set up some way to track the task's
	//    progress to eventually move it to some final state. It can be a status
	//    check via a timer (see `AddTimer` below), or a PubSub callback (see
	//    `PrepareTopic` below).
	//  * To be idempotent, if possible, using ctl.InvocationID() as an operation
	//    key.
	//  * Not to use the supplied controller outside of the LaunchTask call.
	//  * Not to use the supplied controller concurrently without synchronization.
	//
	// If `LaunchTask` crashes before returning or returns a transient error, it
	// will be called again later, receiving the exact same ctl.InvocationID().
	//
	// The Manager may optionally use ctl.Save() to checkpoint progress and save
	// the debug log. ctl.Save() is also implicitly called by the engine when
	// `LaunchTask` returns.
	LaunchTask(c context.Context, ctl Controller) error

	// AbortTask is called to opportunistically abort a launched task.
	//
	// It is called right before the job is forcefully switched to a failed state.
	// The engine does not wait for the task runner to acknowledge this action.
	//
	// AbortTask must be idempotent since it may be called multiple times in case
	// of errors.
	AbortTask(c context.Context, ctl Controller) error

	// ExamineNotification is called to extract the auth token from an incoming
	// PubSub message.
	//
	// It should return an empty string if the message is unrecognized/malformed
	// or there's no auth token in it. Note that the PubSub message here is not
	// yet validated and can be total garbage (or even be malicious).
	//
	// See PrepareTopic for more info.
	ExamineNotification(c context.Context, msg *pubsub.PubsubMessage) string

	// HandleNotification is called whenever the engine receives a PubSub message
	// sent to a topic created with Controller.PrepareTopic. Expect duplicated and
	// out-of-order messages here. HandleNotification must be idempotent.
	//
	// Returns a transient error to trigger a redelivery of the message, no error
	// to acknowledge the message and a fatal error to move the invocation to the
	// failed state.
	//
	// Any modifications made to the invocation state will be saved regardless of
	// the return value (to save the debug log).
	HandleNotification(c context.Context, ctl Controller, msg *pubsub.PubsubMessage) error

	// HandleTimer is called to process timers set up by Controller.AddTimer.
	//
	// Expect duplicated or delayed events here. HandleTimer must be idempotent.
	//
	// Returns a transient error to trigger a redelivery of the event (the
	// invocation state won't be saved in this case), no error to acknowledge the
	// event and a fatal error to move the invocation to the failed state.
	//
	// NOTE(review): 'name' is presumably the 'title' that was passed to
	// Controller.AddTimer, and 'payload' the blob passed alongside it - confirm
	// against the engine implementation.
	HandleTimer(c context.Context, ctl Controller, name string, payload []byte) error

	// GetDebugState returns debug info about the state persisted by the manager.
	GetDebugState(c context.Context, ctl ControllerReadOnly) (*internal.DebugManagerState, error)
}
   203  
// Controller is passed to LaunchTask by the scheduler engine. It gives the
// Manager control over one job invocation. The Manager must not use it outside
// of LaunchTask. A Controller implementation is generally not thread safe (but
// it's fine to use it from multiple goroutines if access is protected by a
// lock).
//
// A Controller is also what the Manager's AbortTask, HandleNotification and
// HandleTimer methods receive.
// NOTE(review): presumably the "do not use outside of the call" restriction
// applies to those methods as well - confirm.
//
// All methods that accept context.Context expect contexts derived from the ones
// passed to 'Manager' methods. A derived context can be used to set custom
// deadlines for some potentially expensive methods like 'PrepareTopic'.
type Controller interface {
	ControllerReadOnly

	// State returns the mutable portion of the task invocation state.
	//
	// The Manager can modify it in-place and then call Controller.Save to persist
	// the changes. The state will also be saved by the engine automatically if
	// the Manager doesn't call Save.
	State() *State

	// AddTimer sets up a new delayed call to Manager.HandleTimer.
	//
	// Timers are active as long as the invocation is not in one of the final
	// states. There is no way to cancel a timer (ignore the HandleTimer call
	// instead).
	//
	// 'title' will be visible in logs, it should convey the purpose of this
	// timer. It doesn't have to be unique.
	//
	// 'payload' is any byte blob carried verbatim to Manager.HandleTimer.
	//
	// All timers are actually enabled in Save(), in the same transaction that
	// updates the job state.
	AddTimer(c context.Context, delay time.Duration, title string, payload []byte)

	// PrepareTopic creates a PubSub topic for notifications related to the task
	// and adds the given publisher to its ACL.
	//
	// It returns the full name of the topic and a token that will be used to
	// authenticate the PubSub message and bind it to the task the Controller is
	// operating on now. The topic name and its configuration are controlled by
	// the engine. The publisher to the topic must put the token somewhere inside
	// the message. The engine will ask the task manager to extract the token
	// from the message via ExamineNotification, then it will validate the token
	// and eventually call HandleNotification.
	//
	// 'publisher' can be a service account email, or a URL to some luci service.
	// If a URL is given, its /auth/api/v1/server/info endpoint will be used to
	// grab the corresponding service account name. All services that use the
	// luci auth component expose this endpoint.
	PrepareTopic(c context.Context, publisher string) (topic string, token string, err error)

	// EmitTrigger delivers the given trigger to all jobs which are triggered by
	// the current one.
	EmitTrigger(ctx context.Context, trigger *internal.Trigger)

	// Save updates the state of the task in the persistent store.
	//
	// It also schedules all pending timer ticks added via AddTimer.
	//
	// Will be called by the engine after it launches the task. May also be called
	// by the Manager itself, even multiple times (e.g. once to notify that the
	// task has started, a second time to notify it has finished).
	//
	// Returns an error if it couldn't save the invocation state. It is fine to
	// ignore it. The engine will attempt to Save the invocation at the end anyway
	// and it will properly handle the error if it happens again.
	Save(c context.Context) error
}
   271  
// ControllerReadOnly is a subset of the Controller interface with methods that
// do not mutate the job's state.
type ControllerReadOnly interface {
	// JobID returns the full ID of the job the controller is operating on.
	JobID() string

	// InvocationID returns the unique identifier of this particular invocation.
	InvocationID() int64

	// RealmID returns the full realm ID of the job ("<project>:<realm>").
	RealmID() string

	// Request returns the parameters of the invocation supplied when it was
	// created.
	Request() Request

	// Task returns the proto message with the task definition.
	//
	// It is guaranteed to have the same underlying type as the
	// manager.ProtoMessageType() return value.
	Task() proto.Message

	// DebugLog appends a line to the free form text log of the task.
	//
	// 'format' and 'args' follow the usual Printf-style convention.
	// NOTE(review): assumed from the signature - confirm against the
	// implementation.
	DebugLog(format string, args ...any)

	// GetClient returns an http.Client that is configured to use the job's
	// service account credentials to talk to other services.
	GetClient(c context.Context, opts ...auth.RPCOption) (*http.Client, error)
}
   300  
// State is the mutable portion of the task invocation state.
//
// It can be mutated by the task Manager directly (it is what
// Controller.State() hands out as a pointer).
type State struct {
	Status   Status // overall status of the invocation, one of the Status* constants
	TaskData []byte // storage for Manager-specific task data
	ViewURL  string // URL of a human readable task page, shown in the UI
}
   309  
// Request contains parameters of the invocation supplied when it was created.
//
// They are calculated from the pending triggers when the invocation is
// initiated.
type Request struct {
	// TriggeredBy contains the ID of an end user that triggered this invocation
	// (e.g. through the UI or API) or an empty string if it was triggered by the
	// engine or it is the result of multiple different triggers.
	//
	// Mostly FYI.
	TriggeredBy identity.Identity

	// IncomingTriggers is a list of all triggers consumed by this invocation.
	//
	// Already sorted by the time they were emitted (oldest first).
	IncomingTriggers []*internal.Trigger

	// Properties are arbitrary key-value pairs derived from the triggers by the
	// triggering policy function and interpreted by the triggered task manager.
	Properties *structpb.Struct

	// Tags are arbitrary "<key>:<value>" pairs derived from the triggers by the
	// triggering policy function.
	//
	// Primarily used for indexing and correlation of jobs/invocations with
	// each other (including across different services). Task managers can pass
	// them down the stack.
	Tags []string

	// DebugLog is an optional multi-line string to put in the invocation debug
	// log when it starts.
	//
	// It is used to report debug information (produced by the engine triggering
	// guts) to the invocation debug log (visible via the UI).
	//
	// This field is used internally by the engine. Task managers will never see
	// it set.
	DebugLog string
}
   349  
   350  // LastTrigger is the most recent trigger from IncomingTriggers list or nil.
   351  func (r *Request) LastTrigger() *internal.Trigger {
   352  	if t := r.IncomingTriggers; len(t) > 0 {
   353  		return t[len(t)-1]
   354  	}
   355  	return nil
   356  }
   357  
   358  // TriggerIDs extracts list of IDs from IncomingTriggers.
   359  //
   360  // This is useful in tests for asserts.
   361  func (r *Request) TriggerIDs() []string {
   362  	ids := make([]string, len(r.IncomingTriggers))
   363  	for i, t := range r.IncomingTriggers {
   364  		ids[i] = t.Id
   365  	}
   366  	return ids
   367  }
   368  
   369  // StringProperty returns a value of string property or "" if no such property
   370  // or it has a different type.
   371  //
   372  // This is useful in tests for asserts.
   373  func (r *Request) StringProperty(k string) string {
   374  	if r.Properties == nil {
   375  		return ""
   376  	}
   377  	prop := r.Properties.Fields[k]
   378  	if prop == nil {
   379  		return ""
   380  	}
   381  	return prop.GetStringValue()
   382  }