go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/scheduler/appengine/task/task.go (about) 1 // Copyright 2015 The LUCI Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 // Package task defines interface between Scheduler engine and implementations 16 // of particular tasks (such as URL fetch tasks, Swarming tasks, DM tasks, etc). 17 // 18 // Its subpackages contain concrete realizations of Manager interface. 19 package task 20 21 import ( 22 "context" 23 "net/http" 24 "time" 25 26 "github.com/golang/protobuf/proto" 27 "google.golang.org/api/pubsub/v1" 28 "google.golang.org/protobuf/types/known/structpb" 29 30 "go.chromium.org/luci/auth/identity" 31 "go.chromium.org/luci/config/validation" 32 "go.chromium.org/luci/server/auth" 33 34 "go.chromium.org/luci/scheduler/appengine/internal" 35 ) 36 37 // Status is status of a single job invocation. 38 type Status string 39 40 const ( 41 // StatusStarting means the task is about to start. 42 StatusStarting Status = "STARTING" 43 // StatusRetrying means the task was starting, but the launch failed in some 44 // transient way. The start attempt is retried in this case a bunch of times, 45 // until eventually the task moves into either StatusRunning or one of the 46 // final states. The only possible transition into StatusRetrying is from 47 // StatusStarting. A running task can only succeed or fail. 48 StatusRetrying Status = "RETRYING" 49 // StatusRunning means the task has started and is running now. 50 StatusRunning Status = "RUNNING" 51 // StatusSucceeded means the task finished with success. 52 StatusSucceeded Status = "SUCCEEDED" 53 // StatusFailed means the task finished with error or failed to start. 54 StatusFailed Status = "FAILED" 55 // StatusOverrun means the task should have been started, but previous one is 56 // still running. 57 StatusOverrun Status = "OVERRUN" 58 // StatusAborted means the task was forcefully aborted (manually or due to 59 // hard deadline). 60 StatusAborted Status = "ABORTED" 61 ) 62 63 // Initial returns true if Status is Starting or Retrying. 64 // 65 // These statuses indicate an invocation before LaunchTask (perhaps, a retry of 66 // it) is finished with the invocation. 67 func (s Status) Initial() bool { 68 return s == StatusStarting || s == StatusRetrying 69 } 70 71 // Final returns true if Status represents some final status. 72 func (s Status) Final() bool { 73 switch s { 74 case StatusSucceeded, StatusFailed, StatusOverrun, StatusAborted: 75 return true 76 default: 77 return false 78 } 79 } 80 81 // Traits describes properties that influence how the scheduler engine manages 82 // tasks handled by this Manager. 83 type Traits struct { 84 // Multistage is true if Manager uses Starting -> Running -> Finished 85 // state chain for all invocations (instead of just Starting -> Finished). 86 // 87 // This is the case for "heavy" tasks that can run for undetermined amount 88 // of time (e.g. Swarming and Buildbucket tasks). By switching invocation 89 // state to Running, the Manager acknowledges that it takes responsibility for 90 // eventually moving the invocation to Finished state (perhaps in response to 91 // a PubSub notification or a timer tick). In other words, once an invocation 92 // is in Running state, the schedule engine will not automatically keep track 93 // of it's healthiness (it's the responsibility of the Manager now). 94 // 95 // For smaller tasks (that finish in seconds, e.g. gitiles poller tasks) it is 96 // simpler and more efficient just to do everything in LaunchTask and then 97 // move the invocation to Finished state. By doing so, the Manager avoids 98 // implementing healthiness checks, piggybacking on LaunchTask retries 99 // automatically performed by the scheduler engine. 100 // 101 // Currently this trait only influences the UI. Invocations with 102 // Multistage == false don't show up as "Starting" in the UI (they are 103 // displayed as "Running" instead, since it makes more sense from end-user 104 // perspective). 105 Multistage bool 106 } 107 108 // Manager knows how to work with a particular kind of tasks (e.g. URL fetch 109 // tasks, Swarming tasks, etc): how to deserialize, validate and execute them. 110 // 111 // Manager uses Controller to talk back to the scheduler engine. 112 type Manager interface { 113 // Name returns task manager name. It identifies the corresponding kind 114 // of tasks and used in various resource names (e.g. PubSub topic names). 115 Name() string 116 117 // ProtoMessageType returns a pointer to protobuf message struct that 118 // describes config for the task kind, e.g. &UrlFetchTask{}. Will be used 119 // only for its type signature. 120 ProtoMessageType() proto.Message 121 122 // Traits returns properties that influence how the scheduler engine manages 123 // tasks handled by this Manager. 124 // 125 // See Traits struct for more details. 126 Traits() Traits 127 128 // ValidateProtoMessage verifies task definition proto message makes sense. 129 // msg must have same underlying type as ProtoMessageType() return value. 130 // 131 // realmID is a full realm name (as "<project>:<realm>") of the job whose 132 // definition is being validated. It is never empty, but may be a @legacy 133 // realm. 134 // 135 // Errors are returned via validation.Context. 136 ValidateProtoMessage(c *validation.Context, msg proto.Message, realmID string) 137 138 // LaunchTask starts (or starts and finishes in one go) the task. 139 // 140 // Manager's responsibilities: 141 // * To move the task to some state other than StatusStarting 142 // (by changing ctl.State().Status). If at some point the task has moved 143 // to StatusRunning, the manager MUST setup some way to track the task's 144 // progress to eventually move it to some final state. It can be a status 145 // check via a timer (see `AddTimer` below), or a PubSub callback (see 146 // `PrepareTopic` below). 147 // * Be idempotent, if possible, using ctl.InvocationID() as an operation 148 // key. 149 // * Not to use supplied controller outside of LaunchTask call. 150 // * Not to use supplied controller concurrently without synchronization. 151 // 152 // If `LaunchTask` crashes before returning or returns a transient error, it 153 // will be called again later, receiving exact same ctl.InvocationID(). 154 // 155 // TaskManager may optionally use ctl.Save() to checkpoint progress and save 156 // debug log. ctl.Save() is also implicitly called by the engine when 157 // `LaunchTask` returns. 158 LaunchTask(c context.Context, ctl Controller) error 159 160 // AbortTask is called to opportunistically abort launched task. 161 // 162 // It is called right before the job is forcefully switched to a failed state. 163 // The engine does not wait for the task runner to acknowledge this action. 164 // 165 // AbortTask must be idempotent since it may be called multiple times in case 166 // of errors. 167 AbortTask(c context.Context, ctl Controller) error 168 169 // ExamineNotification is called to extract the auth token from the incoming 170 // PubSub message. 171 // 172 // It should return an empty string if the message is unrecognized/malformed 173 // or there's no auth token in it. Note that the PubSub message here is not 174 // yet validated and can be a total garbage (or even be malicious). 175 // 176 // See PrepareTopic for more info. 177 ExamineNotification(c context.Context, msg *pubsub.PubsubMessage) string 178 179 // HandleNotification is called whenever engine receives a PubSub message sent 180 // to a topic created with Controller.PrepareTopic. Expect duplicated and 181 // out-of-order messages here. HandleNotification must be idempotent. 182 // 183 // Returns transient error to trigger a redeliver of the message, no error to 184 // to acknowledge the message and fatal error to move the invocation to failed 185 // state. 186 // 187 // Any modifications made to the invocation state will be saved regardless of 188 // the return value (to save the debug log). 189 HandleNotification(c context.Context, ctl Controller, msg *pubsub.PubsubMessage) error 190 191 // HandleTimer is called to process timers set up by Controller.AddTimer. 192 // 193 // Expect duplicated or delayed events here. HandleTimer must be idempotent. 194 // 195 // Returns a transient error to trigger a redelivery of the event (the 196 // invocation state won't be saved in this case), no error to acknowledge the 197 // event and a fatal error to move the invocation to failed state. 198 HandleTimer(c context.Context, ctl Controller, name string, payload []byte) error 199 200 // GetDebugState returns debug info about the state persisted by the manager. 201 GetDebugState(c context.Context, ctl ControllerReadOnly) (*internal.DebugManagerState, error) 202 } 203 204 // Controller is passed to LaunchTask by the scheduler engine. It gives Manager 205 // control over one job invocation. Manager must not use it outside of 206 // LaunchTask. Controller implementation is generally not thread safe (but it's 207 // fine to use it from multiple goroutines if access is protected by a lock). 208 // 209 // All methods that accept context.Context expect contexts derived from ones 210 // passed to 'Manager' methods. A derived context can be used to set custom 211 // deadlines for some potentially expensive methods like 'PrepareTopic'. 212 type Controller interface { 213 ControllerReadOnly 214 215 // State returns a mutable portion of task invocation state. 216 // 217 // TaskManager can modify it in-place and then call Controller.Save to persist 218 // the changes. The state will also be saved by the engine automatically if 219 // Manager doesn't call Save. 220 State() *State 221 222 // AddTimer sets up a new delayed call to Manager.HandleTimer. 223 // 224 // Timers are active as long as the invocation is not in one of the final 225 // states. There is no way to cancel a timer (ignore HandleTimer call 226 // instead). 227 // 228 // 'title' will be visible in logs, it should convey a purpose for this timer. 229 // It doesn't have to be unique. 230 // 231 // 'payload' is any byte blob carried verbatim to Manager.HandleTimer. 232 // 233 // All timers are actually enabled in Save(), in the same transaction that 234 // updates the job state. 235 AddTimer(c context.Context, delay time.Duration, title string, payload []byte) 236 237 // PrepareTopic create PubSub topic for notifications related to the task and 238 // adds given publisher to its ACL. 239 // 240 // It returns full name of the topic and a token that will be used to 241 // authenticate the PubSub message and bind it to the task the Controller is 242 // operating on now. Topic name and its configuration are controlled by the 243 // Engine. The publisher to the topic must put the token somewhere inside 244 // the message. The engine will ask the task manager to extract the token 245 // from the message via ExamineNotification, then it will validate the token 246 // and eventually call HandleNotification. 247 // 248 // 'publisher' can be a service account email, or an URL to some luci service. 249 // If URL is given, its /auth/api/v1/server/info endpoint will be used to 250 // grab a corresponding service account name. All service that use luci auth 251 // component expose this endpoint. 252 PrepareTopic(c context.Context, publisher string) (topic string, token string, err error) 253 254 // EmitTrigger delivers a given trigger to all jobs which are triggered by 255 // current one. 256 EmitTrigger(ctx context.Context, trigger *internal.Trigger) 257 258 // Save updates the state of the task in the persistent store. 259 // 260 // It also schedules all pending timer ticks added via AddTimer. 261 // 262 // Will be called by the engine after it launches the task. May also be called 263 // by the Manager itself, even multiple times (e.g. once to notify that the 264 // task has started, a second time to notify it has finished). 265 // 266 // Returns error if it couldn't save the invocation state. It is fine to 267 // ignore it. The engine will attempt to Save the invocation at the end anyway 268 // and it will properly handle the error if it happens again. 269 Save(c context.Context) error 270 } 271 272 // ControllerReadOnly is a subset of Controller interface with methods that do 273 // not mutate the job's state. 274 type ControllerReadOnly interface { 275 // JobID returns full job ID the controller is operating on. 276 JobID() string 277 278 // InvocationID returns unique identifier of this particular invocation. 279 InvocationID() int64 280 281 // RealmID returns the full realm ID of the job ("<project>:<realm>"). 282 RealmID() string 283 284 // Request contains parameters of the invocation supplied when it was created. 285 Request() Request 286 287 // Task is proto message with task definition. 288 // 289 // It is guaranteed to have same underlying type as manager.ProtoMessageType() 290 // return value. 291 Task() proto.Message 292 293 // DebugLog appends a line to the free form text log of the task. 294 DebugLog(format string, args ...any) 295 296 // GetClient returns http.Client that is configured to use job's service 297 // account credentials to talk to other services. 298 GetClient(c context.Context, opts ...auth.RPCOption) (*http.Client, error) 299 } 300 301 // State is mutable portion of the task invocation state. 302 // 303 // It can be mutated by TaskManager directly. 304 type State struct { 305 Status Status // overall status of the invocation, see the enum 306 TaskData []byte // storage for TaskManager-specific task data 307 ViewURL string // URL to human readable task page, shows in UI 308 } 309 310 // Request contains parameters of the invocation supplied when it was created. 311 // 312 // They are calculated from the pending triggers when the invocation is 313 // initiated. 314 type Request struct { 315 // TriggeredBy contains ID of an end user that triggered this invocation (e.g 316 // through UI or API) or an empty string if it was triggered by the engine or 317 // it is a result of a multiple different triggers. 318 // 319 // Mostly FYI. 320 TriggeredBy identity.Identity 321 322 // IncomingTriggers is a list of all triggers consumed by this invocation. 323 // 324 // Already sorted by time they were emitted (oldest first). 325 IncomingTriggers []*internal.Trigger 326 327 // Properties are arbitrary key-value pairs derived from the triggers by the 328 // triggering policy function and interpreted by the triggered task manager. 329 Properties *structpb.Struct 330 331 // Tags are arbitrary "<key>:<value>" pairs derived from the triggers by the 332 // triggering policy function. 333 // 334 // Primarily used for indexing and correlation of jobs/invocations with 335 // each other (including across different services). Task managers can pass 336 // them down the stack. 337 Tags []string 338 339 // DebugLog is optional multi-line string to put in the invocation debug log 340 // when it starts. 341 // 342 // It is used to report debug information (produced by the engine triggering 343 // guts) to the invocation debug log (visible via UI). 344 // 345 // This field is used internally by the engine. Task managers will never see 346 // it set. 347 DebugLog string 348 } 349 350 // LastTrigger is the most recent trigger from IncomingTriggers list or nil. 351 func (r *Request) LastTrigger() *internal.Trigger { 352 if t := r.IncomingTriggers; len(t) > 0 { 353 return t[len(t)-1] 354 } 355 return nil 356 } 357 358 // TriggerIDs extracts list of IDs from IncomingTriggers. 359 // 360 // This is useful in tests for asserts. 361 func (r *Request) TriggerIDs() []string { 362 ids := make([]string, len(r.IncomingTriggers)) 363 for i, t := range r.IncomingTriggers { 364 ids[i] = t.Id 365 } 366 return ids 367 } 368 369 // StringProperty returns a value of string property or "" if no such property 370 // or it has a different type. 371 // 372 // This is useful in tests for asserts. 373 func (r *Request) StringProperty(k string) string { 374 if r.Properties == nil { 375 return "" 376 } 377 prop := r.Properties.Fields[k] 378 if prop == nil { 379 return "" 380 } 381 return prop.GetStringValue() 382 }