go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/scheduler/appengine/engine/triage.go (about) 1 // Copyright 2015 The LUCI Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package engine 16 17 import ( 18 "bytes" 19 "context" 20 "fmt" 21 "sync" 22 "time" 23 24 "google.golang.org/protobuf/types/known/timestamppb" 25 26 "go.chromium.org/luci/appengine/tq" 27 "go.chromium.org/luci/common/clock" 28 "go.chromium.org/luci/common/errors" 29 "go.chromium.org/luci/common/logging" 30 "go.chromium.org/luci/common/retry/transient" 31 "go.chromium.org/luci/gae/service/datastore" 32 33 "go.chromium.org/luci/scheduler/appengine/engine/cron" 34 "go.chromium.org/luci/scheduler/appengine/engine/dsset" 35 "go.chromium.org/luci/scheduler/appengine/engine/policy" 36 "go.chromium.org/luci/scheduler/appengine/internal" 37 "go.chromium.org/luci/scheduler/appengine/messages" 38 "go.chromium.org/luci/scheduler/appengine/task" 39 ) 40 41 // TODO(vadimsh): Surface triage status in Monitoring. 42 43 // errTriagePrepareFail is returned by 'prepare' on errors. 44 var errTriagePrepareFail = errors.New("error while fetching sets, see logs", transient.Tag) 45 46 // triageOp is a short lived struct that represents in-flight triage operation. 47 // 48 // A triage is a process of looking at what pending triggers/events a job has 49 // accumulated and deciding what to do with them (e.g. starting new invocations 50 // or waiting). 51 // 52 // There are 3 stages of the triage operation: 53 // - Pre-transaction to gather pending events. 54 // - Transaction to "atomically" consume the events. 55 // - Post-transaction to cleanup garbage, update monitoring, etc. 56 type triageOp struct { 57 // jobID is ID of the job being examined, must be provided externally. 58 jobID string 59 60 // dispatcher routes task queue tasks. 61 dispatcher *tq.Dispatcher 62 63 // policyFactory knows how to instantiate triggering policies. 64 // 65 // Usually it is policy.New, but may be replaced in tests. 66 policyFactory func(*messages.TriggeringPolicy) (policy.Func, error) 67 68 // enqueueInvocations transactionally creates and starts new invocations. 69 // 70 // Implemented by the engine, see engineImpl.enqueueInvocations. 71 enqueueInvocations func(c context.Context, job *Job, req []task.Request) error 72 73 // maxAllowedTriggers limits how many pending triggers are allowed to exist. 74 // 75 // If the pending triggers set has more triggers, oldest ones are forcefully 76 // (without consulting the policy) discarded. 77 // 78 // This mechanism acts as a safeguard against total meltdown caused by large 79 // pending trigger queues. 80 maxAllowedTriggers int 81 82 // The rest of fields are used internally. 83 84 started time.Time // used to derive relative time in the debug log 85 debugLogLock sync.Mutex // the log may be modified from multiple goroutines 86 debugLog bytes.Buffer // appended to by debugInfoLog and debugErrLog 87 debugLogMsgs int // number of messages in the debug log 88 89 finishedSet *invocationIDSet 90 finishedList *dsset.Listing 91 92 triggersSet *triggersSet 93 triggersList *dsset.Listing 94 readyTriggers []*internal.Trigger // same as triggersList, but deserialized and sorted 95 96 garbage dsset.Garbage // collected inside the transaction, cleaned outside 97 txnAttempt int // incremented on each transaction attempt 98 lastTriage time.Time // value of job.LastTriage stored in the transaction 99 } 100 101 // prepare fetches all pending triggers and events from dsset structs. 102 func (op *triageOp) prepare(c context.Context) error { 103 op.started = clock.Now(c).UTC() // used to derive relative time in the debug log 104 op.debugInfoLog(c, "Starting") 105 106 op.finishedSet = recentlyFinishedSet(c, op.jobID) 107 op.triggersSet = pendingTriggersSet(c, op.jobID) 108 109 wg := sync.WaitGroup{} 110 wg.Add(2) 111 112 // Grab all pending triggers to decided what to do with them. 113 var triggersErr error 114 go func() { 115 defer wg.Done() 116 op.triggersList, op.readyTriggers, triggersErr = op.triggersSet.Triggers(c) 117 if triggersErr != nil { 118 op.debugErrLog(c, triggersErr, "Failed to grab pending triggers") 119 } else { 120 sortTriggers(op.readyTriggers) 121 } 122 }() 123 124 // Grab a list of recently finished invocations to remove them from 125 // ActiveInvocations list. 126 var finishedErr error 127 go func() { 128 defer wg.Done() 129 op.finishedList, finishedErr = op.finishedSet.List(c) 130 if finishedErr != nil { 131 op.debugErrLog(c, finishedErr, "Failed to grab recently finished invocations") 132 } 133 }() 134 135 wg.Wait() 136 if triggersErr != nil || finishedErr != nil { 137 return errTriagePrepareFail // the original error is already logged 138 } 139 140 op.debugInfoLog(c, "Pending triggers set: %d items, %d garbage", len(op.triggersList.Items), len(op.triggersList.Garbage)) 141 op.debugInfoLog(c, "Recently finished set: %d items, %d garbage", len(op.finishedList.Items), len(op.finishedList.Garbage)) 142 143 // Remove old tombstones to keep the set tidy. We fail hard here on errors to 144 // make sure progress stops if garbage can't be cleaned up for some reason. 145 // It is better to fail early rather than silently accumulate tons of garbage 146 // until everything grinds to a halt. 147 if err := dsset.CleanupGarbage(c, op.triggersList.Garbage, op.finishedList.Garbage); err != nil { 148 op.debugErrLog(c, err, "Failed to cleanup dsset garbage") 149 return err 150 } 151 152 // Log something mostly to capture the timestamp. 153 op.debugInfoLog(c, "The preparation is finished") 154 return nil 155 } 156 157 // transaction must be called within a job transaction. 158 // 159 // It pops pending triggers/events, producing invocations or other events along 160 // the way. 161 // 162 // This method may be called multiple times (when the transaction is retried). 163 func (op *triageOp) transaction(c context.Context, job *Job) error { 164 op.txnAttempt++ 165 if op.txnAttempt == 1 { 166 op.debugInfoLog(c, "Started the transaction") 167 } else { 168 op.debugInfoLog(c, "Retrying the transaction, attempt %d", op.txnAttempt) 169 } 170 171 // Reset state collected in the transaction in case this is a retry. 172 op.garbage = nil 173 op.lastTriage = time.Time{} 174 175 // Tidy ActiveInvocations list by moving all recently finished invocations to 176 // FinishedInvocations list. 177 tidyOp, err := op.tidyActiveInvocations(c, job) 178 if err != nil { 179 return err // the error is logged inside already 180 } 181 182 // Process pending triggers set by emitting new invocations. Note that this 183 // modifies ActiveInvocations list when emitting invocations. 184 triggersOp, err := op.processTriggers(c, job) 185 if err != nil { 186 return err // the error is logged inside already 187 } 188 189 // If nothing is running anymore, make sure the cron is ticking again. This is 190 // useful for schedules like "with 10min interval" that initiate an invocation 191 // after some time after the previous one finishes. This call submits at most 192 // one task to TQ. Note that there's no harm in calling this multiple times, 193 // only the first call will actually do something. 194 if len(job.ActiveInvocations) == 0 { 195 op.debugInfoLog(c, "Poking the cron") 196 err := pokeCron(c, job, op.dispatcher, func(m *cron.Machine) error { 197 m.RewindIfNecessary() 198 return nil 199 }) 200 if err != nil { 201 op.debugErrLog(c, err, "Failed to rewind the cron machine") 202 return err 203 } 204 } 205 206 // Submit set modifications. This may produce more garbage that we need to 207 // cleanup later (outside the transaction). 208 popOps := []*dsset.PopOp{} 209 if tidyOp != nil { 210 popOps = append(popOps, tidyOp) 211 } 212 if triggersOp != nil { 213 popOps = append(popOps, triggersOp) 214 } 215 op.debugInfoLog(c, "Removing consumed dsset items") 216 if op.garbage, err = dsset.FinishPop(c, popOps...); err != nil { 217 op.debugErrLog(c, err, "Failed to pop consumed dsset items") 218 return err 219 } 220 221 op.lastTriage = clock.Now(c).UTC() 222 job.LastTriage = op.lastTriage 223 224 op.debugInfoLog(c, "Landing the transaction") 225 return nil 226 } 227 228 // finalize is called after the transaction (successfully submitted or not) to 229 // delete any produced garbage, submit the triage log, update monitoring 230 // counters, etc. 231 // 232 // 'success' is true if the transaction successfully landed. 233 // 234 // It is best effort. We can't do anything meaningful if it fails: the 235 // transaction has already landed, there's no way to unland it. 236 func (op *triageOp) finalize(c context.Context, success bool) { 237 // Cleanup the garbage accumulated in the transaction only if the transaction 238 // really landed. If it didn't this it not really a garbage! 239 if success && len(op.garbage) != 0 { 240 op.debugInfoLog(c, "Cleaning up storage of %d dsset items", len(op.garbage)) 241 if err := dsset.CleanupGarbage(c, op.garbage); err != nil { 242 op.debugErrLog(c, err, "Best effort cleanup failed") 243 } 244 } 245 op.debugInfoLog(c, "Done") 246 247 // Save the log. There's nothing we can do if this fails, but we try to detect 248 // and warn on the log staleness (see GetJobTriageLog implementation). 249 if op.lastTriage.IsZero() { 250 op.lastTriage = clock.Now(c).UTC() 251 } 252 log := JobTriageLog{ 253 JobID: op.jobID, 254 LastTriage: op.lastTriage, 255 DebugLog: op.debugLog.String(), 256 } 257 if err := datastore.Put(c, &log); err != nil { 258 logging.WithError(err).Errorf(c, "Failed to store the triage debug log") 259 } 260 } 261 262 //////////////////////////////////////////////////////////////////////////////// 263 264 // debugInfoLog adds a line to the triage debug log and Infof log. 265 func (op *triageOp) debugInfoLog(c context.Context, format string, args ...any) { 266 logging.Infof(c, format, args...) 267 op.appendToLog(c, fmt.Sprintf(format, args...)) 268 } 269 270 // debugErrLog adds a line to the triage debug log and Errorf log. 271 func (op *triageOp) debugErrLog(c context.Context, err error, format string, args ...any) { 272 if err == nil { 273 logging.Errorf(c, format, args...) 274 op.appendToLog(c, fmt.Sprintf("Error: "+format, args...)) 275 } else { 276 logging.WithError(err).Errorf(c, format, args...) 277 op.appendToLog(c, fmt.Sprintf("Error: "+format+" - %s", append(args, err)...)) 278 } 279 } 280 281 // appendToLog adds a line to the debug log, prefixing it with current time. 282 func (op *triageOp) appendToLog(c context.Context, msg string) { 283 const maxMessageCount = 1000 284 285 ms := clock.Now(c).Sub(op.started).Nanoseconds() / 1e6 286 287 op.debugLogLock.Lock() 288 if op.debugLogMsgs <= maxMessageCount { 289 if op.debugLogMsgs < maxMessageCount { 290 fmt.Fprintf(&op.debugLog, "[%03d ms] %s\n", ms, msg) 291 } else { 292 fmt.Fprintf(&op.debugLog, "<the log is too long, truncated here>") 293 } 294 op.debugLogMsgs++ 295 } 296 op.debugLogLock.Unlock() 297 } 298 299 // tidyActiveInvocations removes finished invocations from ActiveInvocations, 300 // and adds them to FinishedInvocations. 301 // 302 // Called within a transaction. Mutates job. Returns an open dsset.PopOp that 303 // must be eventually submitted with dsset.FinishPop. 304 func (op *triageOp) tidyActiveInvocations(c context.Context, job *Job) (*dsset.PopOp, error) { 305 now := clock.Now(c).UTC() 306 307 // Deserialize the list of recently finished invocations, as stored in the 308 // entity. Discard old items right away. If it is broken, log, but proceed. 309 // It is ~OK to loose it (this will temporary cause some API calls to return 310 // incomplete data). 311 finishedRecord, err := filteredFinishedInvs( 312 job.FinishedInvocationsRaw, now.Add(-FinishedInvocationsHorizon)) 313 if err != nil { 314 op.debugErrLog(c, err, "Failed to unmarshal FinishedInvocationsRaw, skipping") 315 } 316 317 // Note that per dsset API we need to do BeginPop if there's some garbage, 318 // even if Items is empty. We can skip this if both lists are empty though. 319 var popOp *dsset.PopOp 320 if len(op.finishedList.Items) != 0 || len(op.finishedList.Garbage) != 0 { 321 popOp, err = op.finishedSet.BeginPop(c, op.finishedList) 322 if err != nil { 323 op.debugErrLog(c, err, "Failed to pop from finished invocations set") 324 return nil, err 325 } 326 327 // Items can have IDs popped by some other transaction already. Collect 328 // only ones consumed by us here. 329 reallyFinished := make(map[int64]struct{}, len(op.finishedList.Items)) 330 for _, itm := range op.finishedList.Items { 331 if popOp.Pop(itm.ID) { 332 reallyFinished[op.finishedSet.ItemToInvID(&itm)] = struct{}{} 333 } 334 } 335 336 // Remove IDs of all finished invocations from ActiveInvocations list, 337 // preserving the order, and add them to the finished invocations list. 338 if len(reallyFinished) != 0 { 339 filtered := make([]int64, 0, len(job.ActiveInvocations)) 340 for _, id := range job.ActiveInvocations { 341 if _, yep := reallyFinished[id]; !yep { 342 filtered = append(filtered, id) // still running 343 continue 344 } 345 op.debugInfoLog(c, "Invocation %d is acknowledged as finished", id) 346 finishedRecord = append(finishedRecord, &internal.FinishedInvocation{ 347 InvocationId: id, 348 Finished: timestamppb.New(now), 349 }) 350 } 351 job.ActiveInvocations = filtered 352 } 353 } 354 355 // Marshal back FinishedInvocationsRaw after it has been updated. 356 job.FinishedInvocationsRaw = marshalFinishedInvs(finishedRecord) 357 358 // Log summary. 359 op.debugInfoLog(c, "Number of active invocations: %d", len(job.ActiveInvocations)) 360 op.debugInfoLog(c, "Number of recently finished: %d", len(finishedRecord)) 361 362 return popOp, nil 363 } 364 365 // processTriggers pops pending triggers, converting them into invocations. 366 func (op *triageOp) processTriggers(c context.Context, job *Job) (*dsset.PopOp, error) { 367 popOp, err := op.triggersSet.BeginPop(c, op.triggersList) 368 if err != nil { 369 op.debugErrLog(c, err, "Failed to pop from pending triggers set") 370 return nil, err 371 } 372 373 // Filter out all triggers already popped by some other transaction. 374 triggers := make([]*internal.Trigger, 0, len(op.readyTriggers)) 375 for _, t := range op.readyTriggers { 376 if popOp.CanPop(t.Id) { 377 triggers = append(triggers, t) 378 } 379 } 380 op.debugInfoLog(c, "Triggers available in this txn: %d", len(triggers)) 381 382 // If the job is paused or disabled, just pop all triggers without starting 383 // anything. Note: there's a best-effort shortcut for ignoring triggers 384 // for paused jobs in execEnqueueTriggersTask. 385 if len(triggers) != 0 { 386 if job.Paused || !job.Enabled { 387 op.debugInfoLog(c, "The job is inactive, clearing the pending triggers queue") 388 for _, t := range triggers { 389 popOp.Pop(t.Id) 390 } 391 return popOp, nil 392 } 393 } 394 395 // Keep the set of pending triggers bounded, to avoid total meltdown caused 396 // by slow triage transactions. 397 if len(triggers) > op.maxAllowedTriggers { 398 dropping := len(triggers) - op.maxAllowedTriggers 399 op.debugErrLog(c, nil, "Too many pending triggers (>%d), dropping %d oldest", op.maxAllowedTriggers, dropping) 400 for i := 0; i < dropping; i++ { 401 popOp.Pop(triggers[i].Id) 402 } 403 triggers = triggers[dropping:] 404 } 405 406 // Ask the policy to convert triggers into invocations. Note that triggers is 407 // not the only input to the triggering policy, so we call it on each triage, 408 // even if there's no pending triggers. 409 op.debugInfoLog(c, "Invoking the triggering policy function") 410 out := op.triggeringPolicy(c, job, triggers) 411 op.debugInfoLog(c, "The policy requested %d new invocations", len(out.Requests)) 412 413 // Actually pop all consumed triggers and start the corresponding invocations. 414 // Note that this modifies job.ActiveInvocations list. 415 if len(out.Requests) != 0 { 416 consumed := 0 417 for _, r := range out.Requests { 418 for _, t := range r.IncomingTriggers { 419 popOp.Pop(t.Id) 420 } 421 consumed += len(r.IncomingTriggers) 422 } 423 if err := op.enqueueInvocations(c, job, out.Requests); err != nil { 424 op.debugErrLog(c, err, "Failed to enqueue some invocations") 425 return nil, err 426 } 427 op.debugInfoLog(c, "New invocations enqueued, consumed %d triggers", consumed) 428 } 429 430 // Discard triggers the policy decided are no longer needed. 431 if len(out.Discard) != 0 { 432 for _, t := range out.Discard { 433 popOp.Pop(t.Id) 434 } 435 op.debugInfoLog(c, "%d triggers discarded, according to policy", len(out.Discard)) 436 } 437 438 return popOp, nil 439 } 440 441 // triggeringPolicy decides how to convert a set of pending triggers into 442 // a bunch of new invocations. 443 // 444 // Called within a job transaction. Must not do any expensive calls. 445 func (op *triageOp) triggeringPolicy(c context.Context, job *Job, triggers []*internal.Trigger) policy.Out { 446 var policyFunc policy.Func 447 p, err := policy.UnmarshalDefinition(job.TriggeringPolicyRaw) 448 if err == nil { 449 policyFunc, err = op.policyFactory(p) 450 } 451 if err != nil { 452 op.debugErrLog(c, err, "Failed to instantiate the triggering policy function, using the default policy instead") 453 policyFunc = policy.Default() 454 } 455 return policyFunc(policyFuncEnv{c, op}, policy.In{ 456 Now: clock.Now(c).UTC(), 457 ActiveInvocations: job.ActiveInvocations, 458 Triggers: triggers, 459 }) 460 } 461 462 // policyFuncEnv implements policy.Environment through triageOp. 463 type policyFuncEnv struct { 464 ctx context.Context 465 op *triageOp 466 } 467 468 func (e policyFuncEnv) DebugLog(format string, args ...any) { 469 e.op.debugInfoLog(e.ctx, format, args...) 470 }