go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/scheduler/appengine/engine/triage.go

go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/scheduler/appengine/engine/triage.go (about)

     1  // Copyright 2015 The LUCI Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //      http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package engine
    16  
    17  import (
    18  	"bytes"
    19  	"context"
    20  	"fmt"
    21  	"sync"
    22  	"time"
    23  
    24  	"google.golang.org/protobuf/types/known/timestamppb"
    25  
    26  	"go.chromium.org/luci/appengine/tq"
    27  	"go.chromium.org/luci/common/clock"
    28  	"go.chromium.org/luci/common/errors"
    29  	"go.chromium.org/luci/common/logging"
    30  	"go.chromium.org/luci/common/retry/transient"
    31  	"go.chromium.org/luci/gae/service/datastore"
    32  
    33  	"go.chromium.org/luci/scheduler/appengine/engine/cron"
    34  	"go.chromium.org/luci/scheduler/appengine/engine/dsset"
    35  	"go.chromium.org/luci/scheduler/appengine/engine/policy"
    36  	"go.chromium.org/luci/scheduler/appengine/internal"
    37  	"go.chromium.org/luci/scheduler/appengine/messages"
    38  	"go.chromium.org/luci/scheduler/appengine/task"
    39  )
    40  
    41  // TODO(vadimsh): Surface triage status in Monitoring.
    42  
    43  // errTriagePrepareFail is returned by 'prepare' on errors.
    44  var errTriagePrepareFail = errors.New("error while fetching sets, see logs", transient.Tag)
    45  
    46  // triageOp is a short lived struct that represents in-flight triage operation.
    47  //
    48  // A triage is a process of looking at what pending triggers/events a job has
    49  // accumulated and deciding what to do with them (e.g. starting new invocations
    50  // or waiting).
    51  //
    52  // There are 3 stages of the triage operation:
    53  //   - Pre-transaction to gather pending events.
    54  //   - Transaction to "atomically" consume the events.
    55  //   - Post-transaction to cleanup garbage, update monitoring, etc.
    56  type triageOp struct {
    57  	// jobID is ID of the job being examined, must be provided externally.
    58  	jobID string
    59  
    60  	// dispatcher routes task queue tasks.
    61  	dispatcher *tq.Dispatcher
    62  
    63  	// policyFactory knows how to instantiate triggering policies.
    64  	//
    65  	// Usually it is policy.New, but may be replaced in tests.
    66  	policyFactory func(*messages.TriggeringPolicy) (policy.Func, error)
    67  
    68  	// enqueueInvocations transactionally creates and starts new invocations.
    69  	//
    70  	// Implemented by the engine, see engineImpl.enqueueInvocations.
    71  	enqueueInvocations func(c context.Context, job *Job, req []task.Request) error
    72  
    73  	// maxAllowedTriggers limits how many pending triggers are allowed to exist.
    74  	//
    75  	// If the pending triggers set has more triggers, oldest ones are forcefully
    76  	// (without consulting the policy) discarded.
    77  	//
    78  	// This mechanism acts as a safeguard against total meltdown caused by large
    79  	// pending trigger queues.
    80  	maxAllowedTriggers int
    81  
    82  	// The rest of fields are used internally.
    83  
    84  	started      time.Time    // used to derive relative time in the debug log
    85  	debugLogLock sync.Mutex   // the log may be modified from multiple goroutines
    86  	debugLog     bytes.Buffer // appended to by debugInfoLog and debugErrLog
    87  	debugLogMsgs int          // number of messages in the debug log
    88  
    89  	finishedSet  *invocationIDSet
    90  	finishedList *dsset.Listing
    91  
    92  	triggersSet   *triggersSet
    93  	triggersList  *dsset.Listing
    94  	readyTriggers []*internal.Trigger // same as triggersList, but deserialized and sorted
    95  
    96  	garbage    dsset.Garbage // collected inside the transaction, cleaned outside
    97  	txnAttempt int           // incremented on each transaction attempt
    98  	lastTriage time.Time     // value of job.LastTriage stored in the transaction
    99  }
   100  
   101  // prepare fetches all pending triggers and events from dsset structs.
   102  func (op *triageOp) prepare(c context.Context) error {
   103  	op.started = clock.Now(c).UTC() // used to derive relative time in the debug log
   104  	op.debugInfoLog(c, "Starting")
   105  
   106  	op.finishedSet = recentlyFinishedSet(c, op.jobID)
   107  	op.triggersSet = pendingTriggersSet(c, op.jobID)
   108  
   109  	wg := sync.WaitGroup{}
   110  	wg.Add(2)
   111  
   112  	// Grab all pending triggers to decided what to do with them.
   113  	var triggersErr error
   114  	go func() {
   115  		defer wg.Done()
   116  		op.triggersList, op.readyTriggers, triggersErr = op.triggersSet.Triggers(c)
   117  		if triggersErr != nil {
   118  			op.debugErrLog(c, triggersErr, "Failed to grab pending triggers")
   119  		} else {
   120  			sortTriggers(op.readyTriggers)
   121  		}
   122  	}()
   123  
   124  	// Grab a list of recently finished invocations to remove them from
   125  	// ActiveInvocations list.
   126  	var finishedErr error
   127  	go func() {
   128  		defer wg.Done()
   129  		op.finishedList, finishedErr = op.finishedSet.List(c)
   130  		if finishedErr != nil {
   131  			op.debugErrLog(c, finishedErr, "Failed to grab recently finished invocations")
   132  		}
   133  	}()
   134  
   135  	wg.Wait()
   136  	if triggersErr != nil || finishedErr != nil {
   137  		return errTriagePrepareFail // the original error is already logged
   138  	}
   139  
   140  	op.debugInfoLog(c, "Pending triggers set:  %d items, %d garbage", len(op.triggersList.Items), len(op.triggersList.Garbage))
   141  	op.debugInfoLog(c, "Recently finished set: %d items, %d garbage", len(op.finishedList.Items), len(op.finishedList.Garbage))
   142  
   143  	// Remove old tombstones to keep the set tidy. We fail hard here on errors to
   144  	// make sure progress stops if garbage can't be cleaned up for some reason.
   145  	// It is better to fail early rather than silently accumulate tons of garbage
   146  	// until everything grinds to a halt.
   147  	if err := dsset.CleanupGarbage(c, op.triggersList.Garbage, op.finishedList.Garbage); err != nil {
   148  		op.debugErrLog(c, err, "Failed to cleanup dsset garbage")
   149  		return err
   150  	}
   151  
   152  	// Log something mostly to capture the timestamp.
   153  	op.debugInfoLog(c, "The preparation is finished")
   154  	return nil
   155  }
   156  
   157  // transaction must be called within a job transaction.
   158  //
   159  // It pops pending triggers/events, producing invocations or other events along
   160  // the way.
   161  //
   162  // This method may be called multiple times (when the transaction is retried).
   163  func (op *triageOp) transaction(c context.Context, job *Job) error {
   164  	op.txnAttempt++
   165  	if op.txnAttempt == 1 {
   166  		op.debugInfoLog(c, "Started the transaction")
   167  	} else {
   168  		op.debugInfoLog(c, "Retrying the transaction, attempt %d", op.txnAttempt)
   169  	}
   170  
   171  	// Reset state collected in the transaction in case this is a retry.
   172  	op.garbage = nil
   173  	op.lastTriage = time.Time{}
   174  
   175  	// Tidy ActiveInvocations list by moving all recently finished invocations to
   176  	// FinishedInvocations list.
   177  	tidyOp, err := op.tidyActiveInvocations(c, job)
   178  	if err != nil {
   179  		return err // the error is logged inside already
   180  	}
   181  
   182  	// Process pending triggers set by emitting new invocations. Note that this
   183  	// modifies ActiveInvocations list when emitting invocations.
   184  	triggersOp, err := op.processTriggers(c, job)
   185  	if err != nil {
   186  		return err // the error is logged inside already
   187  	}
   188  
   189  	// If nothing is running anymore, make sure the cron is ticking again. This is
   190  	// useful for schedules like "with 10min interval" that initiate an invocation
   191  	// after some time after the previous one finishes. This call submits at most
   192  	// one task to TQ. Note that there's no harm in calling this multiple times,
   193  	// only the first call will actually do something.
   194  	if len(job.ActiveInvocations) == 0 {
   195  		op.debugInfoLog(c, "Poking the cron")
   196  		err := pokeCron(c, job, op.dispatcher, func(m *cron.Machine) error {
   197  			m.RewindIfNecessary()
   198  			return nil
   199  		})
   200  		if err != nil {
   201  			op.debugErrLog(c, err, "Failed to rewind the cron machine")
   202  			return err
   203  		}
   204  	}
   205  
   206  	// Submit set modifications. This may produce more garbage that we need to
   207  	// cleanup later (outside the transaction).
   208  	popOps := []*dsset.PopOp{}
   209  	if tidyOp != nil {
   210  		popOps = append(popOps, tidyOp)
   211  	}
   212  	if triggersOp != nil {
   213  		popOps = append(popOps, triggersOp)
   214  	}
   215  	op.debugInfoLog(c, "Removing consumed dsset items")
   216  	if op.garbage, err = dsset.FinishPop(c, popOps...); err != nil {
   217  		op.debugErrLog(c, err, "Failed to pop consumed dsset items")
   218  		return err
   219  	}
   220  
   221  	op.lastTriage = clock.Now(c).UTC()
   222  	job.LastTriage = op.lastTriage
   223  
   224  	op.debugInfoLog(c, "Landing the transaction")
   225  	return nil
   226  }
   227  
   228  // finalize is called after the transaction (successfully submitted or not) to
   229  // delete any produced garbage, submit the triage log, update monitoring
   230  // counters, etc.
   231  //
   232  // 'success' is true if the transaction successfully landed.
   233  //
   234  // It is best effort. We can't do anything meaningful if it fails: the
   235  // transaction has already landed, there's no way to unland it.
   236  func (op *triageOp) finalize(c context.Context, success bool) {
   237  	// Cleanup the garbage accumulated in the transaction only if the transaction
   238  	// really landed. If it didn't this it not really a garbage!
   239  	if success && len(op.garbage) != 0 {
   240  		op.debugInfoLog(c, "Cleaning up storage of %d dsset items", len(op.garbage))
   241  		if err := dsset.CleanupGarbage(c, op.garbage); err != nil {
   242  			op.debugErrLog(c, err, "Best effort cleanup failed")
   243  		}
   244  	}
   245  	op.debugInfoLog(c, "Done")
   246  
   247  	// Save the log. There's nothing we can do if this fails, but we try to detect
   248  	// and warn on the log staleness (see GetJobTriageLog implementation).
   249  	if op.lastTriage.IsZero() {
   250  		op.lastTriage = clock.Now(c).UTC()
   251  	}
   252  	log := JobTriageLog{
   253  		JobID:      op.jobID,
   254  		LastTriage: op.lastTriage,
   255  		DebugLog:   op.debugLog.String(),
   256  	}
   257  	if err := datastore.Put(c, &log); err != nil {
   258  		logging.WithError(err).Errorf(c, "Failed to store the triage debug log")
   259  	}
   260  }
   261  
   262  ////////////////////////////////////////////////////////////////////////////////
   263  
   264  // debugInfoLog adds a line to the triage debug log and Infof log.
   265  func (op *triageOp) debugInfoLog(c context.Context, format string, args ...any) {
   266  	logging.Infof(c, format, args...)
   267  	op.appendToLog(c, fmt.Sprintf(format, args...))
   268  }
   269  
   270  // debugErrLog adds a line to the triage debug log and Errorf log.
   271  func (op *triageOp) debugErrLog(c context.Context, err error, format string, args ...any) {
   272  	if err == nil {
   273  		logging.Errorf(c, format, args...)
   274  		op.appendToLog(c, fmt.Sprintf("Error: "+format, args...))
   275  	} else {
   276  		logging.WithError(err).Errorf(c, format, args...)
   277  		op.appendToLog(c, fmt.Sprintf("Error: "+format+" - %s", append(args, err)...))
   278  	}
   279  }
   280  
   281  // appendToLog adds a line to the debug log, prefixing it with current time.
   282  func (op *triageOp) appendToLog(c context.Context, msg string) {
   283  	const maxMessageCount = 1000
   284  
   285  	ms := clock.Now(c).Sub(op.started).Nanoseconds() / 1e6
   286  
   287  	op.debugLogLock.Lock()
   288  	if op.debugLogMsgs <= maxMessageCount {
   289  		if op.debugLogMsgs < maxMessageCount {
   290  			fmt.Fprintf(&op.debugLog, "[%03d ms] %s\n", ms, msg)
   291  		} else {
   292  			fmt.Fprintf(&op.debugLog, "<the log is too long, truncated here>")
   293  		}
   294  		op.debugLogMsgs++
   295  	}
   296  	op.debugLogLock.Unlock()
   297  }
   298  
   299  // tidyActiveInvocations removes finished invocations from ActiveInvocations,
   300  // and adds them to FinishedInvocations.
   301  //
   302  // Called within a transaction. Mutates job. Returns an open dsset.PopOp that
   303  // must be eventually submitted with dsset.FinishPop.
   304  func (op *triageOp) tidyActiveInvocations(c context.Context, job *Job) (*dsset.PopOp, error) {
   305  	now := clock.Now(c).UTC()
   306  
   307  	// Deserialize the list of recently finished invocations, as stored in the
   308  	// entity. Discard old items right away. If it is broken, log, but proceed.
   309  	// It is ~OK to loose it (this will temporary cause some API calls to return
   310  	// incomplete data).
   311  	finishedRecord, err := filteredFinishedInvs(
   312  		job.FinishedInvocationsRaw, now.Add(-FinishedInvocationsHorizon))
   313  	if err != nil {
   314  		op.debugErrLog(c, err, "Failed to unmarshal FinishedInvocationsRaw, skipping")
   315  	}
   316  
   317  	// Note that per dsset API we need to do BeginPop if there's some garbage,
   318  	// even if Items is empty. We can skip this if both lists are empty though.
   319  	var popOp *dsset.PopOp
   320  	if len(op.finishedList.Items) != 0 || len(op.finishedList.Garbage) != 0 {
   321  		popOp, err = op.finishedSet.BeginPop(c, op.finishedList)
   322  		if err != nil {
   323  			op.debugErrLog(c, err, "Failed to pop from finished invocations set")
   324  			return nil, err
   325  		}
   326  
   327  		// Items can have IDs popped by some other transaction already. Collect
   328  		// only ones consumed by us here.
   329  		reallyFinished := make(map[int64]struct{}, len(op.finishedList.Items))
   330  		for _, itm := range op.finishedList.Items {
   331  			if popOp.Pop(itm.ID) {
   332  				reallyFinished[op.finishedSet.ItemToInvID(&itm)] = struct{}{}
   333  			}
   334  		}
   335  
   336  		// Remove IDs of all finished invocations from ActiveInvocations list,
   337  		// preserving the order, and add them to the finished invocations list.
   338  		if len(reallyFinished) != 0 {
   339  			filtered := make([]int64, 0, len(job.ActiveInvocations))
   340  			for _, id := range job.ActiveInvocations {
   341  				if _, yep := reallyFinished[id]; !yep {
   342  					filtered = append(filtered, id) // still running
   343  					continue
   344  				}
   345  				op.debugInfoLog(c, "Invocation %d is acknowledged as finished", id)
   346  				finishedRecord = append(finishedRecord, &internal.FinishedInvocation{
   347  					InvocationId: id,
   348  					Finished:     timestamppb.New(now),
   349  				})
   350  			}
   351  			job.ActiveInvocations = filtered
   352  		}
   353  	}
   354  
   355  	// Marshal back FinishedInvocationsRaw after it has been updated.
   356  	job.FinishedInvocationsRaw = marshalFinishedInvs(finishedRecord)
   357  
   358  	// Log summary.
   359  	op.debugInfoLog(c, "Number of active invocations: %d", len(job.ActiveInvocations))
   360  	op.debugInfoLog(c, "Number of recently finished:  %d", len(finishedRecord))
   361  
   362  	return popOp, nil
   363  }
   364  
   365  // processTriggers pops pending triggers, converting them into invocations.
   366  func (op *triageOp) processTriggers(c context.Context, job *Job) (*dsset.PopOp, error) {
   367  	popOp, err := op.triggersSet.BeginPop(c, op.triggersList)
   368  	if err != nil {
   369  		op.debugErrLog(c, err, "Failed to pop from pending triggers set")
   370  		return nil, err
   371  	}
   372  
   373  	// Filter out all triggers already popped by some other transaction.
   374  	triggers := make([]*internal.Trigger, 0, len(op.readyTriggers))
   375  	for _, t := range op.readyTriggers {
   376  		if popOp.CanPop(t.Id) {
   377  			triggers = append(triggers, t)
   378  		}
   379  	}
   380  	op.debugInfoLog(c, "Triggers available in this txn: %d", len(triggers))
   381  
   382  	// If the job is paused or disabled, just pop all triggers without starting
   383  	// anything. Note: there's a best-effort shortcut for ignoring triggers
   384  	// for paused jobs in execEnqueueTriggersTask.
   385  	if len(triggers) != 0 {
   386  		if job.Paused || !job.Enabled {
   387  			op.debugInfoLog(c, "The job is inactive, clearing the pending triggers queue")
   388  			for _, t := range triggers {
   389  				popOp.Pop(t.Id)
   390  			}
   391  			return popOp, nil
   392  		}
   393  	}
   394  
   395  	// Keep the set of pending triggers bounded, to avoid total meltdown caused
   396  	// by slow triage transactions.
   397  	if len(triggers) > op.maxAllowedTriggers {
   398  		dropping := len(triggers) - op.maxAllowedTriggers
   399  		op.debugErrLog(c, nil, "Too many pending triggers (>%d), dropping %d oldest", op.maxAllowedTriggers, dropping)
   400  		for i := 0; i < dropping; i++ {
   401  			popOp.Pop(triggers[i].Id)
   402  		}
   403  		triggers = triggers[dropping:]
   404  	}
   405  
   406  	// Ask the policy to convert triggers into invocations. Note that triggers is
   407  	// not the only input to the triggering policy, so we call it on each triage,
   408  	// even if there's no pending triggers.
   409  	op.debugInfoLog(c, "Invoking the triggering policy function")
   410  	out := op.triggeringPolicy(c, job, triggers)
   411  	op.debugInfoLog(c, "The policy requested %d new invocations", len(out.Requests))
   412  
   413  	// Actually pop all consumed triggers and start the corresponding invocations.
   414  	// Note that this modifies job.ActiveInvocations list.
   415  	if len(out.Requests) != 0 {
   416  		consumed := 0
   417  		for _, r := range out.Requests {
   418  			for _, t := range r.IncomingTriggers {
   419  				popOp.Pop(t.Id)
   420  			}
   421  			consumed += len(r.IncomingTriggers)
   422  		}
   423  		if err := op.enqueueInvocations(c, job, out.Requests); err != nil {
   424  			op.debugErrLog(c, err, "Failed to enqueue some invocations")
   425  			return nil, err
   426  		}
   427  		op.debugInfoLog(c, "New invocations enqueued, consumed %d triggers", consumed)
   428  	}
   429  
   430  	// Discard triggers the policy decided are no longer needed.
   431  	if len(out.Discard) != 0 {
   432  		for _, t := range out.Discard {
   433  			popOp.Pop(t.Id)
   434  		}
   435  		op.debugInfoLog(c, "%d triggers discarded, according to policy", len(out.Discard))
   436  	}
   437  
   438  	return popOp, nil
   439  }
   440  
   441  // triggeringPolicy decides how to convert a set of pending triggers into
   442  // a bunch of new invocations.
   443  //
   444  // Called within a job transaction. Must not do any expensive calls.
   445  func (op *triageOp) triggeringPolicy(c context.Context, job *Job, triggers []*internal.Trigger) policy.Out {
   446  	var policyFunc policy.Func
   447  	p, err := policy.UnmarshalDefinition(job.TriggeringPolicyRaw)
   448  	if err == nil {
   449  		policyFunc, err = op.policyFactory(p)
   450  	}
   451  	if err != nil {
   452  		op.debugErrLog(c, err, "Failed to instantiate the triggering policy function, using the default policy instead")
   453  		policyFunc = policy.Default()
   454  	}
   455  	return policyFunc(policyFuncEnv{c, op}, policy.In{
   456  		Now:               clock.Now(c).UTC(),
   457  		ActiveInvocations: job.ActiveInvocations,
   458  		Triggers:          triggers,
   459  	})
   460  }
   461  
   462  // policyFuncEnv implements policy.Environment through triageOp.
   463  type policyFuncEnv struct {
   464  	ctx context.Context
   465  	op  *triageOp
   466  }
   467  
   468  func (e policyFuncEnv) DebugLog(format string, args ...any) {
   469  	e.op.debugInfoLog(e.ctx, format, args...)
   470  }