go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/scheduler/appengine/engine/invocation.go

go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/scheduler/appengine/engine/invocation.go (about)

     1  // Copyright 2017 The LUCI Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //      http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package engine
    16  
    17  import (
    18  	"bytes"
    19  	"context"
    20  	"fmt"
    21  	"math"
    22  	"strings"
    23  	"time"
    24  
    25  	"go.chromium.org/luci/gae/service/datastore"
    26  
    27  	"go.chromium.org/luci/auth/identity"
    28  	"go.chromium.org/luci/common/clock"
    29  	"go.chromium.org/luci/common/data/rand/mathrand"
    30  	"go.chromium.org/luci/common/errors"
    31  	"go.chromium.org/luci/common/logging"
    32  	"go.chromium.org/luci/common/retry/transient"
    33  	"go.chromium.org/luci/common/tsmon/distribution"
    34  	"go.chromium.org/luci/common/tsmon/field"
    35  	"go.chromium.org/luci/common/tsmon/metric"
    36  	"go.chromium.org/luci/common/tsmon/types"
    37  
    38  	"go.chromium.org/luci/scheduler/appengine/internal"
    39  	"go.chromium.org/luci/scheduler/appengine/task"
    40  )
    41  
    42  // errInvocationIDConflict is returned by generateInvocationID.
    43  var errInvocationIDConflict = errors.New("could not find available invocationID", transient.Tag)
    44  
    45  const (
    46  	// debugLogSizeLimit is how many bytes the invocation debug log can be before
    47  	// it gets trimmed. See 'trimDebugLog'. The debug log isn't supposed to be
    48  	// huge.
    49  	debugLogSizeLimit = 200000
    50  
    51  	// debugLogTailLines is how many last log lines to keep when trimming the log.
    52  	debugLogTailLines = 100
    53  )
    54  
    55  // Jan 1 2015, in UTC.
    56  var invocationIDEpoch time.Time
    57  
    58  func init() {
    59  	var err error
    60  	invocationIDEpoch, err = time.Parse(time.RFC822, "01 Jan 15 00:00 UTC")
    61  	if err != nil {
    62  		panic(err)
    63  	}
    64  }
    65  
    66  var (
    67  	// distributionBucketerSecToDay 1s to 1 day.
    68  	//
    69  	// 0.05037 is math.log10(math.exp(math.log(24*60*60)/98)), which means
    70  	// the last bucket will contain overflow of everything >=1 day.
    71  	distributionBucketerSecToDay = distribution.GeometricBucketer(math.Pow(10.0, 0.05037), 100)
    72  
    73  	metricInvocationsDurations = metric.NewCumulativeDistribution(
    74  		"luci/scheduler/invocations/durations",
    75  		"Durations of completed invocations (sec).",
    76  		&types.MetricMetadata{Units: types.Seconds},
    77  		distributionBucketerSecToDay,
    78  		field.String("jobID"),
    79  		field.String("status"), // one of final statuses of task.Status enum.
    80  	)
    81  )
    82  
    83  // generateInvocationID is called within a transaction to pick a new Invocation
    84  // ID and ensure it isn't taken yet.
    85  //
    86  // This function essentially pick root key for a new entity group, checking
    87  // that it hasn't been taken yet.
    88  //
    89  // Format of the invocation ID:
    90  //   - highest order bit set to 0 to keep the value positive.
    91  //   - next 43 bits set to negated time since some predefined epoch, in ms.
    92  //   - next 16 bits are generated by math.Rand
    93  //   - next 4 bits set to 0. They indicate ID format.
    94  //
    95  // Makes one attempt at allocating an ID. If it fails (should be extremely
    96  // rare), the entire transaction should be retried. We do it to avoid
    97  // unnecessarily hitting multiple entity groups from a single transaction.
    98  //
    99  // Returns only transient errors.
   100  func generateInvocationID(c context.Context) (int64, error) {
   101  	// See http://play.golang.org/p/POpQzpT4Up.
   102  	invTs := int64(clock.Now(c).UTC().Sub(invocationIDEpoch) / time.Millisecond)
   103  	invTs = ^invTs & 8796093022207 // 0b111....1, 42 bits (clear highest bit)
   104  	invTs = invTs << 20
   105  
   106  	randSuffix := mathrand.Int63n(c, 65536)
   107  	invID := invTs | (randSuffix << 4)
   108  	exists, err := datastore.Exists(c, datastore.NewKey(c, "Invocation", "", invID, nil))
   109  	if err != nil {
   110  		return 0, transient.Tag.Apply(err)
   111  	}
   112  	if !exists.All() {
   113  		return invID, nil
   114  	}
   115  
   116  	return 0, errInvocationIDConflict
   117  }
   118  
   119  // Invocation entity stores single invocation of a job (with perhaps multiple
   120  // attempts due retries if the invocation fails to start).
   121  //
   122  // Root entity. ID is generated based on time by generateInvocationID()
   123  // function.
   124  type Invocation struct {
   125  	_kind  string                `gae:"$kind,Invocation"`
   126  	_extra datastore.PropertyMap `gae:"-,extra"`
   127  
   128  	// ID is identifier of this particular attempt to run a job.
   129  	ID int64 `gae:"$id"`
   130  
   131  	// JobID is '<ProjectID>/<JobName>' string of a parent job.
   132  	//
   133  	// Set when the invocation is created and never changes.
   134  	JobID string `gae:",noindex"`
   135  
   136  	// IndexedJobID is '<ProjectID>/<JobName>' string of a parent job, but it is
   137  	// set only for finished invocations.
   138  	//
   139  	// It is used to make the invocations appear in the listings of finished
   140  	// invocations.
   141  	//
   142  	// We can't use JobID field for this since the invocation launch procedure can
   143  	// potentially generate orphaned "garbage" invocations in some edge cases (if
   144  	// Invocation transaction lands, but separate Job transaction doesn't). They
   145  	// are harmless, but we don't want them to show up in listings.
   146  	IndexedJobID string
   147  
   148  	// RealmID is a global realm name (i.e. "<ProjectID>:...") the invocation
   149  	// belongs to.
   150  	//
   151  	// It is copied from the Job entity when the invocation is created. May be
   152  	// empty for old invocations.
   153  	RealmID string `gae:",noindex"`
   154  
   155  	// Started is time when this invocation was created.
   156  	Started time.Time `gae:",noindex"`
   157  
   158  	// Finished is time when this invocation transitioned to a terminal state.
   159  	Finished time.Time `gae:",noindex"`
   160  
   161  	// TriggeredBy is identity of whoever triggered the invocation, if it was
   162  	// triggered via a single trigger submitted by some external user (not by the
   163  	// service itself).
   164  	//
   165  	// Empty identity string if it was triggered by the service itself.
   166  	TriggeredBy identity.Identity
   167  
   168  	// PropertiesRaw is a blob with serialized task.Request.Properties supplied
   169  	// when the invocation was created.
   170  	//
   171  	// Task managers use it to prepare the parameters for tasks.
   172  	PropertiesRaw []byte `gae:",noindex"`
   173  
   174  	// Tags is a sorted list of indexed "key:value" pairs supplied via
   175  	// task.Request.Tags when the invocation was created.
   176  	//
   177  	// May be passed down the stack by task managers.
   178  	Tags []string
   179  
   180  	// IncomingTriggersRaw is a serialized list of triggers that the invocation
   181  	// consumed.
   182  	//
   183  	// They are popped from job's pending triggers set when the invocation
   184  	// starts.
   185  	//
   186  	// Use IncomingTriggers() function to grab them in deserialized form.
   187  	IncomingTriggersRaw []byte `gae:",noindex"`
   188  
   189  	// OutgoingTriggersRaw is a serialized list of triggers that the invocation
   190  	// produced.
   191  	//
   192  	// They are fanned out into pending trigger sets of corresponding triggered
   193  	// jobs (specified by TriggeredJobIDs).
   194  	//
   195  	// Use OutgoingTriggers() function to grab them in deserialized form.
   196  	OutgoingTriggersRaw []byte `gae:",noindex"`
   197  
   198  	// PendingTimersRaw is a serialized list of pending invocation timers.
   199  	//
   200  	// Timers are emitted by Controller's AddTimer call.
   201  	//
   202  	// Use PendingTimers() function to grab them in deserialized form.
   203  	PendingTimersRaw []byte `gae:",noindex"`
   204  
   205  	// Revision is revision number of config.cfg when this invocation was created.
   206  	// For informational purpose.
   207  	Revision string `gae:",noindex"`
   208  
   209  	// RevisionURL is URL to human readable page with config file at
   210  	// an appropriate revision. For informational purpose.
   211  	RevisionURL string `gae:",noindex"`
   212  
   213  	// Task is the job payload for this invocation in binary serialized form.
   214  	// For informational purpose. See Catalog.UnmarshalTask().
   215  	Task []byte `gae:",noindex"`
   216  
   217  	// TriggeredJobIDs is a list of jobIDs of jobs which this job triggers.
   218  	// The list is sorted and without duplicates.
   219  	TriggeredJobIDs []string `gae:",noindex"`
   220  
   221  	// DebugLog is short free form text log with debug messages.
   222  	DebugLog string `gae:",noindex"`
   223  
   224  	// RetryCount is 0 on a first attempt to launch the task. Increased with each
   225  	// retry. For informational purposes.
   226  	RetryCount int64 `gae:",noindex"`
   227  
   228  	// Status is current status of the invocation (e.g. "RUNNING"), see the enum.
   229  	Status task.Status
   230  
   231  	// ViewURL is optional URL to a human readable page with task status, e.g.
   232  	// Swarming task page. Populated by corresponding TaskManager.
   233  	ViewURL string `gae:",noindex"`
   234  
   235  	// TaskData is a storage where TaskManager can keep task-specific state
   236  	// between calls.
   237  	TaskData []byte `gae:",noindex"`
   238  
   239  	// MutationsCount is used for simple compare-and-swap transaction control.
   240  	//
   241  	// It is incremented on each change to the entity.
   242  	MutationsCount int64 `gae:",noindex"`
   243  }
   244  
   245  // isEqual returns true iff 'e' is equal to 'other'
   246  func (e *Invocation) isEqual(other *Invocation) bool {
   247  	return e == other || (e.ID == other.ID &&
   248  		e.MutationsCount == other.MutationsCount && // compare it first, it changes most often
   249  		e.JobID == other.JobID &&
   250  		e.IndexedJobID == other.IndexedJobID &&
   251  		e.Started.Equal(other.Started) &&
   252  		e.Finished.Equal(other.Finished) &&
   253  		e.TriggeredBy == other.TriggeredBy &&
   254  		bytes.Equal(e.PropertiesRaw, other.PropertiesRaw) &&
   255  		equalSortedLists(e.Tags, other.Tags) &&
   256  		bytes.Equal(e.IncomingTriggersRaw, other.IncomingTriggersRaw) &&
   257  		bytes.Equal(e.OutgoingTriggersRaw, other.OutgoingTriggersRaw) &&
   258  		bytes.Equal(e.PendingTimersRaw, other.PendingTimersRaw) &&
   259  		e.Revision == other.Revision &&
   260  		e.RevisionURL == other.RevisionURL &&
   261  		bytes.Equal(e.Task, other.Task) &&
   262  		equalSortedLists(e.TriggeredJobIDs, other.TriggeredJobIDs) &&
   263  		e.DebugLog == other.DebugLog &&
   264  		e.RetryCount == other.RetryCount &&
   265  		e.Status == other.Status &&
   266  		e.ViewURL == other.ViewURL &&
   267  		bytes.Equal(e.TaskData, other.TaskData))
   268  }
   269  
   270  // GetProjectID parses the ProjectID from the JobID and returns it.
   271  func (e *Invocation) GetProjectID() string {
   272  	parts := strings.Split(e.JobID, "/")
   273  	return parts[0]
   274  }
   275  
   276  // debugLog appends a line to DebugLog field.
   277  func (e *Invocation) debugLog(c context.Context, format string, args ...any) {
   278  	debugLog(c, &e.DebugLog, format, args...)
   279  }
   280  
   281  // trimDebugLog makes sure DebugLog field doesn't exceed limits.
   282  //
   283  // It cuts the middle of the log. We need to do this to keep the entity small
   284  // enough to fit the datastore limits.
   285  func (e *Invocation) trimDebugLog() {
   286  	if len(e.DebugLog) <= debugLogSizeLimit {
   287  		return
   288  	}
   289  
   290  	const cutMsg = "--- the log has been cut here ---"
   291  	giveUp := func() {
   292  		e.DebugLog = e.DebugLog[:debugLogSizeLimit-len(cutMsg)-2] + "\n" + cutMsg + "\n"
   293  	}
   294  
   295  	// We take last debugLogTailLines lines of log and move them "up", so that
   296  	// the total log size is less than debugLogSizeLimit. We then put a line with
   297  	// the message that some log lines have been cut. If these operations are not
   298  	// possible (e.g. we have some giant lines or something), we give up and just
   299  	// cut the end of the log.
   300  
   301  	// Find debugLogTailLines-th "\n" from the end, e.DebugLog[tailStart:] is the
   302  	// log tail.
   303  	tailStart := len(e.DebugLog)
   304  	for i := 0; i < debugLogTailLines; i++ {
   305  		tailStart = strings.LastIndex(e.DebugLog[:tailStart-1], "\n")
   306  		if tailStart <= 0 {
   307  			giveUp()
   308  			return
   309  		}
   310  	}
   311  	tailStart++
   312  
   313  	// Figure out how many bytes of head we can keep to make trimmed log small
   314  	// enough.
   315  	tailLen := len(e.DebugLog) - tailStart + len(cutMsg) + 1
   316  	headSize := debugLogSizeLimit - tailLen
   317  	if headSize <= 0 {
   318  		giveUp()
   319  		return
   320  	}
   321  
   322  	// Find last "\n" in the head.
   323  	headEnd := strings.LastIndex(e.DebugLog[:headSize], "\n")
   324  	if headEnd <= 0 {
   325  		giveUp()
   326  		return
   327  	}
   328  
   329  	// We want to keep 50 lines of the head no matter what.
   330  	headLines := strings.Count(e.DebugLog[:headEnd], "\n")
   331  	if headLines < 50 {
   332  		giveUp()
   333  		return
   334  	}
   335  
   336  	// Remove duplicated 'cutMsg' lines. They may appear if 'debugLog' (followed
   337  	// by 'trimDebugLog') is called on already trimmed log multiple times.
   338  	lines := strings.Split(e.DebugLog[:headEnd], "\n")
   339  	lines = append(lines, cutMsg)
   340  	lines = append(lines, strings.Split(e.DebugLog[tailStart:], "\n")...)
   341  	trimmed := make([]byte, 0, debugLogSizeLimit)
   342  	trimmed = append(trimmed, lines[0]...)
   343  	for i := 1; i < len(lines); i++ {
   344  		if !(lines[i-1] == cutMsg && lines[i] == cutMsg) {
   345  			trimmed = append(trimmed, '\n')
   346  			trimmed = append(trimmed, lines[i]...)
   347  		}
   348  	}
   349  	e.DebugLog = string(trimmed)
   350  }
   351  
   352  // IncomingTriggers is a list of triggers that the invocation consumed.
   353  //
   354  // It is deserialized on the fly from IncomingTriggersRaw.
   355  func (e *Invocation) IncomingTriggers() ([]*internal.Trigger, error) {
   356  	return unmarshalTriggersList(e.IncomingTriggersRaw)
   357  }
   358  
   359  // OutgoingTriggers is a list of triggers that the invocation produced.
   360  //
   361  // It is deserialized on the fly from OutgoingTriggersRaw.
   362  func (e *Invocation) OutgoingTriggers() ([]*internal.Trigger, error) {
   363  	return unmarshalTriggersList(e.OutgoingTriggersRaw)
   364  }
   365  
   366  // PendingTimers is a list of not-yet-consumed invocation timers.
   367  //
   368  // It is deserialized on the fly from PendingTimersRaw.
   369  func (e *Invocation) PendingTimers() ([]*internal.Timer, error) {
   370  	return unmarshalTimersList(e.PendingTimersRaw)
   371  }
   372  
   373  // cleanupUnreferencedInvocations tries to delete given invocations.
   374  //
   375  // This is best effort cleanup after failures. It logs errors, but doesn't
   376  // return them, to indicate that there's nothing we can actually do.
   377  //
   378  // 'invs' is allowed to have nils, they are skipped. Allowed to be called
   379  // within a transaction, ignores it.
   380  func cleanupUnreferencedInvocations(c context.Context, invs []*Invocation) {
   381  	keysToKill := make([]*datastore.Key, 0, len(invs))
   382  	for _, inv := range invs {
   383  		if inv != nil {
   384  			logging.Warningf(c, "Cleaning up inv %d of job %q", inv.ID, inv.JobID)
   385  			keysToKill = append(keysToKill, datastore.KeyForObj(c, inv))
   386  		}
   387  	}
   388  	if err := datastore.Delete(datastore.WithoutTransaction(c), keysToKill); err != nil {
   389  		logging.WithError(err).Warningf(c, "Invocation cleanup failed")
   390  	}
   391  }
   392  
   393  // reportCompletionMetrics reports invocation stats to monitoring.
   394  // Should be called after transaction to save this invocation is completed.
   395  func (e *Invocation) reportCompletionMetrics(c context.Context) {
   396  	if !e.Status.Final() || e.Finished.IsZero() {
   397  		panic(fmt.Errorf("reportCompletionMetrics on incomplete invocation: %v", e))
   398  	}
   399  	duration := e.Finished.Sub(e.Started)
   400  	metricInvocationsDurations.Add(c, duration.Seconds(), e.JobID, string(e.Status))
   401  }