go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/scheduler/appengine/engine/job.go (about)

     1  // Copyright 2017 The LUCI Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //      http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package engine
    16  
    17  import (
    18  	"bytes"
    19  	"context"
    20  	"fmt"
    21  	"hash/fnv"
    22  	"strconv"
    23  	"strings"
    24  	"time"
    25  
    26  	"google.golang.org/protobuf/proto"
    27  
    28  	"go.chromium.org/luci/auth/identity"
    29  	"go.chromium.org/luci/common/errors"
    30  	"go.chromium.org/luci/gae/service/datastore"
    31  
    32  	"go.chromium.org/luci/scheduler/appengine/catalog"
    33  	"go.chromium.org/luci/scheduler/appengine/engine/cron"
    34  	"go.chromium.org/luci/scheduler/appengine/engine/dsset"
    35  	"go.chromium.org/luci/scheduler/appengine/internal"
    36  	"go.chromium.org/luci/scheduler/appengine/schedule"
    37  )
    38  
// FinishedInvocationsHorizon defines for how long finished invocations are
// kept in the Job's FinishedInvocations list.
//
// It is a duration, not a count: all entries there that are older than
// FinishedInvocationsHorizon will be evicted next time the list is updated.
const FinishedInvocationsHorizon = 10 * time.Minute
    45  
// Job stores the last known definition of a scheduler job, as well as its
// current state. Root entity, its kind is "Job".
type Job struct {
	_kind  string                `gae:"$kind,Job"`
	_extra datastore.PropertyMap `gae:"-,extra"`

	// cachedSchedule and cachedScheduleErr are used by ParseSchedule().
	cachedSchedule    *schedule.Schedule `gae:"-"`
	cachedScheduleErr error              `gae:"-"`

	// JobID is '<ProjectID>/<JobName>' string. JobName is unique within a
	// project, but not globally. JobID is unique globally.
	JobID string `gae:"$id"`

	// ProjectID exists for indexing. It matches <projectID> portion of JobID.
	ProjectID string

	// RealmID is a global realm name (i.e. "<ProjectID>:...") the job belongs to.
	RealmID string

	// Flavor describes what category of jobs this is, see the enum.
	Flavor catalog.JobFlavor `gae:",noindex"`

	// Enabled is false if the job was disabled or removed from config.
	//
	// Disabled jobs do not show up in UI at all (they are still kept in the
	// datastore though, for audit purposes).
	Enabled bool

	// Paused is true if no new invocations of the job should be started.
	//
	// Paused jobs ignore the cron scheduler and incoming triggers. Triggers are
	// completely skipped (not even enqueued). Pausing a job clears the pending
	// triggers set.
	Paused bool `gae:",noindex"`

	// PausedOrResumedWhen is when the job was paused or resumed.
	PausedOrResumedWhen time.Time `gae:",noindex"`

	// PausedOrResumedBy is who last paused or resumed the job.
	PausedOrResumedBy identity.Identity `gae:",noindex"`

	// PausedOrResumedReason is the reason the job was paused or resumed.
	PausedOrResumedReason string `gae:",noindex"`

	// Revision is last seen job definition revision.
	Revision string `gae:",noindex"`

	// RevisionURL is URL to human readable page with config file at
	// an appropriate revision.
	RevisionURL string `gae:",noindex"`

	// Schedule is the job's schedule in regular cron expression format.
	Schedule string `gae:",noindex"`

	// Task is the job's payload in serialized form. Opaque from the point of view
	// of the engine. See Catalog.UnmarshalTask().
	Task []byte `gae:",noindex"`

	// TriggeredJobIDs is a list of jobIDs of jobs which this job triggers.
	// The list is sorted and without duplicates.
	TriggeredJobIDs []string `gae:",noindex"`

	// Cron holds the state of the cron state machine.
	Cron cron.State `gae:",noindex"`

	// TriggeringPolicyRaw is job's TriggeringPolicy proto in serialized form.
	//
	// It is taken from the job definition stored in the catalog. Used during
	// the triage.
	TriggeringPolicyRaw []byte `gae:",noindex"`

	// ActiveInvocations is ordered set of active invocation IDs.
	//
	// It contains IDs of pending, running or recently finished invocations,
	// the most recent at the end.
	ActiveInvocations []int64 `gae:",noindex"`

	// FinishedInvocationsRaw is a list of recently finished invocations, along
	// with the time they finished.
	//
	// It is serialized internal.FinishedInvocationList proto, see db.proto. We
	// store it this way to simplify adding more fields if necessary and to avoid
	// paying the cost of the deserialization if the caller is not interested.
	//
	// This list is used to achieve a perfectly consistent listing of all recent
	// invocations of a job.
	//
	// Entries older than FinishedInvocationsHorizon are evicted from this list
	// during triages. We assume that FinishedInvocationsHorizon is enough for
	// datastore indexes to catch up, so all recent invocations older than the
	// horizon can be fetched using a regular datastore query.
	FinishedInvocationsRaw []byte `gae:",noindex"`

	// LastTriage is a time when the last triage transaction was committed.
	LastTriage time.Time `gae:",noindex"`
}
   143  
// JobTriageLog contains information about the most recent triage.
//
// To avoid increasing the triage transaction size, and to allow logging triage
// transaction collisions, this entity is saved non-transactionally in a
// separate entity group on a best effort basis.
//
// It means it may occasionally be stale. To detect staleness we duplicate
// LastTriage timestamp here. If Job.LastTriage indicates the triage happened
// sufficiently long ago (by wall clock), but JobTriageLog.LastTriage is still
// old, then the log is stale (since JobTriageLog commit should have landed
// already). When this happens consistently we'll have to use real GAE logs to
// figure out what's wrong.
type JobTriageLog struct {
	_kind  string                `gae:"$kind,JobTriageLog"`
	_extra datastore.PropertyMap `gae:"-,extra"`

	// JobID is '<ProjectID>/<JobName>' string, matches corresponding Job.JobID.
	JobID string `gae:"$id"`
	// LastTriage is set to exact same value as corresponding Job.LastTriage.
	LastTriage time.Time `gae:",noindex"`
	// DebugLog is short free form text log with debug messages.
	DebugLog string `gae:",noindex"`

	// stale is populated by GetJobTriageLog if it thinks the log is stale.
	// It is not stored in the datastore (note the "-" tag).
	stale bool `gae:"-"`
}
   170  
   171  // Stale is true if the engine thinks the log is stale.
   172  //
   173  // It does it by comparing LastTriage to the job's LastTriage.
   174  func (j *JobTriageLog) Stale() bool {
   175  	return j.stale
   176  }
   177  
   178  // JobName returns name of this Job as defined its project's config.
   179  //
   180  // This is "<name>" part extracted from "<project>/<name>" job ID.
   181  func (e *Job) JobName() string {
   182  	chunks := strings.Split(e.JobID, "/")
   183  	return chunks[1]
   184  }
   185  
   186  // EffectiveSchedule returns schedule string to use for the job, considering its
   187  // Paused field.
   188  //
   189  // Paused jobs always use "triggered" schedule.
   190  func (e *Job) EffectiveSchedule() string {
   191  	if e.Paused {
   192  		return "triggered"
   193  	}
   194  	return e.Schedule
   195  }
   196  
   197  // ParseSchedule returns *Schedule object, parsing e.Schedule field.
   198  //
   199  // If job is paused e.Schedule field is ignored and "triggered" schedule is
   200  // returned instead.
   201  func (e *Job) ParseSchedule() (*schedule.Schedule, error) {
   202  	if e.cachedSchedule == nil && e.cachedScheduleErr == nil {
   203  		hash := fnv.New64()
   204  		hash.Write([]byte(e.JobID))
   205  		seed := hash.Sum64()
   206  		e.cachedSchedule, e.cachedScheduleErr = schedule.Parse(e.EffectiveSchedule(), seed)
   207  		if e.cachedSchedule == nil && e.cachedScheduleErr == nil {
   208  			panic("no schedule and no error")
   209  		}
   210  	}
   211  	return e.cachedSchedule, e.cachedScheduleErr
   212  }
   213  
   214  // IsEqual returns true iff 'e' is equal to 'other'.
   215  func (e *Job) IsEqual(other *Job) bool {
   216  	return e == other || (e.JobID == other.JobID &&
   217  		e.ProjectID == other.ProjectID &&
   218  		e.RealmID == other.RealmID &&
   219  		e.Flavor == other.Flavor &&
   220  		e.Enabled == other.Enabled &&
   221  		e.Paused == other.Paused &&
   222  		e.PausedOrResumedWhen.Equal(other.PausedOrResumedWhen) &&
   223  		e.PausedOrResumedBy == other.PausedOrResumedBy &&
   224  		e.PausedOrResumedReason == other.PausedOrResumedReason &&
   225  		e.Revision == other.Revision &&
   226  		e.RevisionURL == other.RevisionURL &&
   227  		e.Schedule == other.Schedule &&
   228  		e.LastTriage.Equal(other.LastTriage) &&
   229  		bytes.Equal(e.Task, other.Task) &&
   230  		equalSortedLists(e.TriggeredJobIDs, other.TriggeredJobIDs) &&
   231  		e.Cron.Equal(&other.Cron) &&
   232  		bytes.Equal(e.TriggeringPolicyRaw, other.TriggeringPolicyRaw) &&
   233  		equalInt64Lists(e.ActiveInvocations, other.ActiveInvocations) &&
   234  		bytes.Equal(e.FinishedInvocationsRaw, other.FinishedInvocationsRaw))
   235  }
   236  
   237  // MatchesDefinition returns true if job definition in the entity matches the
   238  // one specified by catalog.Definition struct.
   239  func (e *Job) MatchesDefinition(def catalog.Definition) bool {
   240  	return e.JobID == def.JobID &&
   241  		e.RealmID == def.RealmID &&
   242  		e.Flavor == def.Flavor &&
   243  		e.Schedule == def.Schedule &&
   244  		bytes.Equal(e.Task, def.Task) &&
   245  		bytes.Equal(e.TriggeringPolicyRaw, def.TriggeringPolicy) &&
   246  		equalSortedLists(e.TriggeredJobIDs, def.TriggeredJobIDs)
   247  }
   248  
   249  // CronTickTime returns time when the cron job is expected to start again.
   250  //
   251  // May return:
   252  //
   253  //	Zero time if the job is using relative schedule, or not a cron job at all.
   254  //	schedule.DistantFuture if the job is paused.
   255  func (e *Job) CronTickTime() time.Time {
   256  	// Note: LastTick is "last scheduled tick", it is in the future.
   257  	return e.Cron.LastTick.When
   258  }
   259  
   260  // recentlyFinishedSet is a set with IDs of all recently finished invocations.
   261  //
   262  // This is an accumulator of IDs to remove from ActiveInvocations list next
   263  // time we run a triage for the corresponding Job.
   264  //
   265  // Invocation IDs are serialized with fmt.Sprintf("%d").
   266  func recentlyFinishedSet(c context.Context, jobID string) *invocationIDSet {
   267  	return &invocationIDSet{
   268  		Set: dsset.Set{
   269  			ID:              "finished:" + jobID,
   270  			ShardCount:      8,
   271  			TombstonesRoot:  datastore.KeyForObj(c, &Job{JobID: jobID}),
   272  			TombstonesDelay: 30 * time.Minute,
   273  		},
   274  	}
   275  }
   276  
// invocationIDSet is a dsset.Set that stores invocation IDs.
//
// It embeds dsset.Set and adds helpers that convert between int64 invocation
// IDs and the string IDs of dsset items.
type invocationIDSet struct {
	dsset.Set
}
   281  
   282  // Add adds a bunch of invocation IDs to the set.
   283  func (s *invocationIDSet) Add(c context.Context, ids []int64) error {
   284  	items := make([]dsset.Item, len(ids))
   285  	for i, id := range ids {
   286  		items[i].ID = fmt.Sprintf("%d", id)
   287  	}
   288  	return s.Set.Add(c, items)
   289  }
   290  
   291  // ItemToInvID takes a dsset.Item and returns invocation ID stored there or 0 if
   292  // it's malformed.
   293  func (s *invocationIDSet) ItemToInvID(i *dsset.Item) int64 {
   294  	id, _ := strconv.ParseInt(i.ID, 10, 64)
   295  	return id
   296  }
   297  
   298  // pendingTriggersSet is a set of not yet consumed triggers for the job.
   299  //
   300  // This is incoming triggers. They are processed in the triage procedure,
   301  // resulting in new invocations.
   302  func pendingTriggersSet(c context.Context, jobID string) *triggersSet {
   303  	return &triggersSet{
   304  		Set: dsset.Set{
   305  			ID:              "triggers:" + jobID,
   306  			ShardCount:      8,
   307  			TombstonesRoot:  datastore.KeyForObj(c, &Job{JobID: jobID}),
   308  			TombstonesDelay: 30 * time.Minute,
   309  		},
   310  	}
   311  }
   312  
// triggersSet is a dsset.Set that stores internal.Trigger protos.
//
// It embeds dsset.Set and adds helpers that serialize triggers into dsset
// items and back.
type triggersSet struct {
	dsset.Set
}
   317  
   318  // Add adds triggers to the set.
   319  func (s *triggersSet) Add(c context.Context, triggers []*internal.Trigger) error {
   320  	items := make([]dsset.Item, 0, len(triggers))
   321  	for _, t := range triggers {
   322  		blob, err := proto.Marshal(t)
   323  		if err != nil {
   324  			return fmt.Errorf("failed to marshal proto - %s", err)
   325  		}
   326  		items = append(items, dsset.Item{
   327  			ID:    t.Id,
   328  			Value: blob,
   329  		})
   330  	}
   331  	return s.Set.Add(c, items)
   332  }
   333  
   334  // Triggers returns all triggers in the set, in no particular order.
   335  //
   336  // Returns the original dsset listing (that can be used later with BeginPop or
   337  // CleanupGarbage), as well as the actual deserialized set of triggers.
   338  func (s *triggersSet) Triggers(c context.Context) (*dsset.Listing, []*internal.Trigger, error) {
   339  	l, err := s.Set.List(c)
   340  	if err != nil {
   341  		return nil, nil, err
   342  	}
   343  	out := make([]*internal.Trigger, len(l.Items))
   344  	for i, item := range l.Items {
   345  		out[i] = &internal.Trigger{}
   346  		if err := proto.Unmarshal(item.Value, out[i]); err != nil {
   347  			return nil, nil, errors.Annotate(err, "failed to unmarshal trigger").Err()
   348  		}
   349  		if out[i].Id != item.ID {
   350  			return nil, nil, fmt.Errorf("trigger ID in the body (%q) doesn't match item ID %q", out[i].Id, item.ID)
   351  		}
   352  	}
   353  	return l, out, nil
   354  }