// Copyright 2017 The LUCI Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package engine

import (
	"bytes"
	"context"
	"fmt"
	"hash/fnv"
	"strconv"
	"strings"
	"time"

	"google.golang.org/protobuf/proto"

	"go.chromium.org/luci/auth/identity"
	"go.chromium.org/luci/common/errors"
	"go.chromium.org/luci/gae/service/datastore"

	"go.chromium.org/luci/scheduler/appengine/catalog"
	"go.chromium.org/luci/scheduler/appengine/engine/cron"
	"go.chromium.org/luci/scheduler/appengine/engine/dsset"
	"go.chromium.org/luci/scheduler/appengine/internal"
	"go.chromium.org/luci/scheduler/appengine/schedule"
)

// FinishedInvocationsHorizon defines for how long finished invocations are
// kept in the Job's FinishedInvocations list.
//
// All entries there that are older than FinishedInvocationsHorizon will be
// evicted next time the list is updated.
const FinishedInvocationsHorizon = 10 * time.Minute

// Job stores the last known definition of a scheduler job, as well as its
// current state. Root entity, its kind is "Job".
type Job struct {
	_kind  string                `gae:"$kind,Job"`
	_extra datastore.PropertyMap `gae:"-,extra"`

	// cachedSchedule and cachedScheduleErr are used by ParseSchedule().
	cachedSchedule    *schedule.Schedule `gae:"-"`
	cachedScheduleErr error              `gae:"-"`

	// JobID is '<ProjectID>/<JobName>' string. JobName is unique within a
	// project, but not globally. JobID is unique globally.
	JobID string `gae:"$id"`

	// ProjectID exists for indexing. It matches <projectID> portion of JobID.
	ProjectID string

	// RealmID is a global realm name (i.e. "<ProjectID>:...") the job belongs to.
	RealmID string

	// Flavor describes what category of jobs this is, see the enum.
	Flavor catalog.JobFlavor `gae:",noindex"`

	// Enabled is false if the job was disabled or removed from config.
	//
	// Disabled jobs do not show up in UI at all (they are still kept in the
	// datastore though, for audit purposes).
	Enabled bool

	// Paused is true if no new invocations of the job should be started.
	//
	// Paused jobs ignore the cron scheduler and incoming triggers. Triggers are
	// completely skipped (not even enqueued). Pausing a job clears the pending
	// triggers set.
	Paused bool `gae:",noindex"`

	// PausedOrResumedWhen is when the job was paused or resumed.
	PausedOrResumedWhen time.Time `gae:",noindex"`

	// PausedOrResumedBy is who paused or resumed the job last.
	PausedOrResumedBy identity.Identity `gae:",noindex"`

	// PausedOrResumedReason is the reason the job was paused or resumed.
	PausedOrResumedReason string `gae:",noindex"`

	// Revision is last seen job definition revision.
	Revision string `gae:",noindex"`

	// RevisionURL is URL to human readable page with config file at
	// an appropriate revision.
	RevisionURL string `gae:",noindex"`

	// Schedule is the job's schedule in regular cron expression format.
	Schedule string `gae:",noindex"`

	// Task is the job's payload in serialized form. Opaque from the point of view
	// of the engine. See Catalog.UnmarshalTask().
	Task []byte `gae:",noindex"`

	// TriggeredJobIDs is a list of jobIDs of jobs which this job triggers.
	// The list is sorted and without duplicates.
	TriggeredJobIDs []string `gae:",noindex"`

	// Cron holds the state of the cron state machine.
	Cron cron.State `gae:",noindex"`

	// TriggeringPolicyRaw is job's TriggeringPolicy proto in serialized form.
	//
	// It is taken from the job definition stored in the catalog. Used during
	// the triage.
	TriggeringPolicyRaw []byte `gae:",noindex"`

	// ActiveInvocations is ordered set of active invocation IDs.
	//
	// It contains IDs of pending, running or recently finished invocations,
	// the most recent at the end.
	ActiveInvocations []int64 `gae:",noindex"`

	// FinishedInvocationsRaw is a list of recently finished invocations, along
	// with the time they finished.
	//
	// It is serialized internal.FinishedInvocationList proto, see db.proto. We
	// store it this way to simplify adding more fields if necessary and to avoid
	// paying the cost of the deserialization if the caller is not interested.
	//
	// This list is used to achieve a perfectly consistent listing of all recent
	// invocations of a job.
	//
	// Entries older than FinishedInvocationsHorizon are evicted from this list
	// during triages. We assume that FinishedInvocationsHorizon is enough for
	// datastore indexes to catch up, so all recent invocations older than the
	// horizon can be fetched using a regular datastore query.
	FinishedInvocationsRaw []byte `gae:",noindex"`

	// LastTriage is a time when the last triage transaction was committed.
	LastTriage time.Time `gae:",noindex"`
}

// JobTriageLog contains information about the most recent triage.
145 // 146 // To avoid increasing the triage transaction size, and to allow logging triage 147 // transaction collisions, this entity is saved non-transactionally in a 148 // separate entity group on a best effort basis. 149 // 150 // It means it may occasionally be stale. To detect staleness we duplicate 151 // LastTriage timestamp here. If Job.LastTriage indicates the triage happened 152 // sufficiently log ago (by wall clock), but JobTriageLog.LastTriage is still 153 // old, then the log is stale (since JobTriageLog commit should have landed 154 // already). When this happens consistently we'll have to use real GAE logs to 155 // figure out what's wrong. 156 type JobTriageLog struct { 157 _kind string `gae:"$kind,JobTriageLog"` 158 _extra datastore.PropertyMap `gae:"-,extra"` 159 160 // JobID is '<ProjectID>/<JobName>' string, matches corresponding Job.JobID. 161 JobID string `gae:"$id"` 162 // LastTriage is set to exact same value as corresponding Job.LastTriage. 163 LastTriage time.Time `gae:",noindex"` 164 // DebugLog is short free form text log with debug messages. 165 DebugLog string `gae:",noindex"` 166 167 // stale is populated by GetJobTriageLog if it thinks the log is stale. 168 stale bool `gae:"-"` 169 } 170 171 // Stale is true if the engine thinks the log is stale. 172 // 173 // It does it by comparing LastTriage to the job's LastTriage. 174 func (j *JobTriageLog) Stale() bool { 175 return j.stale 176 } 177 178 // JobName returns name of this Job as defined its project's config. 179 // 180 // This is "<name>" part extracted from "<project>/<name>" job ID. 181 func (e *Job) JobName() string { 182 chunks := strings.Split(e.JobID, "/") 183 return chunks[1] 184 } 185 186 // EffectiveSchedule returns schedule string to use for the job, considering its 187 // Paused field. 188 // 189 // Paused jobs always use "triggered" schedule. 
190 func (e *Job) EffectiveSchedule() string { 191 if e.Paused { 192 return "triggered" 193 } 194 return e.Schedule 195 } 196 197 // ParseSchedule returns *Schedule object, parsing e.Schedule field. 198 // 199 // If job is paused e.Schedule field is ignored and "triggered" schedule is 200 // returned instead. 201 func (e *Job) ParseSchedule() (*schedule.Schedule, error) { 202 if e.cachedSchedule == nil && e.cachedScheduleErr == nil { 203 hash := fnv.New64() 204 hash.Write([]byte(e.JobID)) 205 seed := hash.Sum64() 206 e.cachedSchedule, e.cachedScheduleErr = schedule.Parse(e.EffectiveSchedule(), seed) 207 if e.cachedSchedule == nil && e.cachedScheduleErr == nil { 208 panic("no schedule and no error") 209 } 210 } 211 return e.cachedSchedule, e.cachedScheduleErr 212 } 213 214 // IsEqual returns true iff 'e' is equal to 'other'. 215 func (e *Job) IsEqual(other *Job) bool { 216 return e == other || (e.JobID == other.JobID && 217 e.ProjectID == other.ProjectID && 218 e.RealmID == other.RealmID && 219 e.Flavor == other.Flavor && 220 e.Enabled == other.Enabled && 221 e.Paused == other.Paused && 222 e.PausedOrResumedWhen.Equal(other.PausedOrResumedWhen) && 223 e.PausedOrResumedBy == other.PausedOrResumedBy && 224 e.PausedOrResumedReason == other.PausedOrResumedReason && 225 e.Revision == other.Revision && 226 e.RevisionURL == other.RevisionURL && 227 e.Schedule == other.Schedule && 228 e.LastTriage.Equal(other.LastTriage) && 229 bytes.Equal(e.Task, other.Task) && 230 equalSortedLists(e.TriggeredJobIDs, other.TriggeredJobIDs) && 231 e.Cron.Equal(&other.Cron) && 232 bytes.Equal(e.TriggeringPolicyRaw, other.TriggeringPolicyRaw) && 233 equalInt64Lists(e.ActiveInvocations, other.ActiveInvocations) && 234 bytes.Equal(e.FinishedInvocationsRaw, other.FinishedInvocationsRaw)) 235 } 236 237 // MatchesDefinition returns true if job definition in the entity matches the 238 // one specified by catalog.Definition struct. 
239 func (e *Job) MatchesDefinition(def catalog.Definition) bool { 240 return e.JobID == def.JobID && 241 e.RealmID == def.RealmID && 242 e.Flavor == def.Flavor && 243 e.Schedule == def.Schedule && 244 bytes.Equal(e.Task, def.Task) && 245 bytes.Equal(e.TriggeringPolicyRaw, def.TriggeringPolicy) && 246 equalSortedLists(e.TriggeredJobIDs, def.TriggeredJobIDs) 247 } 248 249 // CronTickTime returns time when the cron job is expected to start again. 250 // 251 // May return: 252 // 253 // Zero time if the job is using relative schedule, or not a cron job at all. 254 // schedule.DistantFuture if the job is paused. 255 func (e *Job) CronTickTime() time.Time { 256 // Note: LastTick is "last scheduled tick", it is in the future. 257 return e.Cron.LastTick.When 258 } 259 260 // recentlyFinishedSet is a set with IDs of all recently finished invocations. 261 // 262 // This is an accumulator of IDs to remove from ActiveInvocations list next 263 // time we run a triage for the corresponding Job. 264 // 265 // Invocation IDs are serialized with fmt.Sprintf("%d"). 266 func recentlyFinishedSet(c context.Context, jobID string) *invocationIDSet { 267 return &invocationIDSet{ 268 Set: dsset.Set{ 269 ID: "finished:" + jobID, 270 ShardCount: 8, 271 TombstonesRoot: datastore.KeyForObj(c, &Job{JobID: jobID}), 272 TombstonesDelay: 30 * time.Minute, 273 }, 274 } 275 } 276 277 // invocationIDSet is a dsset.Set that stores invocation IDs. 278 type invocationIDSet struct { 279 dsset.Set 280 } 281 282 // Add adds a bunch of invocation IDs to the set. 283 func (s *invocationIDSet) Add(c context.Context, ids []int64) error { 284 items := make([]dsset.Item, len(ids)) 285 for i, id := range ids { 286 items[i].ID = fmt.Sprintf("%d", id) 287 } 288 return s.Set.Add(c, items) 289 } 290 291 // ItemToInvID takes a dsset.Item and returns invocation ID stored there or 0 if 292 // it's malformed. 
293 func (s *invocationIDSet) ItemToInvID(i *dsset.Item) int64 { 294 id, _ := strconv.ParseInt(i.ID, 10, 64) 295 return id 296 } 297 298 // pendingTriggersSet is a set of not yet consumed triggers for the job. 299 // 300 // This is incoming triggers. They are processed in the triage procedure, 301 // resulting in new invocations. 302 func pendingTriggersSet(c context.Context, jobID string) *triggersSet { 303 return &triggersSet{ 304 Set: dsset.Set{ 305 ID: "triggers:" + jobID, 306 ShardCount: 8, 307 TombstonesRoot: datastore.KeyForObj(c, &Job{JobID: jobID}), 308 TombstonesDelay: 30 * time.Minute, 309 }, 310 } 311 } 312 313 // triggersSet is a dsset.Set that stores internal.Trigger protos. 314 type triggersSet struct { 315 dsset.Set 316 } 317 318 // Add adds triggers to the set. 319 func (s *triggersSet) Add(c context.Context, triggers []*internal.Trigger) error { 320 items := make([]dsset.Item, 0, len(triggers)) 321 for _, t := range triggers { 322 blob, err := proto.Marshal(t) 323 if err != nil { 324 return fmt.Errorf("failed to marshal proto - %s", err) 325 } 326 items = append(items, dsset.Item{ 327 ID: t.Id, 328 Value: blob, 329 }) 330 } 331 return s.Set.Add(c, items) 332 } 333 334 // Triggers returns all triggers in the set, in no particular order. 335 // 336 // Returns the original dsset listing (that can be used later with BeginPop or 337 // CleanupGarbage), as well as the actual deserialized set of triggers. 
338 func (s *triggersSet) Triggers(c context.Context) (*dsset.Listing, []*internal.Trigger, error) { 339 l, err := s.Set.List(c) 340 if err != nil { 341 return nil, nil, err 342 } 343 out := make([]*internal.Trigger, len(l.Items)) 344 for i, item := range l.Items { 345 out[i] = &internal.Trigger{} 346 if err := proto.Unmarshal(item.Value, out[i]); err != nil { 347 return nil, nil, errors.Annotate(err, "failed to unmarshal trigger").Err() 348 } 349 if out[i].Id != item.ID { 350 return nil, nil, fmt.Errorf("trigger ID in the body (%q) doesn't match item ID %q", out[i].Id, item.ID) 351 } 352 } 353 return l, out, nil 354 }