go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/scheduler/appengine/engine/invocation.go (about) 1 // Copyright 2017 The LUCI Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package engine 16 17 import ( 18 "bytes" 19 "context" 20 "fmt" 21 "math" 22 "strings" 23 "time" 24 25 "go.chromium.org/luci/gae/service/datastore" 26 27 "go.chromium.org/luci/auth/identity" 28 "go.chromium.org/luci/common/clock" 29 "go.chromium.org/luci/common/data/rand/mathrand" 30 "go.chromium.org/luci/common/errors" 31 "go.chromium.org/luci/common/logging" 32 "go.chromium.org/luci/common/retry/transient" 33 "go.chromium.org/luci/common/tsmon/distribution" 34 "go.chromium.org/luci/common/tsmon/field" 35 "go.chromium.org/luci/common/tsmon/metric" 36 "go.chromium.org/luci/common/tsmon/types" 37 38 "go.chromium.org/luci/scheduler/appengine/internal" 39 "go.chromium.org/luci/scheduler/appengine/task" 40 ) 41 42 // errInvocationIDConflict is returned by generateInvocationID. 43 var errInvocationIDConflict = errors.New("could not find available invocationID", transient.Tag) 44 45 const ( 46 // debugLogSizeLimit is how many bytes the invocation debug log can be before 47 // it gets trimmed. See 'trimDebugLog'. The debug log isn't supposed to be 48 // huge. 49 debugLogSizeLimit = 200000 50 51 // debugLogTailLines is how many last log lines to keep when trimming the log. 52 debugLogTailLines = 100 53 ) 54 55 // Jan 1 2015, in UTC. 56 var invocationIDEpoch time.Time 57 58 func init() { 59 var err error 60 invocationIDEpoch, err = time.Parse(time.RFC822, "01 Jan 15 00:00 UTC") 61 if err != nil { 62 panic(err) 63 } 64 } 65 66 var ( 67 // distributionBucketerSecToDay 1s to 1 day. 68 // 69 // 0.05037 is math.log10(math.exp(math.log(24*60*60)/98)), which means 70 // the last bucket will contain overflow of everything >=1 day. 71 distributionBucketerSecToDay = distribution.GeometricBucketer(math.Pow(10.0, 0.05037), 100) 72 73 metricInvocationsDurations = metric.NewCumulativeDistribution( 74 "luci/scheduler/invocations/durations", 75 "Durations of completed invocations (sec).", 76 &types.MetricMetadata{Units: types.Seconds}, 77 distributionBucketerSecToDay, 78 field.String("jobID"), 79 field.String("status"), // one of final statuses of task.Status enum. 80 ) 81 ) 82 83 // generateInvocationID is called within a transaction to pick a new Invocation 84 // ID and ensure it isn't taken yet. 85 // 86 // This function essentially pick root key for a new entity group, checking 87 // that it hasn't been taken yet. 88 // 89 // Format of the invocation ID: 90 // - highest order bit set to 0 to keep the value positive. 91 // - next 43 bits set to negated time since some predefined epoch, in ms. 92 // - next 16 bits are generated by math.Rand 93 // - next 4 bits set to 0. They indicate ID format. 94 // 95 // Makes one attempt at allocating an ID. If it fails (should be extremely 96 // rare), the entire transaction should be retried. We do it to avoid 97 // unnecessarily hitting multiple entity groups from a single transaction. 98 // 99 // Returns only transient errors. 100 func generateInvocationID(c context.Context) (int64, error) { 101 // See http://play.golang.org/p/POpQzpT4Up. 102 invTs := int64(clock.Now(c).UTC().Sub(invocationIDEpoch) / time.Millisecond) 103 invTs = ^invTs & 8796093022207 // 0b111....1, 42 bits (clear highest bit) 104 invTs = invTs << 20 105 106 randSuffix := mathrand.Int63n(c, 65536) 107 invID := invTs | (randSuffix << 4) 108 exists, err := datastore.Exists(c, datastore.NewKey(c, "Invocation", "", invID, nil)) 109 if err != nil { 110 return 0, transient.Tag.Apply(err) 111 } 112 if !exists.All() { 113 return invID, nil 114 } 115 116 return 0, errInvocationIDConflict 117 } 118 119 // Invocation entity stores single invocation of a job (with perhaps multiple 120 // attempts due retries if the invocation fails to start). 121 // 122 // Root entity. ID is generated based on time by generateInvocationID() 123 // function. 124 type Invocation struct { 125 _kind string `gae:"$kind,Invocation"` 126 _extra datastore.PropertyMap `gae:"-,extra"` 127 128 // ID is identifier of this particular attempt to run a job. 129 ID int64 `gae:"$id"` 130 131 // JobID is '<ProjectID>/<JobName>' string of a parent job. 132 // 133 // Set when the invocation is created and never changes. 134 JobID string `gae:",noindex"` 135 136 // IndexedJobID is '<ProjectID>/<JobName>' string of a parent job, but it is 137 // set only for finished invocations. 138 // 139 // It is used to make the invocations appear in the listings of finished 140 // invocations. 141 // 142 // We can't use JobID field for this since the invocation launch procedure can 143 // potentially generate orphaned "garbage" invocations in some edge cases (if 144 // Invocation transaction lands, but separate Job transaction doesn't). They 145 // are harmless, but we don't want them to show up in listings. 146 IndexedJobID string 147 148 // RealmID is a global realm name (i.e. "<ProjectID>:...") the invocation 149 // belongs to. 150 // 151 // It is copied from the Job entity when the invocation is created. May be 152 // empty for old invocations. 153 RealmID string `gae:",noindex"` 154 155 // Started is time when this invocation was created. 156 Started time.Time `gae:",noindex"` 157 158 // Finished is time when this invocation transitioned to a terminal state. 159 Finished time.Time `gae:",noindex"` 160 161 // TriggeredBy is identity of whoever triggered the invocation, if it was 162 // triggered via a single trigger submitted by some external user (not by the 163 // service itself). 164 // 165 // Empty identity string if it was triggered by the service itself. 166 TriggeredBy identity.Identity 167 168 // PropertiesRaw is a blob with serialized task.Request.Properties supplied 169 // when the invocation was created. 170 // 171 // Task managers use it to prepare the parameters for tasks. 172 PropertiesRaw []byte `gae:",noindex"` 173 174 // Tags is a sorted list of indexed "key:value" pairs supplied via 175 // task.Request.Tags when the invocation was created. 176 // 177 // May be passed down the stack by task managers. 178 Tags []string 179 180 // IncomingTriggersRaw is a serialized list of triggers that the invocation 181 // consumed. 182 // 183 // They are popped from job's pending triggers set when the invocation 184 // starts. 185 // 186 // Use IncomingTriggers() function to grab them in deserialized form. 187 IncomingTriggersRaw []byte `gae:",noindex"` 188 189 // OutgoingTriggersRaw is a serialized list of triggers that the invocation 190 // produced. 191 // 192 // They are fanned out into pending trigger sets of corresponding triggered 193 // jobs (specified by TriggeredJobIDs). 194 // 195 // Use OutgoingTriggers() function to grab them in deserialized form. 196 OutgoingTriggersRaw []byte `gae:",noindex"` 197 198 // PendingTimersRaw is a serialized list of pending invocation timers. 199 // 200 // Timers are emitted by Controller's AddTimer call. 201 // 202 // Use PendingTimers() function to grab them in deserialized form. 203 PendingTimersRaw []byte `gae:",noindex"` 204 205 // Revision is revision number of config.cfg when this invocation was created. 206 // For informational purpose. 207 Revision string `gae:",noindex"` 208 209 // RevisionURL is URL to human readable page with config file at 210 // an appropriate revision. For informational purpose. 211 RevisionURL string `gae:",noindex"` 212 213 // Task is the job payload for this invocation in binary serialized form. 214 // For informational purpose. See Catalog.UnmarshalTask(). 215 Task []byte `gae:",noindex"` 216 217 // TriggeredJobIDs is a list of jobIDs of jobs which this job triggers. 218 // The list is sorted and without duplicates. 219 TriggeredJobIDs []string `gae:",noindex"` 220 221 // DebugLog is short free form text log with debug messages. 222 DebugLog string `gae:",noindex"` 223 224 // RetryCount is 0 on a first attempt to launch the task. Increased with each 225 // retry. For informational purposes. 226 RetryCount int64 `gae:",noindex"` 227 228 // Status is current status of the invocation (e.g. "RUNNING"), see the enum. 229 Status task.Status 230 231 // ViewURL is optional URL to a human readable page with task status, e.g. 232 // Swarming task page. Populated by corresponding TaskManager. 233 ViewURL string `gae:",noindex"` 234 235 // TaskData is a storage where TaskManager can keep task-specific state 236 // between calls. 237 TaskData []byte `gae:",noindex"` 238 239 // MutationsCount is used for simple compare-and-swap transaction control. 240 // 241 // It is incremented on each change to the entity. 242 MutationsCount int64 `gae:",noindex"` 243 } 244 245 // isEqual returns true iff 'e' is equal to 'other' 246 func (e *Invocation) isEqual(other *Invocation) bool { 247 return e == other || (e.ID == other.ID && 248 e.MutationsCount == other.MutationsCount && // compare it first, it changes most often 249 e.JobID == other.JobID && 250 e.IndexedJobID == other.IndexedJobID && 251 e.Started.Equal(other.Started) && 252 e.Finished.Equal(other.Finished) && 253 e.TriggeredBy == other.TriggeredBy && 254 bytes.Equal(e.PropertiesRaw, other.PropertiesRaw) && 255 equalSortedLists(e.Tags, other.Tags) && 256 bytes.Equal(e.IncomingTriggersRaw, other.IncomingTriggersRaw) && 257 bytes.Equal(e.OutgoingTriggersRaw, other.OutgoingTriggersRaw) && 258 bytes.Equal(e.PendingTimersRaw, other.PendingTimersRaw) && 259 e.Revision == other.Revision && 260 e.RevisionURL == other.RevisionURL && 261 bytes.Equal(e.Task, other.Task) && 262 equalSortedLists(e.TriggeredJobIDs, other.TriggeredJobIDs) && 263 e.DebugLog == other.DebugLog && 264 e.RetryCount == other.RetryCount && 265 e.Status == other.Status && 266 e.ViewURL == other.ViewURL && 267 bytes.Equal(e.TaskData, other.TaskData)) 268 } 269 270 // GetProjectID parses the ProjectID from the JobID and returns it. 271 func (e *Invocation) GetProjectID() string { 272 parts := strings.Split(e.JobID, "/") 273 return parts[0] 274 } 275 276 // debugLog appends a line to DebugLog field. 277 func (e *Invocation) debugLog(c context.Context, format string, args ...any) { 278 debugLog(c, &e.DebugLog, format, args...) 279 } 280 281 // trimDebugLog makes sure DebugLog field doesn't exceed limits. 282 // 283 // It cuts the middle of the log. We need to do this to keep the entity small 284 // enough to fit the datastore limits. 285 func (e *Invocation) trimDebugLog() { 286 if len(e.DebugLog) <= debugLogSizeLimit { 287 return 288 } 289 290 const cutMsg = "--- the log has been cut here ---" 291 giveUp := func() { 292 e.DebugLog = e.DebugLog[:debugLogSizeLimit-len(cutMsg)-2] + "\n" + cutMsg + "\n" 293 } 294 295 // We take last debugLogTailLines lines of log and move them "up", so that 296 // the total log size is less than debugLogSizeLimit. We then put a line with 297 // the message that some log lines have been cut. If these operations are not 298 // possible (e.g. we have some giant lines or something), we give up and just 299 // cut the end of the log. 300 301 // Find debugLogTailLines-th "\n" from the end, e.DebugLog[tailStart:] is the 302 // log tail. 303 tailStart := len(e.DebugLog) 304 for i := 0; i < debugLogTailLines; i++ { 305 tailStart = strings.LastIndex(e.DebugLog[:tailStart-1], "\n") 306 if tailStart <= 0 { 307 giveUp() 308 return 309 } 310 } 311 tailStart++ 312 313 // Figure out how many bytes of head we can keep to make trimmed log small 314 // enough. 315 tailLen := len(e.DebugLog) - tailStart + len(cutMsg) + 1 316 headSize := debugLogSizeLimit - tailLen 317 if headSize <= 0 { 318 giveUp() 319 return 320 } 321 322 // Find last "\n" in the head. 323 headEnd := strings.LastIndex(e.DebugLog[:headSize], "\n") 324 if headEnd <= 0 { 325 giveUp() 326 return 327 } 328 329 // We want to keep 50 lines of the head no matter what. 330 headLines := strings.Count(e.DebugLog[:headEnd], "\n") 331 if headLines < 50 { 332 giveUp() 333 return 334 } 335 336 // Remove duplicated 'cutMsg' lines. They may appear if 'debugLog' (followed 337 // by 'trimDebugLog') is called on already trimmed log multiple times. 338 lines := strings.Split(e.DebugLog[:headEnd], "\n") 339 lines = append(lines, cutMsg) 340 lines = append(lines, strings.Split(e.DebugLog[tailStart:], "\n")...) 341 trimmed := make([]byte, 0, debugLogSizeLimit) 342 trimmed = append(trimmed, lines[0]...) 343 for i := 1; i < len(lines); i++ { 344 if !(lines[i-1] == cutMsg && lines[i] == cutMsg) { 345 trimmed = append(trimmed, '\n') 346 trimmed = append(trimmed, lines[i]...) 347 } 348 } 349 e.DebugLog = string(trimmed) 350 } 351 352 // IncomingTriggers is a list of triggers that the invocation consumed. 353 // 354 // It is deserialized on the fly from IncomingTriggersRaw. 355 func (e *Invocation) IncomingTriggers() ([]*internal.Trigger, error) { 356 return unmarshalTriggersList(e.IncomingTriggersRaw) 357 } 358 359 // OutgoingTriggers is a list of triggers that the invocation produced. 360 // 361 // It is deserialized on the fly from OutgoingTriggersRaw. 362 func (e *Invocation) OutgoingTriggers() ([]*internal.Trigger, error) { 363 return unmarshalTriggersList(e.OutgoingTriggersRaw) 364 } 365 366 // PendingTimers is a list of not-yet-consumed invocation timers. 367 // 368 // It is deserialized on the fly from PendingTimersRaw. 369 func (e *Invocation) PendingTimers() ([]*internal.Timer, error) { 370 return unmarshalTimersList(e.PendingTimersRaw) 371 } 372 373 // cleanupUnreferencedInvocations tries to delete given invocations. 374 // 375 // This is best effort cleanup after failures. It logs errors, but doesn't 376 // return them, to indicate that there's nothing we can actually do. 377 // 378 // 'invs' is allowed to have nils, they are skipped. Allowed to be called 379 // within a transaction, ignores it. 380 func cleanupUnreferencedInvocations(c context.Context, invs []*Invocation) { 381 keysToKill := make([]*datastore.Key, 0, len(invs)) 382 for _, inv := range invs { 383 if inv != nil { 384 logging.Warningf(c, "Cleaning up inv %d of job %q", inv.ID, inv.JobID) 385 keysToKill = append(keysToKill, datastore.KeyForObj(c, inv)) 386 } 387 } 388 if err := datastore.Delete(datastore.WithoutTransaction(c), keysToKill); err != nil { 389 logging.WithError(err).Warningf(c, "Invocation cleanup failed") 390 } 391 } 392 393 // reportCompletionMetrics reports invocation stats to monitoring. 394 // Should be called after transaction to save this invocation is completed. 395 func (e *Invocation) reportCompletionMetrics(c context.Context) { 396 if !e.Status.Final() || e.Finished.IsZero() { 397 panic(fmt.Errorf("reportCompletionMetrics on incomplete invocation: %v", e)) 398 } 399 duration := e.Finished.Sub(e.Started) 400 metricInvocationsDurations.Add(c, duration.Seconds(), e.JobID, string(e.Status)) 401 }