go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/cv/internal/tryjob/tjcancel/cancellator.go (about) 1 // Copyright 2022 The LUCI Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 // Package tjcancel contains code in charge of cancelling stale tryjobs. 16 // 17 // Cancellator responds to tasks scheduled when a new patch is uploaded, 18 // looking for and cancelling stale tryjobs. 19 package tjcancel 20 21 import ( 22 "context" 23 "fmt" 24 "strconv" 25 "strings" 26 "sync" 27 "time" 28 29 "google.golang.org/protobuf/proto" 30 31 "go.chromium.org/luci/common/clock" 32 "go.chromium.org/luci/common/data/stringset" 33 "go.chromium.org/luci/common/errors" 34 "go.chromium.org/luci/common/logging" 35 "go.chromium.org/luci/common/retry/transient" 36 "go.chromium.org/luci/common/sync/parallel" 37 "go.chromium.org/luci/gae/service/datastore" 38 39 "go.chromium.org/luci/cv/internal/changelist" 40 "go.chromium.org/luci/cv/internal/common" 41 "go.chromium.org/luci/cv/internal/run" 42 "go.chromium.org/luci/cv/internal/tryjob" 43 ) 44 45 // Cancellator is patterned after Updater to support multiple tryjob backends. 46 type Cancellator struct { 47 tn *tryjob.Notifier 48 49 // guards backends map. 50 rwmutex sync.RWMutex 51 backends map[string]cancellatorBackend 52 } 53 54 func NewCancellator(tn *tryjob.Notifier) *Cancellator { 55 c := &Cancellator{ 56 tn: tn, 57 backends: make(map[string]cancellatorBackend), 58 } 59 tn.Bindings.CancelStale.AttachHandler(func(ctx context.Context, payload proto.Message) error { 60 task := payload.(*tryjob.CancelStaleTryjobsTask) 61 ctx = logging.SetField(ctx, "CLID", task.GetClid()) 62 return common.TQifyError(ctx, c.handleTask(ctx, task)) 63 }) 64 return c 65 } 66 67 // RegisterBackend registers a backend. 68 // 69 // Panics if backend for the same kind is already registered. 70 func (c *Cancellator) RegisterBackend(b cancellatorBackend) { 71 kind := b.Kind() 72 if strings.ContainsRune(kind, '/') { 73 panic(fmt.Errorf("backend %T of kind %q must not contain '/'", b, kind)) 74 } 75 c.rwmutex.Lock() 76 defer c.rwmutex.Unlock() 77 if _, exists := c.backends[kind]; exists { 78 panic(fmt.Errorf("backend %q is already registered", kind)) 79 } 80 c.backends[kind] = b 81 } 82 83 func (c *Cancellator) handleTask(ctx context.Context, task *tryjob.CancelStaleTryjobsTask) error { 84 if task.PreviousMinEquivPatchset >= task.CurrentMinEquivPatchset { 85 panic(fmt.Errorf("patchset numbers expected to increase monotonically")) 86 } 87 cl := &changelist.CL{ID: common.CLID(task.GetClid())} 88 if err := datastore.Get(ctx, cl); err != nil { 89 return errors.Annotate(err, "failed to load CL %d", cl.ID).Tag(transient.Tag).Err() 90 } 91 preserveTryjob := false 92 for _, metadata := range cl.Snapshot.GetMetadata() { 93 if metadata.Key == common.FooterCQDoNotCancelTryjobs && strings.ToLower(strings.TrimSpace(metadata.Value)) == "true" { 94 preserveTryjob = true 95 } 96 } 97 if preserveTryjob { 98 logging.Infof(ctx, "skipping cancelling Tryjob as the latest CL has specified %s footer", common.FooterCQDoNotCancelTryjobs) 99 return nil 100 } 101 102 candidates, err := c.fetchCandidates(ctx, cl.ID, task.GetPreviousMinEquivPatchset(), task.GetCurrentMinEquivPatchset()) 103 switch { 104 case err != nil: 105 return err 106 case len(candidates) == 0: 107 logging.Infof(ctx, "no stale Tryjobs to cancel") 108 return nil 109 default: 110 tryjobIDs := make([]string, len(candidates)) 111 for i, tj := range candidates { 112 tryjobIDs[i] = strconv.Itoa(int(tj.ID)) 113 } 114 logging.Infof(ctx, "found stale Tryjobs to cancel: [%s]", strings.Join(tryjobIDs, ", ")) 115 return c.cancelTryjobs(ctx, candidates) 116 } 117 } 118 119 const cancelLaterDuration = 10 * time.Second 120 121 func (c *Cancellator) fetchCandidates(ctx context.Context, clid common.CLID, prevMinEquiPS, curMinEquiPS int32) ([]*tryjob.Tryjob, error) { 122 q := datastore.NewQuery(tryjob.TryjobKind). 123 Gte("CLPatchsets", tryjob.MakeCLPatchset(clid, prevMinEquiPS)). 124 Lt("CLPatchsets", tryjob.MakeCLPatchset(clid, curMinEquiPS)) 125 var candidates []*tryjob.Tryjob 126 err := datastore.Run(ctx, q, func(tj *tryjob.Tryjob) error { 127 switch { 128 case tj.ExternalID == "": 129 // Most likely Tryjob hasn't been triggered in the backend yet. 130 case tj.IsEnded(): 131 case tj.LaunchedBy == "": 132 // Not launched by LUCI CV, may be through `git cl try` command line. 133 case tj.Definition.GetSkipStaleCheck(): 134 default: 135 candidates = append(candidates, tj) 136 } 137 return nil 138 }) 139 switch { 140 case err != nil: 141 return nil, errors.Annotate(err, "failed to run the query to fetch candidate tryjobs for cancellation").Tag(transient.Tag).Err() 142 case len(candidates) == 0: 143 return nil, nil 144 } 145 146 hasAllWatchingRunsEndedFn, err := makeHasAllWatchingRunEndedFn(ctx, candidates) 147 if err != nil { 148 return nil, err 149 } 150 var ret = candidates[:0] // reuse the same slice 151 var cancelLaterScheduled bool 152 for _, candidate := range candidates { 153 switch nonEndedRuns := hasAllWatchingRunsEndedFn(candidate); { 154 case len(nonEndedRuns) > 0 && !cancelLaterScheduled: 155 eta := clock.Now(ctx).UTC().Add(cancelLaterDuration) 156 if err := c.tn.ScheduleCancelStale(ctx, clid, prevMinEquiPS, curMinEquiPS, eta); err != nil { 157 return nil, err 158 } 159 cancelLaterScheduled = true 160 fallthrough 161 case len(nonEndedRuns) > 0: 162 logging.Warningf(ctx, "tryjob %d is still watched by non ended runs %s. This is likely a race condition and those runs will end soon. Will retry cancellation after %s.", candidate.ID, nonEndedRuns, cancelLaterDuration) 163 default: 164 ret = append(ret, candidate) 165 } 166 } 167 return ret, nil 168 169 } 170 171 func makeHasAllWatchingRunEndedFn(ctx context.Context, tryjobs []*tryjob.Tryjob) (func(*tryjob.Tryjob) (nonEnded common.RunIDs), error) { 172 runIDSet := stringset.New(1) // typically only one run. 173 for _, tj := range tryjobs { 174 for _, rid := range tj.AllWatchingRuns() { 175 runIDSet.Add(string(rid)) 176 } 177 } 178 runs, errs := run.LoadRunsFromIDs(common.MakeRunIDs(runIDSet.ToSlice()...)...).Do(ctx) 179 endedRunIDs := make(map[common.RunID]struct{}, len(runs)) 180 for i, r := range runs { 181 switch err := errs[i]; { 182 case err == datastore.ErrNoSuchEntity: 183 return nil, errors.Reason("Tryjob is associated with a non-existent Run %s", r.ID).Err() 184 case err != nil: 185 return nil, errors.Annotate(err, "failed to load run %s", r.ID).Tag(transient.Tag).Err() 186 case run.IsEnded(r.Status): 187 endedRunIDs[r.ID] = struct{}{} 188 } 189 } 190 return func(tj *tryjob.Tryjob) common.RunIDs { 191 var nonEnded common.RunIDs 192 for _, rid := range tj.AllWatchingRuns() { 193 if _, ended := endedRunIDs[rid]; !ended { 194 nonEnded = append(nonEnded, rid) 195 } 196 } 197 return nonEnded 198 }, nil 199 } 200 201 const reason = "LUCI CV no longer needs this Tryjob" 202 203 func (c *Cancellator) cancelTryjobs(ctx context.Context, tjs []*tryjob.Tryjob) error { 204 if len(tjs) == 0 { 205 return nil 206 } 207 errs := parallel.WorkPool(min(8, len(tjs)), func(work chan<- func() error) { 208 for _, tj := range tjs { 209 tj := tj 210 work <- func() error { 211 be, err := c.backendFor(tj) 212 if err != nil { 213 return err 214 } 215 // TODO(crbug/1308930): use Buildbucket's batch API to reduce 216 // number of RPCs. 217 err = be.CancelTryjob(ctx, tj, reason) 218 if err != nil { 219 return errors.Annotate(err, "failed to cancel Tryjob [id=%d, eid=%s]", tj.ID, tj.ExternalID).Err() 220 } 221 return datastore.RunInTransaction(ctx, func(ctx context.Context) error { 222 if err := datastore.Get(ctx, tj); err != nil { 223 return errors.Annotate(err, "failed to load Tryjob %d", tj.ID).Tag(transient.Tag).Err() 224 } 225 if tj.IsEnded() { 226 return nil 227 } 228 tj.Status = tryjob.Status_CANCELLED 229 tj.EVersion++ 230 tj.EntityUpdateTime = datastore.RoundTime(clock.Now(ctx).UTC()) 231 if err := datastore.Put(ctx, tj); err != nil { 232 return errors.Annotate(err, "failed to save Tryjob %d", tj.ID).Tag(transient.Tag).Err() 233 } 234 return nil 235 }, nil) 236 } 237 } 238 }) 239 return common.MostSevereError(errs) 240 } 241 242 // cancellatorBackend is implemented by tryjobs backends, e.g. buildbucket. 243 type cancellatorBackend interface { 244 // Kind identifies the backend 245 // 246 // It's also the first part of the Tryjob's ExternalID, e.g. "buildbucket". 247 // Must not contain a slash. 248 Kind() string 249 // CancelTryjob should cancel the tryjob given. 250 // 251 // MUST not modify the given Tryjob object. 252 // If the tryjob was already cancelled, it should not return an error. 253 CancelTryjob(ctx context.Context, tj *tryjob.Tryjob, reason string) error 254 } 255 256 func (c *Cancellator) backendFor(t *tryjob.Tryjob) (cancellatorBackend, error) { 257 kind, err := t.ExternalID.Kind() 258 if err != nil { 259 return nil, err 260 } 261 c.rwmutex.RLock() 262 defer c.rwmutex.RUnlock() 263 if b, exists := c.backends[kind]; exists { 264 return b, nil 265 } 266 return nil, errors.Reason("%q backend is not supported", kind).Err() 267 }