go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/cv/internal/prjmanager/cltriggerer/cltriggerer.go (about) 1 // Copyright 2023 The LUCI Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package cltriggerer 16 17 import ( 18 "context" 19 "fmt" 20 "sync/atomic" 21 "time" 22 23 "google.golang.org/protobuf/proto" 24 25 "go.chromium.org/luci/common/clock" 26 "go.chromium.org/luci/common/errors" 27 "go.chromium.org/luci/common/logging" 28 "go.chromium.org/luci/common/retry" 29 "go.chromium.org/luci/common/retry/transient" 30 "go.chromium.org/luci/common/sync/dispatcher" 31 "go.chromium.org/luci/common/sync/dispatcher/buffer" 32 "go.chromium.org/luci/server/tq" 33 34 "go.chromium.org/luci/cv/internal/changelist" 35 "go.chromium.org/luci/cv/internal/common" 36 "go.chromium.org/luci/cv/internal/common/lease" 37 "go.chromium.org/luci/cv/internal/gerrit" 38 "go.chromium.org/luci/cv/internal/metrics" 39 "go.chromium.org/luci/cv/internal/prjmanager" 40 "go.chromium.org/luci/cv/internal/prjmanager/prjpb" 41 "go.chromium.org/luci/cv/internal/run" 42 ) 43 44 const maxConcurrency = 16 45 46 // Triggerer triggers given CLs. 47 type Triggerer struct { 48 pmNotifier *prjmanager.Notifier 49 gFactory gerrit.Factory 50 clUpdater clUpdater 51 clMutator *changelist.Mutator 52 } 53 54 // clUpdater is a subset of the *changelist.Updater which Triggerer needs. 55 type clUpdater interface { 56 Schedule(context.Context, *changelist.UpdateCLTask) error 57 } 58 59 // New creates a Triggerer. 60 func New(n *prjmanager.Notifier, gf gerrit.Factory, clu clUpdater, clm *changelist.Mutator) *Triggerer { 61 v := &Triggerer{ 62 pmNotifier: n, 63 gFactory: gf, 64 clUpdater: clu, 65 clMutator: clm, 66 } 67 n.TasksBinding.TriggerProjectCLDeps.AttachHandler( 68 func(ctx context.Context, payload proto.Message) error { 69 task := payload.(*prjpb.TriggeringCLDepsTask) 70 ctx = logging.SetField(ctx, "project", task.GetLuciProject()) 71 return common.TQifyError(ctx, 72 errors.Annotate(v.process(ctx, task), "triggerer.process").Err()) 73 }, 74 ) 75 return v 76 } 77 78 // Schedule schedules a task for CQVoteTask. 79 func (tr *Triggerer) Schedule(ctx context.Context, t *prjpb.TriggeringCLDepsTask) error { 80 payload := t.GetTriggeringClDeps() 81 if len(payload.GetDepClids()) == 0 { 82 return nil 83 } 84 return tr.pmNotifier.TasksBinding.TQDispatcher.AddTask(ctx, &tq.Task{ 85 Payload: t, 86 Title: fmt.Sprintf("%s/%s/%d-%d", 87 t.GetLuciProject(), payload.GetOperationId(), 88 payload.GetOriginClid(), len(payload.GetDepClids())), 89 // Not allowed in a transaction 90 DeduplicationKey: "", 91 }) 92 } 93 94 func (tr *Triggerer) makeDispatcherChannel(ctx context.Context, task *prjpb.TriggeringCLDepsTask) dispatcher.Channel { 95 concurrency := min(len(task.GetTriggeringClDeps().GetDepClids()), maxConcurrency) 96 prj := task.GetLuciProject() 97 dc, err := dispatcher.NewChannel(ctx, &dispatcher.Options{ 98 ErrorFn: func(failedBatch *buffer.Batch, err error) (retry bool) { 99 _, isLeaseErr := lease.IsAlreadyInLeaseErr(err) 100 return isLeaseErr || transient.Tag.In(err) 101 }, 102 DropFn: dispatcher.DropFnQuiet, 103 Buffer: buffer.Options{ 104 MaxLeases: concurrency, 105 BatchItemsMax: 1, 106 FullBehavior: &buffer.BlockNewItems{ 107 MaxItems: concurrency, 108 }, 109 Retry: makeRetryFactory(), 110 }, 111 }, func(data *buffer.Batch) error { 112 op, ok := data.Data[0].Item.(*triggerDepOp) 113 if !ok { 114 panic(fmt.Errorf("unexpected batch data item type %T", data.Data[0].Item)) 115 } 116 ctx := logging.SetFields(ctx, logging.Fields{"cl": op.depCLID}) 117 return op.execute(ctx, tr.gFactory, prj, tr.clMutator, tr.clUpdater) 118 }) 119 if err != nil { 120 panic(fmt.Errorf("cltriggerer: unexpected failure in dispatcher creation")) 121 } 122 return dc 123 } 124 125 func (tr *Triggerer) process(ctx context.Context, task *prjpb.TriggeringCLDepsTask) (err error) { 126 var isCanceled atomic.Bool 127 payload := task.GetTriggeringClDeps() 128 evt := &prjpb.TriggeringCLDepsCompleted{ 129 OperationId: payload.GetOperationId(), 130 Origin: payload.GetOriginClid(), 131 } 132 ctx = logging.SetField(ctx, "origin_cl", payload.GetOriginClid()) 133 startTS := clock.Now(ctx) 134 defer func() { 135 if err == nil { 136 reportMetrics(ctx, task, evt, isCanceled.Load(), startTS) 137 } 138 }() 139 140 taskCtx, cancel := clock.WithDeadline(ctx, payload.GetDeadline().AsTime()) 141 defer cancel() 142 originCL := &changelist.CL{ID: common.CLID(payload.GetOriginClid())} 143 switch err := changelist.LoadCLs(taskCtx, []*changelist.CL{originCL}); errors.Unwrap(err) { 144 case nil: 145 case context.Canceled, context.DeadlineExceeded: 146 // ctx instead of taskCtx. 147 evt.Incompleted = append(evt.Incompleted, payload.GetDepClids()...) 148 return tr.pmNotifier.NotifyTriggeringCLDepsCompleted(ctx, task.GetLuciProject(), evt) 149 default: 150 // always return a transient to retry fetching the originating CL 151 // until the deadline exceeds. 152 return transient.Tag.Apply(err) 153 } 154 155 // trigger votes in parallel while constantly checking the vote status 156 // of the originating CL. 157 ops := makeTriggerDepOps(originCL.ExternalID.MustURL(), payload, &isCanceled) 158 if ensureOriginCLVote(taskCtx, originCL) { 159 go checkVoteStatus(taskCtx, payload.GetOriginClid(), &isCanceled) 160 dc := tr.makeDispatcherChannel(taskCtx, task) 161 for _, item := range ops { 162 dc.C <- item 163 } 164 dc.Close() 165 <-dc.DrainC 166 } else { 167 // no need, but just for the sake. 168 isCanceled.Store(true) 169 } 170 171 for _, op := range ops { 172 switch { 173 case op.isSucceeded(): 174 // It's possible that the origin CQ vote no longer exists. 175 // If so, OnTriggeringCLDepsCompleted() will check the origin vote 176 // status, and schedule PurgingCLTask for the successfully voted 177 // deps. 178 evt.Succeeded = append(evt.Succeeded, op.depCLID) 179 case op.isPermanentlyFailed(): 180 evt.Failed = append(evt.Failed, op.getCLError()) 181 default: 182 evt.Incompleted = append(evt.Incompleted, op.depCLID) 183 } 184 } 185 // ctx instead of taskCtx to send a notification even if the deadline 186 // exceeds. 187 return tr.pmNotifier.NotifyTriggeringCLDepsCompleted(ctx, task.GetLuciProject(), evt) 188 } 189 190 func makeRetryFactory() retry.Factory { 191 return transient.Only(func() retry.Iterator { 192 return &retry.ExponentialBackoff{ 193 Limited: retry.Limited{ 194 Delay: 100 * time.Millisecond, 195 Retries: -1, // unlimited 196 }, 197 Multiplier: 2, 198 MaxDelay: 30 * time.Second, 199 } 200 }) 201 } 202 203 func ensureOriginCLVote(ctx context.Context, originCL *changelist.CL) bool { 204 switch mode := findCQTriggerMode(originCL); mode { 205 case string(run.FullRun): 206 return true 207 case "": 208 logging.Infof(ctx, "the origin CL %d no longer has CQ vote; stop voting", originCL.ID) 209 return false 210 default: 211 // The originating CL now has CQ+1. This can only happen in the 212 // following scenario. 213 // - at t1, the origin CL gets CQ+2 and TriggeringCLDepsTask is created. 214 // - at t2, the origin CL gets CQ+1, while or before the task process. 215 // 216 // This should be considered as cancelling the CQ vote chain 217 // process. It's OK to skip all the vote ops for the dep CLs. 218 // Then, PM will retriage the originating CL, as necessary. 219 logging.Infof(ctx, "the origin CL %d now has a CQ vote for %q; stop voting", mode) 220 return false 221 } 222 } 223 224 func checkVoteStatus(ctx context.Context, originCLID int64, isCanceled *atomic.Bool) { 225 originCL := &changelist.CL{ID: common.CLID(originCLID)} 226 for { 227 select { 228 case <-ctx.Done(): 229 return 230 case tr := <-clock.After(ctx, 4*time.Second): 231 if tr.Err != nil { 232 return 233 } 234 } 235 if err := changelist.LoadCLs(ctx, []*changelist.CL{originCL}); err == nil { 236 isCanceled.Store(ensureOriginCLVote(ctx, originCL)) 237 } 238 } 239 } 240 241 func taskMetricStatus(isCanceled bool, evt *prjpb.TriggeringCLDepsCompleted) string { 242 switch { 243 case isCanceled: 244 return "CANCELED" 245 case len(evt.GetFailed()) > 0: 246 return "FAILED" 247 case len(evt.GetIncompleted()) > 0: 248 // if isCancelled == false, len(Incompleted) > 0 can happen only if 249 // the context expires. 250 return "TIMEDOUT" 251 default: 252 return "SUCCEEDED" 253 } 254 } 255 256 func reportMetrics(ctx context.Context, task *prjpb.TriggeringCLDepsTask, evt *prjpb.TriggeringCLDepsCompleted, isCanceled bool, startTS time.Time) { 257 payload := task.GetTriggeringClDeps() 258 status := taskMetricStatus(isCanceled, evt) 259 metrics.Internal.CLTriggererTaskCompleted.Add( 260 ctx, 261 1, 262 task.GetLuciProject(), 263 payload.GetConfigGroupName(), 264 len(payload.GetDepClids()), 265 status, 266 ) 267 metrics.Internal.CLTriggererTaskDuration.Add( 268 ctx, 269 float64(clock.Since(ctx, startTS).Milliseconds()), 270 task.GetLuciProject(), 271 payload.GetConfigGroupName(), 272 len(payload.GetDepClids()), 273 status, 274 ) 275 }