go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/cv/internal/run/impl/longops/reset_triggers.go (about) 1 // Copyright 2021 The LUCI Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package longops 16 17 import ( 18 "context" 19 "fmt" 20 "time" 21 22 "golang.org/x/sync/errgroup" 23 "google.golang.org/genproto/googleapis/rpc/code" 24 "google.golang.org/protobuf/types/known/timestamppb" 25 26 "go.chromium.org/luci/common/clock" 27 "go.chromium.org/luci/common/errors" 28 "go.chromium.org/luci/common/logging" 29 "go.chromium.org/luci/common/retry" 30 "go.chromium.org/luci/common/retry/transient" 31 "go.chromium.org/luci/common/sync/dispatcher" 32 "go.chromium.org/luci/common/sync/dispatcher/buffer" 33 34 "go.chromium.org/luci/cv/internal/changelist" 35 "go.chromium.org/luci/cv/internal/common" 36 "go.chromium.org/luci/cv/internal/common/lease" 37 "go.chromium.org/luci/cv/internal/configs/prjcfg" 38 "go.chromium.org/luci/cv/internal/gerrit" 39 "go.chromium.org/luci/cv/internal/gerrit/trigger" 40 "go.chromium.org/luci/cv/internal/metrics" 41 "go.chromium.org/luci/cv/internal/run" 42 "go.chromium.org/luci/cv/internal/run/eventpb" 43 ) 44 45 // ResetTriggersOp resets the triggers for the provided CLs. 46 // 47 // ResetTriggersOp keeps retrying on lease error and transient failure for each 48 // CL till the long op deadline is exceeded or reset either succeeds 49 // or fails non-transiently. 50 // 51 // ResetTriggersOp doesn't obey longop's cancellation request because if 52 // this long op is left half-done, for example, triggers on half of the CLs are 53 // untouched, a new Run may be created for those CLs. 54 // 55 // ResetTriggersOp is a single-use object. 56 type ResetTriggersOp struct { 57 *Base 58 GFactory gerrit.Factory 59 CLMutator *changelist.Mutator 60 // Concurrency is the number of CLs that will be reset concurrently. 61 // 62 // Default is 8. 63 Concurrency int 64 65 // Private fields that will be populated internally during long op execution. 66 67 inputs []trigger.ResetInput 68 results []resetResult 69 70 // testAfterTryResetFn is always called after each try to reset the trigger 71 // of a CL. 72 // 73 // This is only set for testing purpose. 74 testAfterTryResetFn func() 75 } 76 77 const defaultConcurrency = 8 78 79 // Do actually resets the triggers. 80 func (op *ResetTriggersOp) Do(ctx context.Context) (*eventpb.LongOpCompleted, error) { 81 op.assertCalledOnce() 82 83 if err := op.loadInputs(ctx); err != nil { 84 return nil, err 85 } 86 87 op.executeInParallel(ctx) 88 89 longOpStatus := eventpb.LongOpCompleted_SUCCEEDED // be optimistic 90 rt := &eventpb.LongOpCompleted_ResetTriggers{ 91 Results: make([]*eventpb.LongOpCompleted_ResetTriggers_Result, len(op.results)), 92 } 93 var lastTransErr, lastPermErr error 94 for i, result := range op.results { 95 cl := op.inputs[i].CL 96 rt.Results[i] = &eventpb.LongOpCompleted_ResetTriggers_Result{ 97 Id: int64(cl.ID), 98 ExternalId: string(cl.ExternalID), 99 } 100 switch err := result.err; { 101 case err == nil: 102 rt.Results[i].Detail = &eventpb.LongOpCompleted_ResetTriggers_Result_SuccessInfo{ 103 SuccessInfo: &eventpb.LongOpCompleted_ResetTriggers_Result_Success{ 104 ResetAt: timestamppb.New(result.resetAt), 105 }, 106 } 107 default: 108 longOpStatus = eventpb.LongOpCompleted_FAILED 109 rt.Results[i].Detail = &eventpb.LongOpCompleted_ResetTriggers_Result_FailureInfo{ 110 FailureInfo: &eventpb.LongOpCompleted_ResetTriggers_Result_Failure{ 111 FailureMessage: err.Error(), 112 }, 113 } 114 logging.Errorf(ctx, "failed to reset the trigger of CL %d %q: %s", cl.ID, cl.ExternalID, err) 115 if transient.Tag.In(err) { 116 lastTransErr = err 117 } else { 118 lastPermErr = err 119 } 120 } 121 } 122 ret := &eventpb.LongOpCompleted{ 123 Status: longOpStatus, 124 Result: &eventpb.LongOpCompleted_ResetTriggers_{ 125 ResetTriggers: rt, 126 }, 127 } 128 switch ctxErr := ctx.Err(); { 129 // Returns the event in error case as well because the event will be 130 // reported back to Run Manager. 131 case ctxErr == context.DeadlineExceeded: 132 logging.Errorf(ctx, "running out of time to reset triggers") 133 return ret, ctxErr 134 case ctxErr == context.Canceled: 135 logging.Errorf(ctx, "context is cancelled while resetting triggers") 136 return ret, ctxErr 137 case ctxErr != nil: 138 panic(fmt.Errorf("unexpected context error: %s", ctxErr)) 139 case lastPermErr != nil: 140 return ret, lastPermErr 141 case lastTransErr != nil: 142 // Don't return a transient error to prevent long op from retrying. 143 // The transient error should have been retried many times in this long op. 144 return ret, transient.Tag.Off().Apply(lastTransErr) 145 default: 146 return ret, nil 147 } 148 } 149 150 func (op *ResetTriggersOp) loadInputs(ctx context.Context) error { 151 var ( 152 clsToReset []*changelist.CL 153 triggers map[common.CLID]*run.Triggers 154 cfg *prjcfg.ConfigGroup 155 ) 156 eg, ctx := errgroup.WithContext(ctx) 157 requests := op.Op.GetResetTriggers().GetRequests() 158 eg.Go(func() (err error) { 159 clids := make(common.CLIDs, len(requests)) 160 for i, req := range requests { 161 clids[i] = common.CLID(req.Clid) 162 } 163 clsToReset, err = changelist.LoadCLsByIDs(ctx, clids) 164 return err 165 }) 166 eg.Go(func() error { 167 runCLs, err := run.LoadRunCLs(ctx, op.Run.ID, op.Run.CLs) 168 if err != nil { 169 return err 170 } 171 triggers = make(map[common.CLID]*run.Triggers, len(runCLs)) 172 for _, runCL := range runCLs { 173 triggers[runCL.ID] = triggers[runCL.ID].WithTrigger(runCL.Trigger) 174 } 175 return nil 176 }) 177 eg.Go(func() (err error) { 178 cfg, err = prjcfg.GetConfigGroup(ctx, op.Run.ID.LUCIProject(), op.Run.ConfigGroupID) 179 return err 180 }) 181 if err := eg.Wait(); err != nil { 182 return err 183 } 184 185 op.inputs = make([]trigger.ResetInput, len(requests)) 186 op.results = make([]resetResult, len(requests)) 187 luciProject := op.Run.ID.LUCIProject() 188 for i := range requests { 189 cl, req := clsToReset[i], requests[i] 190 op.inputs[i] = trigger.ResetInput{ 191 CL: cl, 192 Triggers: triggers[cl.ID], 193 LUCIProject: luciProject, 194 Message: req.Message, 195 Requester: "Trigger Reset", 196 Notify: req.Notify, 197 LeaseDuration: time.Minute, 198 ConfigGroups: []*prjcfg.ConfigGroup{cfg}, 199 AddToAttentionSet: req.AddToAttention, 200 AttentionReason: req.AddToAttentionReason, 201 GFactory: op.GFactory, 202 CLMutator: op.CLMutator, 203 } 204 op.results[i] = resetResult{ 205 err: errNotAttemptedYet, 206 } 207 } 208 return nil 209 } 210 211 type resetItem struct { 212 index int 213 input trigger.ResetInput 214 } 215 type resetResult struct { 216 resetAt time.Time 217 err error 218 } 219 220 // errNotAttemptedYet is the initial error set in resetResult. 221 var errNotAttemptedYet = errors.New("not attempted reset yet") 222 223 // executeInParallel resets the triggers of the provided CLs in parallel 224 // and keeps retrying on transient or alreadyInLease failure until the context 225 // is done. 226 func (op *ResetTriggersOp) executeInParallel(ctx context.Context) { 227 dc := op.makeDispatcherChannel(ctx) 228 for i, input := range op.inputs { 229 dc.C <- resetItem{index: i, input: input} 230 } 231 dc.Close() 232 <-dc.DrainC 233 } 234 235 func (op *ResetTriggersOp) makeDispatcherChannel(ctx context.Context) dispatcher.Channel { 236 concurrency := op.Concurrency 237 if concurrency == 0 { 238 concurrency = defaultConcurrency 239 } 240 concurrency = min(concurrency, len(op.inputs)) 241 dc, err := dispatcher.NewChannel(ctx, &dispatcher.Options{ 242 ErrorFn: func(failedBatch *buffer.Batch, err error) (retry bool) { 243 _, isLeaseErr := lease.IsAlreadyInLeaseErr(err) 244 return isLeaseErr || transient.Tag.In(err) 245 }, 246 DropFn: dispatcher.DropFnQuiet, 247 Buffer: buffer.Options{ 248 MaxLeases: concurrency, 249 BatchItemsMax: 1, 250 FullBehavior: &buffer.BlockNewItems{ 251 MaxItems: len(op.results), 252 }, 253 Retry: op.makeRetryFactory(), 254 }, 255 }, func(data *buffer.Batch) error { 256 ci, ok := data.Data[0].Item.(resetItem) 257 if !ok { 258 panic(fmt.Errorf("unexpected batch data item %s", data.Data[0].Item)) 259 } 260 result := &op.results[ci.index] 261 result.err = trigger.Reset(ctx, ci.input) 262 gerritErr := "GERRIT_ERROR_NONE" 263 if errCode, ok := trigger.IsResetErrFromGerrit(result.err); ok { 264 if codeString, ok := code.Code_name[int32(errCode)]; ok { 265 gerritErr = codeString 266 } else { 267 gerritErr = fmt.Sprintf("Code(%d)", int64(errCode)) 268 } 269 } 270 metrics.Internal.RunResetTriggerAttempted.Add(ctx, 1, op.Run.ID.LUCIProject(), op.Run.ConfigGroupID.Name(), string(op.Run.Mode), result.err == nil, gerritErr) 271 if result.err == nil { 272 result.resetAt = clock.Now(ctx) 273 } 274 if op.testAfterTryResetFn != nil { 275 op.testAfterTryResetFn() 276 } 277 return result.err 278 }) 279 if err != nil { 280 panic(fmt.Errorf("unexpected failure when creating dispatcher channel: %s", err)) 281 } 282 return dc 283 } 284 285 func (op *ResetTriggersOp) makeRetryFactory() retry.Factory { 286 return lease.RetryIfLeased(transient.Only(func() retry.Iterator { 287 return &retry.ExponentialBackoff{ 288 Limited: retry.Limited{ 289 Delay: 100 * time.Millisecond, 290 Retries: -1, // unlimited 291 }, 292 Multiplier: 2, 293 MaxDelay: 1 * time.Minute, 294 } 295 })) 296 }