go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/cv/internal/run/impl/longops/reset_triggers.go (about)

     1  // Copyright 2021 The LUCI Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //      http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package longops
    16  
    17  import (
    18  	"context"
    19  	"fmt"
    20  	"time"
    21  
    22  	"golang.org/x/sync/errgroup"
    23  	"google.golang.org/genproto/googleapis/rpc/code"
    24  	"google.golang.org/protobuf/types/known/timestamppb"
    25  
    26  	"go.chromium.org/luci/common/clock"
    27  	"go.chromium.org/luci/common/errors"
    28  	"go.chromium.org/luci/common/logging"
    29  	"go.chromium.org/luci/common/retry"
    30  	"go.chromium.org/luci/common/retry/transient"
    31  	"go.chromium.org/luci/common/sync/dispatcher"
    32  	"go.chromium.org/luci/common/sync/dispatcher/buffer"
    33  
    34  	"go.chromium.org/luci/cv/internal/changelist"
    35  	"go.chromium.org/luci/cv/internal/common"
    36  	"go.chromium.org/luci/cv/internal/common/lease"
    37  	"go.chromium.org/luci/cv/internal/configs/prjcfg"
    38  	"go.chromium.org/luci/cv/internal/gerrit"
    39  	"go.chromium.org/luci/cv/internal/gerrit/trigger"
    40  	"go.chromium.org/luci/cv/internal/metrics"
    41  	"go.chromium.org/luci/cv/internal/run"
    42  	"go.chromium.org/luci/cv/internal/run/eventpb"
    43  )
    44  
    45  // ResetTriggersOp resets the triggers for the provided CLs.
    46  //
    47  // ResetTriggersOp keeps retrying on lease error and transient failure for each
    48  // CL till the long op deadline is exceeded or reset either succeeds
    49  // or fails non-transiently.
    50  //
    51  // ResetTriggersOp doesn't obey longop's cancellation request because if
    52  // this long op is left half-done, for example, triggers on half of the CLs are
    53  // untouched, a new Run may be created for those CLs.
    54  //
    55  // ResetTriggersOp is a single-use object.
    56  type ResetTriggersOp struct {
    57  	*Base
    58  	GFactory  gerrit.Factory
    59  	CLMutator *changelist.Mutator
    60  	// Concurrency is the number of CLs that will be reset concurrently.
    61  	//
    62  	// Default is 8.
    63  	Concurrency int
    64  
    65  	// Private fields that will be populated internally during long op execution.
    66  
    67  	inputs  []trigger.ResetInput
    68  	results []resetResult
    69  
    70  	// testAfterTryResetFn is always called after each try to reset the trigger
    71  	// of a CL.
    72  	//
    73  	// This is only set for testing purpose.
    74  	testAfterTryResetFn func()
    75  }
    76  
    77  const defaultConcurrency = 8
    78  
    79  // Do actually resets the triggers.
    80  func (op *ResetTriggersOp) Do(ctx context.Context) (*eventpb.LongOpCompleted, error) {
    81  	op.assertCalledOnce()
    82  
    83  	if err := op.loadInputs(ctx); err != nil {
    84  		return nil, err
    85  	}
    86  
    87  	op.executeInParallel(ctx)
    88  
    89  	longOpStatus := eventpb.LongOpCompleted_SUCCEEDED // be optimistic
    90  	rt := &eventpb.LongOpCompleted_ResetTriggers{
    91  		Results: make([]*eventpb.LongOpCompleted_ResetTriggers_Result, len(op.results)),
    92  	}
    93  	var lastTransErr, lastPermErr error
    94  	for i, result := range op.results {
    95  		cl := op.inputs[i].CL
    96  		rt.Results[i] = &eventpb.LongOpCompleted_ResetTriggers_Result{
    97  			Id:         int64(cl.ID),
    98  			ExternalId: string(cl.ExternalID),
    99  		}
   100  		switch err := result.err; {
   101  		case err == nil:
   102  			rt.Results[i].Detail = &eventpb.LongOpCompleted_ResetTriggers_Result_SuccessInfo{
   103  				SuccessInfo: &eventpb.LongOpCompleted_ResetTriggers_Result_Success{
   104  					ResetAt: timestamppb.New(result.resetAt),
   105  				},
   106  			}
   107  		default:
   108  			longOpStatus = eventpb.LongOpCompleted_FAILED
   109  			rt.Results[i].Detail = &eventpb.LongOpCompleted_ResetTriggers_Result_FailureInfo{
   110  				FailureInfo: &eventpb.LongOpCompleted_ResetTriggers_Result_Failure{
   111  					FailureMessage: err.Error(),
   112  				},
   113  			}
   114  			logging.Errorf(ctx, "failed to reset the trigger of CL %d %q: %s", cl.ID, cl.ExternalID, err)
   115  			if transient.Tag.In(err) {
   116  				lastTransErr = err
   117  			} else {
   118  				lastPermErr = err
   119  			}
   120  		}
   121  	}
   122  	ret := &eventpb.LongOpCompleted{
   123  		Status: longOpStatus,
   124  		Result: &eventpb.LongOpCompleted_ResetTriggers_{
   125  			ResetTriggers: rt,
   126  		},
   127  	}
   128  	switch ctxErr := ctx.Err(); {
   129  	// Returns the event in error case as well because the event will be
   130  	// reported back to Run Manager.
   131  	case ctxErr == context.DeadlineExceeded:
   132  		logging.Errorf(ctx, "running out of time to reset triggers")
   133  		return ret, ctxErr
   134  	case ctxErr == context.Canceled:
   135  		logging.Errorf(ctx, "context is cancelled while resetting triggers")
   136  		return ret, ctxErr
   137  	case ctxErr != nil:
   138  		panic(fmt.Errorf("unexpected context error: %s", ctxErr))
   139  	case lastPermErr != nil:
   140  		return ret, lastPermErr
   141  	case lastTransErr != nil:
   142  		// Don't return a transient error to prevent long op from retrying.
   143  		// The transient error should have been retried many times in this long op.
   144  		return ret, transient.Tag.Off().Apply(lastTransErr)
   145  	default:
   146  		return ret, nil
   147  	}
   148  }
   149  
   150  func (op *ResetTriggersOp) loadInputs(ctx context.Context) error {
   151  	var (
   152  		clsToReset []*changelist.CL
   153  		triggers   map[common.CLID]*run.Triggers
   154  		cfg        *prjcfg.ConfigGroup
   155  	)
   156  	eg, ctx := errgroup.WithContext(ctx)
   157  	requests := op.Op.GetResetTriggers().GetRequests()
   158  	eg.Go(func() (err error) {
   159  		clids := make(common.CLIDs, len(requests))
   160  		for i, req := range requests {
   161  			clids[i] = common.CLID(req.Clid)
   162  		}
   163  		clsToReset, err = changelist.LoadCLsByIDs(ctx, clids)
   164  		return err
   165  	})
   166  	eg.Go(func() error {
   167  		runCLs, err := run.LoadRunCLs(ctx, op.Run.ID, op.Run.CLs)
   168  		if err != nil {
   169  			return err
   170  		}
   171  		triggers = make(map[common.CLID]*run.Triggers, len(runCLs))
   172  		for _, runCL := range runCLs {
   173  			triggers[runCL.ID] = triggers[runCL.ID].WithTrigger(runCL.Trigger)
   174  		}
   175  		return nil
   176  	})
   177  	eg.Go(func() (err error) {
   178  		cfg, err = prjcfg.GetConfigGroup(ctx, op.Run.ID.LUCIProject(), op.Run.ConfigGroupID)
   179  		return err
   180  	})
   181  	if err := eg.Wait(); err != nil {
   182  		return err
   183  	}
   184  
   185  	op.inputs = make([]trigger.ResetInput, len(requests))
   186  	op.results = make([]resetResult, len(requests))
   187  	luciProject := op.Run.ID.LUCIProject()
   188  	for i := range requests {
   189  		cl, req := clsToReset[i], requests[i]
   190  		op.inputs[i] = trigger.ResetInput{
   191  			CL:                cl,
   192  			Triggers:          triggers[cl.ID],
   193  			LUCIProject:       luciProject,
   194  			Message:           req.Message,
   195  			Requester:         "Trigger Reset",
   196  			Notify:            req.Notify,
   197  			LeaseDuration:     time.Minute,
   198  			ConfigGroups:      []*prjcfg.ConfigGroup{cfg},
   199  			AddToAttentionSet: req.AddToAttention,
   200  			AttentionReason:   req.AddToAttentionReason,
   201  			GFactory:          op.GFactory,
   202  			CLMutator:         op.CLMutator,
   203  		}
   204  		op.results[i] = resetResult{
   205  			err: errNotAttemptedYet,
   206  		}
   207  	}
   208  	return nil
   209  }
   210  
   211  type resetItem struct {
   212  	index int
   213  	input trigger.ResetInput
   214  }
   215  type resetResult struct {
   216  	resetAt time.Time
   217  	err     error
   218  }
   219  
   220  // errNotAttemptedYet is the initial error set in resetResult.
   221  var errNotAttemptedYet = errors.New("not attempted reset yet")
   222  
   223  // executeInParallel resets the triggers of the provided CLs in parallel
   224  // and keeps retrying on transient or alreadyInLease failure until the context
   225  // is done.
   226  func (op *ResetTriggersOp) executeInParallel(ctx context.Context) {
   227  	dc := op.makeDispatcherChannel(ctx)
   228  	for i, input := range op.inputs {
   229  		dc.C <- resetItem{index: i, input: input}
   230  	}
   231  	dc.Close()
   232  	<-dc.DrainC
   233  }
   234  
   235  func (op *ResetTriggersOp) makeDispatcherChannel(ctx context.Context) dispatcher.Channel {
   236  	concurrency := op.Concurrency
   237  	if concurrency == 0 {
   238  		concurrency = defaultConcurrency
   239  	}
   240  	concurrency = min(concurrency, len(op.inputs))
   241  	dc, err := dispatcher.NewChannel(ctx, &dispatcher.Options{
   242  		ErrorFn: func(failedBatch *buffer.Batch, err error) (retry bool) {
   243  			_, isLeaseErr := lease.IsAlreadyInLeaseErr(err)
   244  			return isLeaseErr || transient.Tag.In(err)
   245  		},
   246  		DropFn: dispatcher.DropFnQuiet,
   247  		Buffer: buffer.Options{
   248  			MaxLeases:     concurrency,
   249  			BatchItemsMax: 1,
   250  			FullBehavior: &buffer.BlockNewItems{
   251  				MaxItems: len(op.results),
   252  			},
   253  			Retry: op.makeRetryFactory(),
   254  		},
   255  	}, func(data *buffer.Batch) error {
   256  		ci, ok := data.Data[0].Item.(resetItem)
   257  		if !ok {
   258  			panic(fmt.Errorf("unexpected batch data item %s", data.Data[0].Item))
   259  		}
   260  		result := &op.results[ci.index]
   261  		result.err = trigger.Reset(ctx, ci.input)
   262  		gerritErr := "GERRIT_ERROR_NONE"
   263  		if errCode, ok := trigger.IsResetErrFromGerrit(result.err); ok {
   264  			if codeString, ok := code.Code_name[int32(errCode)]; ok {
   265  				gerritErr = codeString
   266  			} else {
   267  				gerritErr = fmt.Sprintf("Code(%d)", int64(errCode))
   268  			}
   269  		}
   270  		metrics.Internal.RunResetTriggerAttempted.Add(ctx, 1, op.Run.ID.LUCIProject(), op.Run.ConfigGroupID.Name(), string(op.Run.Mode), result.err == nil, gerritErr)
   271  		if result.err == nil {
   272  			result.resetAt = clock.Now(ctx)
   273  		}
   274  		if op.testAfterTryResetFn != nil {
   275  			op.testAfterTryResetFn()
   276  		}
   277  		return result.err
   278  	})
   279  	if err != nil {
   280  		panic(fmt.Errorf("unexpected failure when creating dispatcher channel: %s", err))
   281  	}
   282  	return dc
   283  }
   284  
   285  func (op *ResetTriggersOp) makeRetryFactory() retry.Factory {
   286  	return lease.RetryIfLeased(transient.Only(func() retry.Iterator {
   287  		return &retry.ExponentialBackoff{
   288  			Limited: retry.Limited{
   289  				Delay:   100 * time.Millisecond,
   290  				Retries: -1, // unlimited
   291  			},
   292  			Multiplier: 2,
   293  			MaxDelay:   1 * time.Minute,
   294  		}
   295  	}))
   296  }