go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/cv/internal/prjmanager/cltriggerer/cltriggerer.go (about)

     1  // Copyright 2023 The LUCI Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //      http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package cltriggerer
    16  
    17  import (
    18  	"context"
    19  	"fmt"
    20  	"sync/atomic"
    21  	"time"
    22  
    23  	"google.golang.org/protobuf/proto"
    24  
    25  	"go.chromium.org/luci/common/clock"
    26  	"go.chromium.org/luci/common/errors"
    27  	"go.chromium.org/luci/common/logging"
    28  	"go.chromium.org/luci/common/retry"
    29  	"go.chromium.org/luci/common/retry/transient"
    30  	"go.chromium.org/luci/common/sync/dispatcher"
    31  	"go.chromium.org/luci/common/sync/dispatcher/buffer"
    32  	"go.chromium.org/luci/server/tq"
    33  
    34  	"go.chromium.org/luci/cv/internal/changelist"
    35  	"go.chromium.org/luci/cv/internal/common"
    36  	"go.chromium.org/luci/cv/internal/common/lease"
    37  	"go.chromium.org/luci/cv/internal/gerrit"
    38  	"go.chromium.org/luci/cv/internal/metrics"
    39  	"go.chromium.org/luci/cv/internal/prjmanager"
    40  	"go.chromium.org/luci/cv/internal/prjmanager/prjpb"
    41  	"go.chromium.org/luci/cv/internal/run"
    42  )
    43  
// maxConcurrency caps the number of dep CL operations a single task hands to
// its dispatcher channel at once: it bounds both the channel's MaxLeases and
// the number of buffered items (see makeDispatcherChannel).
const maxConcurrency = 16
    45  
// Triggerer triggers given CLs.
type Triggerer struct {
	// pmNotifier provides the task binding used to attach the handler and
	// enqueue tasks, and is used to notify about completed operations.
	pmNotifier *prjmanager.Notifier
	// gFactory is passed to each dep CL operation when it is executed.
	gFactory gerrit.Factory
	// clUpdater is passed to each dep CL operation when it is executed.
	clUpdater clUpdater
	// clMutator is passed to each dep CL operation when it is executed.
	clMutator *changelist.Mutator
}
    53  
// clUpdater is a subset of the *changelist.Updater which Triggerer needs.
//
// Triggerer does not call it directly; it hands the value to each dep CL
// operation's execute method.
type clUpdater interface {
	// Schedule accepts an UpdateCLTask.
	// NOTE(review): presumably enqueues a CL refresh; confirm against
	// *changelist.Updater.
	Schedule(context.Context, *changelist.UpdateCLTask) error
}
    58  
    59  // New creates a Triggerer.
    60  func New(n *prjmanager.Notifier, gf gerrit.Factory, clu clUpdater, clm *changelist.Mutator) *Triggerer {
    61  	v := &Triggerer{
    62  		pmNotifier: n,
    63  		gFactory:   gf,
    64  		clUpdater:  clu,
    65  		clMutator:  clm,
    66  	}
    67  	n.TasksBinding.TriggerProjectCLDeps.AttachHandler(
    68  		func(ctx context.Context, payload proto.Message) error {
    69  			task := payload.(*prjpb.TriggeringCLDepsTask)
    70  			ctx = logging.SetField(ctx, "project", task.GetLuciProject())
    71  			return common.TQifyError(ctx,
    72  				errors.Annotate(v.process(ctx, task), "triggerer.process").Err())
    73  		},
    74  	)
    75  	return v
    76  }
    77  
    78  // Schedule schedules a task for CQVoteTask.
    79  func (tr *Triggerer) Schedule(ctx context.Context, t *prjpb.TriggeringCLDepsTask) error {
    80  	payload := t.GetTriggeringClDeps()
    81  	if len(payload.GetDepClids()) == 0 {
    82  		return nil
    83  	}
    84  	return tr.pmNotifier.TasksBinding.TQDispatcher.AddTask(ctx, &tq.Task{
    85  		Payload: t,
    86  		Title: fmt.Sprintf("%s/%s/%d-%d",
    87  			t.GetLuciProject(), payload.GetOperationId(),
    88  			payload.GetOriginClid(), len(payload.GetDepClids())),
    89  		// Not allowed in a transaction
    90  		DeduplicationKey: "",
    91  	})
    92  }
    93  
    94  func (tr *Triggerer) makeDispatcherChannel(ctx context.Context, task *prjpb.TriggeringCLDepsTask) dispatcher.Channel {
    95  	concurrency := min(len(task.GetTriggeringClDeps().GetDepClids()), maxConcurrency)
    96  	prj := task.GetLuciProject()
    97  	dc, err := dispatcher.NewChannel(ctx, &dispatcher.Options{
    98  		ErrorFn: func(failedBatch *buffer.Batch, err error) (retry bool) {
    99  			_, isLeaseErr := lease.IsAlreadyInLeaseErr(err)
   100  			return isLeaseErr || transient.Tag.In(err)
   101  		},
   102  		DropFn: dispatcher.DropFnQuiet,
   103  		Buffer: buffer.Options{
   104  			MaxLeases:     concurrency,
   105  			BatchItemsMax: 1,
   106  			FullBehavior: &buffer.BlockNewItems{
   107  				MaxItems: concurrency,
   108  			},
   109  			Retry: makeRetryFactory(),
   110  		},
   111  	}, func(data *buffer.Batch) error {
   112  		op, ok := data.Data[0].Item.(*triggerDepOp)
   113  		if !ok {
   114  			panic(fmt.Errorf("unexpected batch data item type %T", data.Data[0].Item))
   115  		}
   116  		ctx := logging.SetFields(ctx, logging.Fields{"cl": op.depCLID})
   117  		return op.execute(ctx, tr.gFactory, prj, tr.clMutator, tr.clUpdater)
   118  	})
   119  	if err != nil {
   120  		panic(fmt.Errorf("cltriggerer: unexpected failure in dispatcher creation"))
   121  	}
   122  	return dc
   123  }
   124  
   125  func (tr *Triggerer) process(ctx context.Context, task *prjpb.TriggeringCLDepsTask) (err error) {
   126  	var isCanceled atomic.Bool
   127  	payload := task.GetTriggeringClDeps()
   128  	evt := &prjpb.TriggeringCLDepsCompleted{
   129  		OperationId: payload.GetOperationId(),
   130  		Origin:      payload.GetOriginClid(),
   131  	}
   132  	ctx = logging.SetField(ctx, "origin_cl", payload.GetOriginClid())
   133  	startTS := clock.Now(ctx)
   134  	defer func() {
   135  		if err == nil {
   136  			reportMetrics(ctx, task, evt, isCanceled.Load(), startTS)
   137  		}
   138  	}()
   139  
   140  	taskCtx, cancel := clock.WithDeadline(ctx, payload.GetDeadline().AsTime())
   141  	defer cancel()
   142  	originCL := &changelist.CL{ID: common.CLID(payload.GetOriginClid())}
   143  	switch err := changelist.LoadCLs(taskCtx, []*changelist.CL{originCL}); errors.Unwrap(err) {
   144  	case nil:
   145  	case context.Canceled, context.DeadlineExceeded:
   146  		// ctx instead of taskCtx.
   147  		evt.Incompleted = append(evt.Incompleted, payload.GetDepClids()...)
   148  		return tr.pmNotifier.NotifyTriggeringCLDepsCompleted(ctx, task.GetLuciProject(), evt)
   149  	default:
   150  		// always return a transient to retry fetching the originating CL
   151  		// until the deadline exceeds.
   152  		return transient.Tag.Apply(err)
   153  	}
   154  
   155  	// trigger votes in parallel while constantly checking the vote status
   156  	// of the originating CL.
   157  	ops := makeTriggerDepOps(originCL.ExternalID.MustURL(), payload, &isCanceled)
   158  	if ensureOriginCLVote(taskCtx, originCL) {
   159  		go checkVoteStatus(taskCtx, payload.GetOriginClid(), &isCanceled)
   160  		dc := tr.makeDispatcherChannel(taskCtx, task)
   161  		for _, item := range ops {
   162  			dc.C <- item
   163  		}
   164  		dc.Close()
   165  		<-dc.DrainC
   166  	} else {
   167  		// no need, but just for the sake.
   168  		isCanceled.Store(true)
   169  	}
   170  
   171  	for _, op := range ops {
   172  		switch {
   173  		case op.isSucceeded():
   174  			// It's possible that the origin CQ vote no longer exists.
   175  			// If so, OnTriggeringCLDepsCompleted() will check the origin vote
   176  			// status, and schedule PurgingCLTask for the successfully voted
   177  			// deps.
   178  			evt.Succeeded = append(evt.Succeeded, op.depCLID)
   179  		case op.isPermanentlyFailed():
   180  			evt.Failed = append(evt.Failed, op.getCLError())
   181  		default:
   182  			evt.Incompleted = append(evt.Incompleted, op.depCLID)
   183  		}
   184  	}
   185  	// ctx instead of taskCtx to send a notification even if the deadline
   186  	// exceeds.
   187  	return tr.pmNotifier.NotifyTriggeringCLDepsCompleted(ctx, task.GetLuciProject(), evt)
   188  }
   189  
   190  func makeRetryFactory() retry.Factory {
   191  	return transient.Only(func() retry.Iterator {
   192  		return &retry.ExponentialBackoff{
   193  			Limited: retry.Limited{
   194  				Delay:   100 * time.Millisecond,
   195  				Retries: -1, // unlimited
   196  			},
   197  			Multiplier: 2,
   198  			MaxDelay:   30 * time.Second,
   199  		}
   200  	})
   201  }
   202  
   203  func ensureOriginCLVote(ctx context.Context, originCL *changelist.CL) bool {
   204  	switch mode := findCQTriggerMode(originCL); mode {
   205  	case string(run.FullRun):
   206  		return true
   207  	case "":
   208  		logging.Infof(ctx, "the origin CL %d no longer has CQ vote; stop voting", originCL.ID)
   209  		return false
   210  	default:
   211  		// The originating CL now has CQ+1. This can only happen in the
   212  		// following scenario.
   213  		// - at t1, the origin CL gets CQ+2 and TriggeringCLDepsTask is created.
   214  		// - at t2, the origin CL gets CQ+1, while or before the task process.
   215  		//
   216  		// This should be considered as cancelling the CQ vote chain
   217  		// process. It's OK to skip all the vote ops for the dep CLs.
   218  		// Then, PM will retriage the originating CL, as necessary.
   219  		logging.Infof(ctx, "the origin CL %d now has a CQ vote for %q; stop voting", mode)
   220  		return false
   221  	}
   222  }
   223  
   224  func checkVoteStatus(ctx context.Context, originCLID int64, isCanceled *atomic.Bool) {
   225  	originCL := &changelist.CL{ID: common.CLID(originCLID)}
   226  	for {
   227  		select {
   228  		case <-ctx.Done():
   229  			return
   230  		case tr := <-clock.After(ctx, 4*time.Second):
   231  			if tr.Err != nil {
   232  				return
   233  			}
   234  		}
   235  		if err := changelist.LoadCLs(ctx, []*changelist.CL{originCL}); err == nil {
   236  			isCanceled.Store(ensureOriginCLVote(ctx, originCL))
   237  		}
   238  	}
   239  }
   240  
   241  func taskMetricStatus(isCanceled bool, evt *prjpb.TriggeringCLDepsCompleted) string {
   242  	switch {
   243  	case isCanceled:
   244  		return "CANCELED"
   245  	case len(evt.GetFailed()) > 0:
   246  		return "FAILED"
   247  	case len(evt.GetIncompleted()) > 0:
   248  		// if isCancelled == false, len(Incompleted) > 0 can happen only if
   249  		// the context expires.
   250  		return "TIMEDOUT"
   251  	default:
   252  		return "SUCCEEDED"
   253  	}
   254  }
   255  
   256  func reportMetrics(ctx context.Context, task *prjpb.TriggeringCLDepsTask, evt *prjpb.TriggeringCLDepsCompleted, isCanceled bool, startTS time.Time) {
   257  	payload := task.GetTriggeringClDeps()
   258  	status := taskMetricStatus(isCanceled, evt)
   259  	metrics.Internal.CLTriggererTaskCompleted.Add(
   260  		ctx,
   261  		1,
   262  		task.GetLuciProject(),
   263  		payload.GetConfigGroupName(),
   264  		len(payload.GetDepClids()),
   265  		status,
   266  	)
   267  	metrics.Internal.CLTriggererTaskDuration.Add(
   268  		ctx,
   269  		float64(clock.Since(ctx, startTS).Milliseconds()),
   270  		task.GetLuciProject(),
   271  		payload.GetConfigGroupName(),
   272  		len(payload.GetDepClids()),
   273  		status,
   274  	)
   275  }