go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/cv/internal/tryjob/tjcancel/cancellator.go (about)

     1  // Copyright 2022 The LUCI Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //      http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  // Package tjcancel contains code in charge of cancelling stale tryjobs.
    16  //
    17  // Cancellator responds to tasks scheduled when a new patch is uploaded,
    18  // looking for and cancelling stale tryjobs.
    19  package tjcancel
    20  
    21  import (
    22  	"context"
    23  	"fmt"
    24  	"strconv"
    25  	"strings"
    26  	"sync"
    27  	"time"
    28  
    29  	"google.golang.org/protobuf/proto"
    30  
    31  	"go.chromium.org/luci/common/clock"
    32  	"go.chromium.org/luci/common/data/stringset"
    33  	"go.chromium.org/luci/common/errors"
    34  	"go.chromium.org/luci/common/logging"
    35  	"go.chromium.org/luci/common/retry/transient"
    36  	"go.chromium.org/luci/common/sync/parallel"
    37  	"go.chromium.org/luci/gae/service/datastore"
    38  
    39  	"go.chromium.org/luci/cv/internal/changelist"
    40  	"go.chromium.org/luci/cv/internal/common"
    41  	"go.chromium.org/luci/cv/internal/run"
    42  	"go.chromium.org/luci/cv/internal/tryjob"
    43  )
    44  
    45  // Cancellator is patterned after Updater to support multiple tryjob backends.
    46  type Cancellator struct {
    47  	tn *tryjob.Notifier
    48  
    49  	// guards backends map.
    50  	rwmutex  sync.RWMutex
    51  	backends map[string]cancellatorBackend
    52  }
    53  
    54  func NewCancellator(tn *tryjob.Notifier) *Cancellator {
    55  	c := &Cancellator{
    56  		tn:       tn,
    57  		backends: make(map[string]cancellatorBackend),
    58  	}
    59  	tn.Bindings.CancelStale.AttachHandler(func(ctx context.Context, payload proto.Message) error {
    60  		task := payload.(*tryjob.CancelStaleTryjobsTask)
    61  		ctx = logging.SetField(ctx, "CLID", task.GetClid())
    62  		return common.TQifyError(ctx, c.handleTask(ctx, task))
    63  	})
    64  	return c
    65  }
    66  
    67  // RegisterBackend registers a backend.
    68  //
    69  // Panics if backend for the same kind is already registered.
    70  func (c *Cancellator) RegisterBackend(b cancellatorBackend) {
    71  	kind := b.Kind()
    72  	if strings.ContainsRune(kind, '/') {
    73  		panic(fmt.Errorf("backend %T of kind %q must not contain '/'", b, kind))
    74  	}
    75  	c.rwmutex.Lock()
    76  	defer c.rwmutex.Unlock()
    77  	if _, exists := c.backends[kind]; exists {
    78  		panic(fmt.Errorf("backend %q is already registered", kind))
    79  	}
    80  	c.backends[kind] = b
    81  }
    82  
    83  func (c *Cancellator) handleTask(ctx context.Context, task *tryjob.CancelStaleTryjobsTask) error {
    84  	if task.PreviousMinEquivPatchset >= task.CurrentMinEquivPatchset {
    85  		panic(fmt.Errorf("patchset numbers expected to increase monotonically"))
    86  	}
    87  	cl := &changelist.CL{ID: common.CLID(task.GetClid())}
    88  	if err := datastore.Get(ctx, cl); err != nil {
    89  		return errors.Annotate(err, "failed to load CL %d", cl.ID).Tag(transient.Tag).Err()
    90  	}
    91  	preserveTryjob := false
    92  	for _, metadata := range cl.Snapshot.GetMetadata() {
    93  		if metadata.Key == common.FooterCQDoNotCancelTryjobs && strings.ToLower(strings.TrimSpace(metadata.Value)) == "true" {
    94  			preserveTryjob = true
    95  		}
    96  	}
    97  	if preserveTryjob {
    98  		logging.Infof(ctx, "skipping cancelling Tryjob as the latest CL has specified %s footer", common.FooterCQDoNotCancelTryjobs)
    99  		return nil
   100  	}
   101  
   102  	candidates, err := c.fetchCandidates(ctx, cl.ID, task.GetPreviousMinEquivPatchset(), task.GetCurrentMinEquivPatchset())
   103  	switch {
   104  	case err != nil:
   105  		return err
   106  	case len(candidates) == 0:
   107  		logging.Infof(ctx, "no stale Tryjobs to cancel")
   108  		return nil
   109  	default:
   110  		tryjobIDs := make([]string, len(candidates))
   111  		for i, tj := range candidates {
   112  			tryjobIDs[i] = strconv.Itoa(int(tj.ID))
   113  		}
   114  		logging.Infof(ctx, "found stale Tryjobs to cancel: [%s]", strings.Join(tryjobIDs, ", "))
   115  		return c.cancelTryjobs(ctx, candidates)
   116  	}
   117  }
   118  
   119  const cancelLaterDuration = 10 * time.Second
   120  
   121  func (c *Cancellator) fetchCandidates(ctx context.Context, clid common.CLID, prevMinEquiPS, curMinEquiPS int32) ([]*tryjob.Tryjob, error) {
   122  	q := datastore.NewQuery(tryjob.TryjobKind).
   123  		Gte("CLPatchsets", tryjob.MakeCLPatchset(clid, prevMinEquiPS)).
   124  		Lt("CLPatchsets", tryjob.MakeCLPatchset(clid, curMinEquiPS))
   125  	var candidates []*tryjob.Tryjob
   126  	err := datastore.Run(ctx, q, func(tj *tryjob.Tryjob) error {
   127  		switch {
   128  		case tj.ExternalID == "":
   129  			// Most likely Tryjob hasn't been triggered in the backend yet.
   130  		case tj.IsEnded():
   131  		case tj.LaunchedBy == "":
   132  			// Not launched by LUCI CV, may be through `git cl try` command line.
   133  		case tj.Definition.GetSkipStaleCheck():
   134  		default:
   135  			candidates = append(candidates, tj)
   136  		}
   137  		return nil
   138  	})
   139  	switch {
   140  	case err != nil:
   141  		return nil, errors.Annotate(err, "failed to run the query to fetch candidate tryjobs for cancellation").Tag(transient.Tag).Err()
   142  	case len(candidates) == 0:
   143  		return nil, nil
   144  	}
   145  
   146  	hasAllWatchingRunsEndedFn, err := makeHasAllWatchingRunEndedFn(ctx, candidates)
   147  	if err != nil {
   148  		return nil, err
   149  	}
   150  	var ret = candidates[:0] // reuse the same slice
   151  	var cancelLaterScheduled bool
   152  	for _, candidate := range candidates {
   153  		switch nonEndedRuns := hasAllWatchingRunsEndedFn(candidate); {
   154  		case len(nonEndedRuns) > 0 && !cancelLaterScheduled:
   155  			eta := clock.Now(ctx).UTC().Add(cancelLaterDuration)
   156  			if err := c.tn.ScheduleCancelStale(ctx, clid, prevMinEquiPS, curMinEquiPS, eta); err != nil {
   157  				return nil, err
   158  			}
   159  			cancelLaterScheduled = true
   160  			fallthrough
   161  		case len(nonEndedRuns) > 0:
   162  			logging.Warningf(ctx, "tryjob %d is still watched by non ended runs %s. This is likely a race condition and those runs will end soon. Will retry cancellation after %s.", candidate.ID, nonEndedRuns, cancelLaterDuration)
   163  		default:
   164  			ret = append(ret, candidate)
   165  		}
   166  	}
   167  	return ret, nil
   168  
   169  }
   170  
   171  func makeHasAllWatchingRunEndedFn(ctx context.Context, tryjobs []*tryjob.Tryjob) (func(*tryjob.Tryjob) (nonEnded common.RunIDs), error) {
   172  	runIDSet := stringset.New(1) // typically only one run.
   173  	for _, tj := range tryjobs {
   174  		for _, rid := range tj.AllWatchingRuns() {
   175  			runIDSet.Add(string(rid))
   176  		}
   177  	}
   178  	runs, errs := run.LoadRunsFromIDs(common.MakeRunIDs(runIDSet.ToSlice()...)...).Do(ctx)
   179  	endedRunIDs := make(map[common.RunID]struct{}, len(runs))
   180  	for i, r := range runs {
   181  		switch err := errs[i]; {
   182  		case err == datastore.ErrNoSuchEntity:
   183  			return nil, errors.Reason("Tryjob is associated with a non-existent Run %s", r.ID).Err()
   184  		case err != nil:
   185  			return nil, errors.Annotate(err, "failed to load run %s", r.ID).Tag(transient.Tag).Err()
   186  		case run.IsEnded(r.Status):
   187  			endedRunIDs[r.ID] = struct{}{}
   188  		}
   189  	}
   190  	return func(tj *tryjob.Tryjob) common.RunIDs {
   191  		var nonEnded common.RunIDs
   192  		for _, rid := range tj.AllWatchingRuns() {
   193  			if _, ended := endedRunIDs[rid]; !ended {
   194  				nonEnded = append(nonEnded, rid)
   195  			}
   196  		}
   197  		return nonEnded
   198  	}, nil
   199  }
   200  
   201  const reason = "LUCI CV no longer needs this Tryjob"
   202  
   203  func (c *Cancellator) cancelTryjobs(ctx context.Context, tjs []*tryjob.Tryjob) error {
   204  	if len(tjs) == 0 {
   205  		return nil
   206  	}
   207  	errs := parallel.WorkPool(min(8, len(tjs)), func(work chan<- func() error) {
   208  		for _, tj := range tjs {
   209  			tj := tj
   210  			work <- func() error {
   211  				be, err := c.backendFor(tj)
   212  				if err != nil {
   213  					return err
   214  				}
   215  				// TODO(crbug/1308930): use Buildbucket's batch API to reduce
   216  				// number of RPCs.
   217  				err = be.CancelTryjob(ctx, tj, reason)
   218  				if err != nil {
   219  					return errors.Annotate(err, "failed to cancel Tryjob [id=%d, eid=%s]", tj.ID, tj.ExternalID).Err()
   220  				}
   221  				return datastore.RunInTransaction(ctx, func(ctx context.Context) error {
   222  					if err := datastore.Get(ctx, tj); err != nil {
   223  						return errors.Annotate(err, "failed to load Tryjob %d", tj.ID).Tag(transient.Tag).Err()
   224  					}
   225  					if tj.IsEnded() {
   226  						return nil
   227  					}
   228  					tj.Status = tryjob.Status_CANCELLED
   229  					tj.EVersion++
   230  					tj.EntityUpdateTime = datastore.RoundTime(clock.Now(ctx).UTC())
   231  					if err := datastore.Put(ctx, tj); err != nil {
   232  						return errors.Annotate(err, "failed to save Tryjob %d", tj.ID).Tag(transient.Tag).Err()
   233  					}
   234  					return nil
   235  				}, nil)
   236  			}
   237  		}
   238  	})
   239  	return common.MostSevereError(errs)
   240  }
   241  
   242  // cancellatorBackend is implemented by tryjobs backends, e.g. buildbucket.
   243  type cancellatorBackend interface {
   244  	// Kind identifies the backend
   245  	//
   246  	// It's also the first part of the Tryjob's ExternalID, e.g. "buildbucket".
   247  	// Must not contain a slash.
   248  	Kind() string
   249  	// CancelTryjob should cancel the tryjob given.
   250  	//
   251  	// MUST not modify the given Tryjob object.
   252  	// If the tryjob was already cancelled, it should not return an error.
   253  	CancelTryjob(ctx context.Context, tj *tryjob.Tryjob, reason string) error
   254  }
   255  
   256  func (c *Cancellator) backendFor(t *tryjob.Tryjob) (cancellatorBackend, error) {
   257  	kind, err := t.ExternalID.Kind()
   258  	if err != nil {
   259  		return nil, err
   260  	}
   261  	c.rwmutex.RLock()
   262  	defer c.rwmutex.RUnlock()
   263  	if b, exists := c.backends[kind]; exists {
   264  		return b, nil
   265  	}
   266  	return nil, errors.Reason("%q backend is not supported", kind).Err()
   267  }