go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/cv/internal/retention/tryjob.go (about)

     1  // Copyright 2024 The LUCI Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //      http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package retention
    16  
    17  import (
    18  	"context"
    19  	"strconv"
    20  
    21  	"google.golang.org/protobuf/proto"
    22  
    23  	"go.chromium.org/luci/common/clock"
    24  	"go.chromium.org/luci/common/errors"
    25  	"go.chromium.org/luci/common/logging"
    26  	"go.chromium.org/luci/common/retry"
    27  	"go.chromium.org/luci/common/retry/transient"
    28  	"go.chromium.org/luci/common/sync/parallel"
    29  	"go.chromium.org/luci/gae/service/datastore"
    30  	"go.chromium.org/luci/server/tq"
    31  
    32  	"go.chromium.org/luci/cv/internal/common"
    33  	"go.chromium.org/luci/cv/internal/run"
    34  	"go.chromium.org/luci/cv/internal/tryjob"
    35  )
    36  
// tryjobsPerTask controls how many tryjobs to wipe out per TQ task.
//
// It is the chunk size used by scheduleWipeoutTryjobsTasks when splitting the
// tryjob IDs into WipeoutTryjobsTask payloads.
const tryjobsPerTask = 800
    39  
    40  // scheduleWipeoutTryjobsTasks schedules tasks to wipe out old tryjobs that are
    41  // out of the retention period.
    42  //
    43  // The tasks will be uniformly distributed over the next 1 hours.
    44  func scheduleWipeoutTryjobsTasks(ctx context.Context, tqd *tq.Dispatcher) error {
    45  	tryjobs, err := tryjob.QueryTryjobIDsUpdatedBefore(ctx, clock.Now(ctx).Add(-retentionPeriod))
    46  	switch {
    47  	case err != nil:
    48  		return err
    49  	case len(tryjobs) == 0:
    50  		logging.Infof(ctx, "no tryjobs to wipe out")
    51  		return nil
    52  	}
    53  
    54  	logging.Infof(ctx, "schedule tasks to wipeout %d tryjobs", len(tryjobs))
    55  	return parallel.WorkPool(min(10, len(tryjobs)/tryjobsPerTask), func(workCh chan<- func() error) {
    56  		for _, chunk := range chunk(tryjobs, tryjobsPerTask) {
    57  			tryjobIDStrs := make([]string, len(chunk))
    58  			for i, tjID := range chunk {
    59  				tryjobIDStrs[i] = strconv.FormatInt(int64(tjID), 10)
    60  			}
    61  			task := &tq.Task{
    62  				Payload: &WipeoutTryjobsTask{
    63  					Ids: common.TryjobIDs(chunk).ToInt64(),
    64  				},
    65  				Delay: common.DistributeOffset(wipeoutTasksDistInterval, tryjobIDStrs...),
    66  			}
    67  			workCh <- func() error {
    68  				return retry.Retry(ctx, retry.Default, func() error {
    69  					return tqd.AddTask(ctx, task)
    70  				}, nil)
    71  			}
    72  		}
    73  	})
    74  }
    75  
    76  func registerWipeoutTryjobsTask(tqd *tq.Dispatcher) {
    77  	tqd.RegisterTaskClass(tq.TaskClass{
    78  		ID:           "wipeout-tryjobs",
    79  		Queue:        "data-retention",
    80  		Prototype:    &WipeoutTryjobsTask{},
    81  		Kind:         tq.NonTransactional,
    82  		Quiet:        true,
    83  		QuietOnError: true,
    84  		Handler: func(ctx context.Context, payload proto.Message) error {
    85  			task := payload.(*WipeoutTryjobsTask)
    86  			err := wipeoutTryjobs(ctx, common.MakeTryjobIDs(task.GetIds()...))
    87  			return common.TQifyError(ctx, err)
    88  		},
    89  	})
    90  }
    91  
    92  // wipeoutTryjobs wipes out the provided tryjobs.
    93  func wipeoutTryjobs(ctx context.Context, ids common.TryjobIDs) error {
    94  	tryjobs, err := loadTryjobsIgnoreMissing(ctx, ids)
    95  	if err != nil {
    96  		return err
    97  	}
    98  	return parallel.WorkPool(min(10, len(tryjobs)), func(workCh chan<- func() error) {
    99  		for _, tj := range tryjobs {
   100  			tj := tj
   101  			workCh <- func() error {
   102  				return wipeoutTryjob(ctx, tj)
   103  			}
   104  		}
   105  	})
   106  }
   107  
   108  func loadTryjobsIgnoreMissing(ctx context.Context, ids common.TryjobIDs) ([]*tryjob.Tryjob, error) {
   109  	tryjobs := make([]*tryjob.Tryjob, len(ids))
   110  	for i, tjID := range ids {
   111  		tryjobs[i] = &tryjob.Tryjob{ID: tjID}
   112  	}
   113  	err := datastore.Get(ctx, tryjobs)
   114  	var merrs errors.MultiError
   115  	switch {
   116  	case err == nil:
   117  		return tryjobs, nil
   118  	case errors.As(err, &merrs):
   119  		ret := tryjobs[:0] // reuse the same slice
   120  		for i, err := range merrs {
   121  			switch {
   122  			case err == nil:
   123  				ret = append(ret, tryjobs[i])
   124  			case !errors.Is(err, datastore.ErrNoSuchEntity):
   125  				count, err := merrs.Summary()
   126  				return nil, errors.Annotate(err, "failed to load %d out of %d tryjobs", count, len(ids)).Tag(transient.Tag).Err()
   127  			}
   128  		}
   129  		return ret, nil
   130  	default:
   131  		return nil, errors.Annotate(err, "failed to load tryjobs").Tag(transient.Tag).Err()
   132  	}
   133  }
   134  
   135  // wipeoutTryjob wipes out the provided tryjob if all runs that use this tryjob
   136  // no longer exists.
   137  func wipeoutTryjob(ctx context.Context, tj *tryjob.Tryjob) error {
   138  	ctx = logging.SetField(ctx, "tryjob", tj.ID)
   139  
   140  	var runs []any
   141  	for _, rid := range tj.AllWatchingRuns() {
   142  		runs = append(runs, &run.Run{ID: rid})
   143  	}
   144  	if len(runs) > 0 {
   145  		switch res, err := datastore.Exists(ctx, runs...); {
   146  		case err != nil:
   147  			return errors.Annotate(err, "failed to check the existence of runs for Tryjob %d", tj.ID).Tag(transient.Tag).Err()
   148  		case res.Any():
   149  			logging.Warningf(ctx, "WipeoutTryjob: skip wipeout because some run(s) using this tryjob still exists")
   150  			return nil
   151  		}
   152  	}
   153  
   154  	if err := tryjob.CondDelete(ctx, tj.ID, tj.EVersion); err != nil {
   155  		return err
   156  	}
   157  	logging.Infof(ctx, "successfully wiped out tryjob %d", tj.ID)
   158  	return nil
   159  }