go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/cv/internal/retention/run.go

// Copyright 2024 The LUCI Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package retention

import (
	"context"
	"math"
	"sort"
	"sync"
	"time"

	"google.golang.org/protobuf/proto"

	"go.chromium.org/luci/common/clock"
	"go.chromium.org/luci/common/errors"
	"go.chromium.org/luci/common/logging"
	"go.chromium.org/luci/common/retry"
	"go.chromium.org/luci/common/retry/transient"
	"go.chromium.org/luci/common/sync/dispatcher"
	"go.chromium.org/luci/common/sync/dispatcher/buffer"
	"go.chromium.org/luci/common/sync/parallel"
	"go.chromium.org/luci/gae/service/datastore"
	"go.chromium.org/luci/server/tq"

	"go.chromium.org/luci/cv/internal/common"
	"go.chromium.org/luci/cv/internal/configs/prjcfg"
	"go.chromium.org/luci/cv/internal/run"
	"go.chromium.org/luci/cv/internal/run/runquery"
	"go.chromium.org/luci/cv/internal/tryjob"
)

// runsPerTask controls how many runs to wipe out per TQ task.
const runsPerTask = 200

// scheduleWipeoutRuns schedules tasks to wipe out old runs that are out of the
// retention period.
//
// The tasks will be uniformly distributed over the next 1 hour.
func scheduleWipeoutRuns(ctx context.Context, tqd *tq.Dispatcher) error {
	// data retention should work for disabled projects as well
	projects, err := prjcfg.GetAllProjectIDs(ctx, false)
	if err != nil {
		return err
	}

	cutoff := clock.Now(ctx).Add(-retentionPeriod).UTC()
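	// The dispatcher channel batches the incoming run IDs into
	// WipeoutRunsTask payloads of at most runsPerTask IDs each and enqueues
	// them via the TQ dispatcher.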
	dc, err := dispatcher.NewChannel(ctx, &dispatcher.Options{
		DropFn: dispatcher.DropFnQuiet,
		Buffer: buffer.Options{
			MaxLeases:     10,
			BatchItemsMax: runsPerTask,
			FullBehavior:  &buffer.InfiniteGrowth{},
			Retry:         retry.Default,
		},
	}, func(b *buffer.Batch) error {
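		// Collect the run IDs in this batch and sort them so that the task
		// payload and its computed delay are deterministic for a given batch.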
		runIDStrs := make(sort.StringSlice, len(b.Data))
		for i, item := range b.Data {
			runIDStrs[i] = string(item.Item.(common.RunID))
		}
		sort.Sort(runIDStrs)
		task := &tq.Task{
			Payload: &WipeoutRunsTask{
				Ids: runIDStrs,
			},
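			// DistributeOffset derives a deterministic pseudo-random delay
			// within wipeoutTasksDistInterval, spreading the wipeout tasks
			// out instead of firing them all at once.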
			Delay: common.DistributeOffset(wipeoutTasksDistInterval, runIDStrs...),
		}
		return tqd.AddTask(ctx, task)
	})
	if err != nil {
		panic(errors.Annotate(err, "failed to create dispatcher to schedule wipeout tasks").Err())
	}

	var wg sync.WaitGroup
	wg.Add(len(projects))
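	// Fan out across projects with bounded concurrency; each worker streams
	// the run IDs it finds into the dispatcher channel.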
	poolErr := parallel.WorkPool(min(8, len(projects)), func(workCh chan<- func() error) {
		for _, proj := range projects {
			proj := proj
			workCh <- func() error {
				defer wg.Done()
				runs, err := findRunsToWipeoutForProject(ctx, proj, cutoff)
				switch {
				case err != nil:
					return errors.Annotate(err, "failed to find runs to wipe out for project %q", proj).Tag(transient.Tag).Err()
				case len(runs) == 0:
					return nil
				}
				logging.Infof(ctx, "found %d runs to wipe out for project %q", len(runs), proj)
				for _, r := range runs {
					dc.C <- r
				}
				return nil
			}
		}
	})
	wg.Wait()
	dc.CloseAndDrain(ctx)
	return poolErr
}

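// findRunsToWipeoutForProject returns the IDs of all runs in the given
// project that were created before the cutoff time.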
func findRunsToWipeoutForProject(ctx context.Context, proj string, cutoff time.Time) (common.RunIDs, error) {
	// cutoffRunID is a non-existing run ID used for range query purposes
	// only. All the runs in the query result should be created strictly
	// before the cutoff time.
	cutoffRunID := common.MakeRunID(proj, cutoff, math.MaxInt, []byte("whatever"))
	qb := runquery.ProjectQueryBuilder{
		Project: proj,
	}.Before(cutoffRunID)
	keys, err := qb.GetAllRunKeys(ctx)
	if err != nil {
		return nil, err
	}
	ret := make(common.RunIDs, len(keys))
	for i, key := range keys {
		ret[i] = common.RunID(key.StringID())
	}
	return ret, nil
}

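// registerWipeoutRunsTask registers the TQ task class that handles
// WipeoutRunsTask payloads on the data-retention queue.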
func registerWipeoutRunsTask(tqd *tq.Dispatcher, rm rm) {
	tqd.RegisterTaskClass(tq.TaskClass{
		ID:           "wipeout-runs",
		Queue:        "data-retention",
		Prototype:    &WipeoutRunsTask{},
		Kind:         tq.NonTransactional,
		Quiet:        true,
		QuietOnError: true,
		Handler: func(ctx context.Context, payload proto.Message) error {
			task := payload.(*WipeoutRunsTask)
			err := wipeoutRuns(ctx, common.MakeRunIDs(task.GetIds()...), rm)
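			// TQifyError tags the error so that TQ retries transient
			// failures and gives up on permanent ones.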
			return common.TQifyError(ctx, err)
		},
	})
}

// wipeoutRuns wipes out runs for the provided run IDs.
//
// Runs that do not exist or are still within the retention period are
// skipped.
func wipeoutRuns(ctx context.Context, runIDs common.RunIDs, rm rm) error {
	runs, err := run.LoadRunsFromIDs(runIDs...).DoIgnoreNotFound(ctx)
	switch {
	case err != nil:
		return errors.Annotate(err, "failed to load runs").Tag(transient.Tag).Err()
	case len(runs) == 0:
		return nil
	}

	return parallel.WorkPool(min(10, len(runIDs)), func(workC chan<- func() error) {
		for _, r := range runs {
			r := r
			workC <- func() error {
				return wipeoutRun(ctx, r, rm)
			}
		}
	})
}

// wipeoutRun wipes out the given run if it is no longer within the retention
// period.
//
// No-op if the run is still in the retention period.
func wipeoutRun(ctx context.Context, r *run.Run, rm rm) error {
	ctx = logging.SetField(ctx, "run", string(r.ID))
	switch {
	case !r.CreateTime.Before(clock.Now(ctx).Add(-retentionPeriod)):
		// skip if it is still in the retention period.
		logging.Warningf(ctx, "WipeoutRun: too young to wipe out: %s < %s",
			clock.Now(ctx).Sub(r.CreateTime), retentionPeriod)
		return nil
	case !run.IsEnded(r.Status):
		logging.Errorf(ctx, "run is eligible for wipeout but has not ended yet; poking the run to trigger cancellation")
		// Poke the non-ended run, expecting it to be cancelled by the
		// RunManager. The next cron job would then likely wipe out the run.
		if err := rm.PokeNow(ctx, r.ID); err != nil {
			return errors.Annotate(err, "failed to poke run %s", r.ID).Tag(transient.Tag).Err()
		}
		return nil
	}

	// Find all the child entities of the Run entity. As of Jan. 2024, this
	// includes:
	//  - RunLog
	//  - RunCL
	//  - TryjobExecutionState
	//  - TryjobExecutionLog
	runKey := datastore.KeyForObj(ctx, r)
	var toDelete []*datastore.Key
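	// A kindless ancestor query returns the keys of every entity stored
	// under the run, regardless of entity kind.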
	q := datastore.NewQuery("").Ancestor(runKey).KeysOnly(true)
	if err := datastore.GetAll(ctx, q, &toDelete); err != nil {
		return errors.Annotate(err, "failed to query all child entities of run %s", r.ID).Tag(transient.Tag).Err()
	}
	toDelete = append(toDelete, runKey)

	// A run may have a lot of log entities, which may cause timeouts if they
	// are removed within a transaction. Therefore, delete them first before
	// deleting the rest of the run-related entities in a transaction.
	toDelete, err := removeLogEntities(ctx, toDelete)
	if err != nil {
		return errors.Annotate(err, "failed to delete log entities of run %s", r.ID).Tag(transient.Tag).Err()
	}

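	// Re-check the run inside the transaction so that concurrent or retried
	// wipeout tasks become no-ops once the run entity is gone.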
	err = datastore.RunInTransaction(ctx, func(ctx context.Context) error {
		switch err := datastore.Get(ctx, &run.Run{ID: r.ID}); {
		case errors.Is(err, datastore.ErrNoSuchEntity):
			// run has been deleted already.
			return nil
		case err != nil:
			return err
		}
		return datastore.Delete(ctx, toDelete)
	}, nil)

	if err != nil {
		return errors.Annotate(err, "failed to delete run entity for run %s and its child entities in a transaction", r.ID).Tag(transient.Tag).Err()
	}
	logging.Infof(ctx, "successfully wiped out run %s", r.ID)
	return nil
}

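// removeLogEntities deletes the log entities (RunLog and TryjobExecutionLog)
// among toDelete and returns the remaining keys.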
func removeLogEntities(ctx context.Context, toDelete []*datastore.Key) (remaining []*datastore.Key, err error) {
	var logKeys, remainingKeys []*datastore.Key
	for _, key := range toDelete {
		switch key.Kind() {
		case run.RunLogKind, tryjob.TryjobExecutionLogKind:
			logKeys = append(logKeys, key)
		default:
			remainingKeys = append(remainingKeys, key)
		}
	}
	if err := datastore.Delete(ctx, logKeys); err != nil {
		return nil, err
	}
	return remainingKeys, nil
}