// Copyright 2024 The LUCI Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package retention

import (
	"context"
	"math"
	"sort"
	"sync"
	"time"

	"google.golang.org/protobuf/proto"

	"go.chromium.org/luci/common/clock"
	"go.chromium.org/luci/common/errors"
	"go.chromium.org/luci/common/logging"
	"go.chromium.org/luci/common/retry"
	"go.chromium.org/luci/common/retry/transient"
	"go.chromium.org/luci/common/sync/dispatcher"
	"go.chromium.org/luci/common/sync/dispatcher/buffer"
	"go.chromium.org/luci/common/sync/parallel"
	"go.chromium.org/luci/gae/service/datastore"
	"go.chromium.org/luci/server/tq"

	"go.chromium.org/luci/cv/internal/common"
	"go.chromium.org/luci/cv/internal/configs/prjcfg"
	"go.chromium.org/luci/cv/internal/run"
	"go.chromium.org/luci/cv/internal/run/runquery"
	"go.chromium.org/luci/cv/internal/tryjob"
)

// runsPerTask controls how many runs to wipe out per TQ task.
const runsPerTask = 200
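
// exampleWipeoutFanOut is a hypothetical sketch, not called by
// production code: it illustrates how a backlog of N run IDs fans out
// into ceil(N/runsPerTask) TQ tasks, each delayed deterministically
// within wipeoutTasksDistInterval via common.DistributeOffset, which is
// the same batching and spreading scheme scheduleWipeoutRuns uses below.
func exampleWipeoutFanOut(runIDs []string) (numTasks int, delays []time.Duration) {
	for start := 0; start < len(runIDs); start += runsPerTask {
		end := min(start+runsPerTask, len(runIDs))
		// The delay is a deterministic hash of the batch's run IDs
		// mapped onto [0, wipeoutTasksDistInterval).
		delays = append(delays, common.DistributeOffset(wipeoutTasksDistInterval, runIDs[start:end]...))
		numTasks++
	}
	return numTasks, delays
}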
// scheduleWipeoutRuns schedules tasks to wipe out old runs that are out
// of the retention period.
//
// The tasks will be uniformly distributed over the next 1 hour.
func scheduleWipeoutRuns(ctx context.Context, tqd *tq.Dispatcher) error {
	// Data retention should work for disabled projects as well.
	projects, err := prjcfg.GetAllProjectIDs(ctx, false)
	if err != nil {
		return err
	}

	cutoff := clock.Now(ctx).Add(-retentionPeriod).UTC()
	dc, err := dispatcher.NewChannel(ctx, &dispatcher.Options{
		DropFn: dispatcher.DropFnQuiet,
		Buffer: buffer.Options{
			MaxLeases:     10,
			BatchItemsMax: runsPerTask,
			FullBehavior:  &buffer.InfiniteGrowth{},
			Retry:         retry.Default,
		},
	}, func(b *buffer.Batch) error {
		runIDStrs := make(sort.StringSlice, len(b.Data))
		for i, item := range b.Data {
			runIDStrs[i] = string(item.Item.(common.RunID))
		}
		sort.Sort(runIDStrs)
		task := &tq.Task{
			Payload: &WipeoutRunsTask{
				Ids: runIDStrs,
			},
			Delay: common.DistributeOffset(wipeoutTasksDistInterval, runIDStrs...),
		}
		return tqd.AddTask(ctx, task)
	})
	if err != nil {
		panic(errors.Annotate(err, "failed to create dispatcher to schedule wipeout tasks").Err())
	}

	var wg sync.WaitGroup
	wg.Add(len(projects))
	poolErr := parallel.WorkPool(min(8, len(projects)), func(workCh chan<- func() error) {
		for _, proj := range projects {
			proj := proj
			workCh <- func() error {
				defer wg.Done()
				runs, err := findRunsToWipeoutForProject(ctx, proj, cutoff)
				switch {
				case err != nil:
					return errors.Annotate(err, "failed to find runs to wipe out for project %q", proj).Tag(transient.Tag).Err()
				case len(runs) == 0:
					return nil
				}
				logging.Infof(ctx, "found %d runs to wipe out for project %q", len(runs), proj)
				for _, r := range runs {
					dc.C <- r
				}
				return nil
			}
		}
	})
	wg.Wait()
	dc.CloseAndDrain(ctx)
	return poolErr
}

func findRunsToWipeoutForProject(ctx context.Context, proj string, cutoff time.Time) (common.RunIDs, error) {
	// cutoffRunID is a non-existing run ID used for range query purposes
	// only. All the runs in the query result should be created strictly
	// before the cutoff time.
	cutoffRunID := common.MakeRunID(proj, cutoff, math.MaxInt, []byte("whatever"))
	qb := runquery.ProjectQueryBuilder{
		Project: proj,
	}.Before(cutoffRunID)
	keys, err := qb.GetAllRunKeys(ctx)
	if err != nil {
		return nil, err
	}
	ret := make(common.RunIDs, len(keys))
	for i, key := range keys {
		ret[i] = common.RunID(key.StringID())
	}
	return ret, nil
}

func registerWipeoutRunsTask(tqd *tq.Dispatcher, rm rm) {
	tqd.RegisterTaskClass(tq.TaskClass{
		ID:           "wipeout-runs",
		Queue:        "data-retention",
		Prototype:    &WipeoutRunsTask{},
		Kind:         tq.NonTransactional,
		Quiet:        true,
		QuietOnError: true,
		Handler: func(ctx context.Context, payload proto.Message) error {
			task := payload.(*WipeoutRunsTask)
			err := wipeoutRuns(ctx, common.MakeRunIDs(task.GetIds()...), rm)
			return common.TQifyError(ctx, err)
		},
	})
}
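
// exampleWipeoutSetup is a hypothetical sketch, not part of the real
// wiring (which lives elsewhere in this package): it illustrates the
// expected call order, assuming the task class is registered once at
// process startup while scheduleWipeoutRuns runs on every cron tick.
func exampleWipeoutSetup(ctx context.Context, tqd *tq.Dispatcher, runMgr rm) error {
	// Register the handler for "wipeout-runs" tasks exactly once;
	// tq.Dispatcher panics on duplicate task class registration.
	registerWipeoutRunsTask(tqd, runMgr)
	// Then, on each cron tick, fan the eligible runs out into TQ tasks.
	return scheduleWipeoutRuns(ctx, tqd)
}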
// wipeoutRuns wipes out runs for the provided run IDs.
//
// Runs that do not exist or are still within the retention period are
// skipped.
func wipeoutRuns(ctx context.Context, runIDs common.RunIDs, rm rm) error {
	runs, err := run.LoadRunsFromIDs(runIDs...).DoIgnoreNotFound(ctx)
	switch {
	case err != nil:
		return errors.Annotate(err, "failed to load runs").Tag(transient.Tag).Err()
	case len(runs) == 0:
		return nil
	}

	return parallel.WorkPool(min(10, len(runIDs)), func(workC chan<- func() error) {
		for _, r := range runs {
			r := r
			workC <- func() error {
				return wipeoutRun(ctx, r, rm)
			}
		}
	})
}

// wipeoutRun wipes out the given Run and all of its child entities.
//
// No-op if the Run is still within the retention period.
func wipeoutRun(ctx context.Context, r *run.Run, rm rm) error {
	ctx = logging.SetField(ctx, "run", string(r.ID))
	switch {
	case !r.CreateTime.Before(clock.Now(ctx).Add(-retentionPeriod)):
		// Skip: the Run is still within the retention period.
		logging.Warningf(ctx, "WipeoutRun: too young to wipe out: %s < %s",
			clock.Now(ctx).Sub(r.CreateTime), retentionPeriod)
		return nil
	case !run.IsEnded(r.Status):
		logging.Errorf(ctx, "run is eligible for wipeout but has not ended yet; poking the run to trigger cancellation")
		// Poke the non-ended Run, expecting it to be cancelled by the
		// Run Manager. The next cron job will then likely wipe out the
		// Run.
		if err := rm.PokeNow(ctx, r.ID); err != nil {
			return errors.Annotate(err, "failed to poke run %s", r.ID).Tag(transient.Tag).Err()
		}
		return nil
	}

	// Find all the child entities of the Run entity. As of Jan. 2024,
	// this includes:
	//  - RunLog
	//  - RunCL
	//  - TryjobExecutionState
	//  - TryjobExecutionLog
	runKey := datastore.KeyForObj(ctx, r)
	var toDelete []*datastore.Key
	q := datastore.NewQuery("").Ancestor(runKey).KeysOnly(true)
	if err := datastore.GetAll(ctx, q, &toDelete); err != nil {
		return errors.Annotate(err, "failed to query all child entities of run %s", r.ID).Tag(transient.Tag).Err()
	}
	toDelete = append(toDelete, runKey)

	// A Run may have many log entities, which could cause timeouts if
	// they were removed within a transaction. Therefore, delete them
	// first, before deleting the rest of the Run-related entities in a
	// transaction.
	toDelete, err := removeLogEntities(ctx, toDelete)
	if err != nil {
		return errors.Annotate(err, "failed to delete log entities of run %s", r.ID).Tag(transient.Tag).Err()
	}

	err = datastore.RunInTransaction(ctx, func(ctx context.Context) error {
		switch err := datastore.Get(ctx, &run.Run{ID: r.ID}); {
		case errors.Is(err, datastore.ErrNoSuchEntity):
			// The Run has been deleted already.
			return nil
		case err != nil:
			return err
		}
		return datastore.Delete(ctx, toDelete)
	}, nil)

	if err != nil {
		return errors.Annotate(err, "failed to delete run entity for run %s and its child entities in a transaction", r.ID).Tag(transient.Tag).Err()
	}
	logging.Infof(ctx, "successfully wiped out run %s", r.ID)
	return nil
}
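
// isOutOfRetention is a hypothetical helper shown for illustration only;
// wipeoutRun above inlines the equivalent check. A Run becomes eligible
// for wipeout once its creation time falls strictly before
// now - retentionPeriod.
func isOutOfRetention(ctx context.Context, r *run.Run) bool {
	return r.CreateTime.Before(clock.Now(ctx).Add(-retentionPeriod))
}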
// removeLogEntities deletes the log entities (RunLog and
// TryjobExecutionLog) among toDelete and returns the remaining keys.
func removeLogEntities(ctx context.Context, toDelete []*datastore.Key) (remaining []*datastore.Key, err error) {
	var logKeys, remainingKeys []*datastore.Key
	for _, key := range toDelete {
		switch key.Kind() {
		case run.RunLogKind, tryjob.TryjobExecutionLogKind:
			logKeys = append(logKeys, key)
		default:
			remainingKeys = append(remainingKeys, key)
		}
	}
	if err := datastore.Delete(ctx, logKeys); err != nil {
		return nil, err
	}
	return remainingKeys, nil
}
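
// exampleChunkedDelete is a hypothetical sketch, not used above: log
// entities are deleted outside the transaction in wipeoutRun precisely
// because there can be many of them. If a single Delete call ever became
// too large, the keys could be removed in fixed-size chunks like this
// (the 500-key chunk size is an assumption for illustration).
func exampleChunkedDelete(ctx context.Context, keys []*datastore.Key) error {
	const chunkSize = 500
	for start := 0; start < len(keys); start += chunkSize {
		end := min(start+chunkSize, len(keys))
		if err := datastore.Delete(ctx, keys[start:end]); err != nil {
			return err
		}
	}
	return nil
}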