go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/cv/internal/retention/tryjob.go (about) 1 // Copyright 2024 The LUCI Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package retention 16 17 import ( 18 "context" 19 "strconv" 20 21 "google.golang.org/protobuf/proto" 22 23 "go.chromium.org/luci/common/clock" 24 "go.chromium.org/luci/common/errors" 25 "go.chromium.org/luci/common/logging" 26 "go.chromium.org/luci/common/retry" 27 "go.chromium.org/luci/common/retry/transient" 28 "go.chromium.org/luci/common/sync/parallel" 29 "go.chromium.org/luci/gae/service/datastore" 30 "go.chromium.org/luci/server/tq" 31 32 "go.chromium.org/luci/cv/internal/common" 33 "go.chromium.org/luci/cv/internal/run" 34 "go.chromium.org/luci/cv/internal/tryjob" 35 ) 36 37 // tryjobsPerTask controls how many tryjobs to wipe out per TQ task. 38 const tryjobsPerTask = 800 39 40 // scheduleWipeoutTryjobsTasks schedules tasks to wipe out old tryjobs that are 41 // out of the retention period. 42 // 43 // The tasks will be uniformly distributed over the next 1 hour. 
44 func scheduleWipeoutTryjobsTasks(ctx context.Context, tqd *tq.Dispatcher) error { 45 tryjobs, err := tryjob.QueryTryjobIDsUpdatedBefore(ctx, clock.Now(ctx).Add(-retentionPeriod)) 46 switch { 47 case err != nil: 48 return err 49 case len(tryjobs) == 0: 50 logging.Infof(ctx, "no tryjobs to wipe out") 51 return nil 52 } 53 54 logging.Infof(ctx, "schedule tasks to wipeout %d tryjobs", len(tryjobs)) 55 return parallel.WorkPool(min(10, len(tryjobs)/tryjobsPerTask), func(workCh chan<- func() error) { 56 for _, chunk := range chunk(tryjobs, tryjobsPerTask) { 57 tryjobIDStrs := make([]string, len(chunk)) 58 for i, tjID := range chunk { 59 tryjobIDStrs[i] = strconv.FormatInt(int64(tjID), 10) 60 } 61 task := &tq.Task{ 62 Payload: &WipeoutTryjobsTask{ 63 Ids: common.TryjobIDs(chunk).ToInt64(), 64 }, 65 Delay: common.DistributeOffset(wipeoutTasksDistInterval, tryjobIDStrs...), 66 } 67 workCh <- func() error { 68 return retry.Retry(ctx, retry.Default, func() error { 69 return tqd.AddTask(ctx, task) 70 }, nil) 71 } 72 } 73 }) 74 } 75 76 func registerWipeoutTryjobsTask(tqd *tq.Dispatcher) { 77 tqd.RegisterTaskClass(tq.TaskClass{ 78 ID: "wipeout-tryjobs", 79 Queue: "data-retention", 80 Prototype: &WipeoutTryjobsTask{}, 81 Kind: tq.NonTransactional, 82 Quiet: true, 83 QuietOnError: true, 84 Handler: func(ctx context.Context, payload proto.Message) error { 85 task := payload.(*WipeoutTryjobsTask) 86 err := wipeoutTryjobs(ctx, common.MakeTryjobIDs(task.GetIds()...)) 87 return common.TQifyError(ctx, err) 88 }, 89 }) 90 } 91 92 // wipeoutTryjobs wipes out the provided tryjobs. 
93 func wipeoutTryjobs(ctx context.Context, ids common.TryjobIDs) error { 94 tryjobs, err := loadTryjobsIgnoreMissing(ctx, ids) 95 if err != nil { 96 return err 97 } 98 return parallel.WorkPool(min(10, len(tryjobs)), func(workCh chan<- func() error) { 99 for _, tj := range tryjobs { 100 tj := tj 101 workCh <- func() error { 102 return wipeoutTryjob(ctx, tj) 103 } 104 } 105 }) 106 } 107 108 func loadTryjobsIgnoreMissing(ctx context.Context, ids common.TryjobIDs) ([]*tryjob.Tryjob, error) { 109 tryjobs := make([]*tryjob.Tryjob, len(ids)) 110 for i, tjID := range ids { 111 tryjobs[i] = &tryjob.Tryjob{ID: tjID} 112 } 113 err := datastore.Get(ctx, tryjobs) 114 var merrs errors.MultiError 115 switch { 116 case err == nil: 117 return tryjobs, nil 118 case errors.As(err, &merrs): 119 ret := tryjobs[:0] // reuse the same slice 120 for i, err := range merrs { 121 switch { 122 case err == nil: 123 ret = append(ret, tryjobs[i]) 124 case !errors.Is(err, datastore.ErrNoSuchEntity): 125 count, err := merrs.Summary() 126 return nil, errors.Annotate(err, "failed to load %d out of %d tryjobs", count, len(ids)).Tag(transient.Tag).Err() 127 } 128 } 129 return ret, nil 130 default: 131 return nil, errors.Annotate(err, "failed to load tryjobs").Tag(transient.Tag).Err() 132 } 133 } 134 135 // wipeoutTryjob wipes out the provided tryjob if all runs that use this tryjob 136 // no longer exists. 
137 func wipeoutTryjob(ctx context.Context, tj *tryjob.Tryjob) error { 138 ctx = logging.SetField(ctx, "tryjob", tj.ID) 139 140 var runs []any 141 for _, rid := range tj.AllWatchingRuns() { 142 runs = append(runs, &run.Run{ID: rid}) 143 } 144 if len(runs) > 0 { 145 switch res, err := datastore.Exists(ctx, runs...); { 146 case err != nil: 147 return errors.Annotate(err, "failed to check the existence of runs for Tryjob %d", tj.ID).Tag(transient.Tag).Err() 148 case res.Any(): 149 logging.Warningf(ctx, "WipeoutTryjob: skip wipeout because some run(s) using this tryjob still exists") 150 return nil 151 } 152 } 153 154 if err := tryjob.CondDelete(ctx, tj.ID, tj.EVersion); err != nil { 155 return err 156 } 157 logging.Infof(ctx, "successfully wiped out tryjob %d", tj.ID) 158 return nil 159 }