go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/server/tq/internal/sweep/distributed.go

// Copyright 2020 The LUCI Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package sweep

import (
	"context"
	"sync"
	"sync/atomic"
	"time"

	"google.golang.org/protobuf/proto"

	"go.chromium.org/luci/common/clock"
	"go.chromium.org/luci/common/errors"
	"go.chromium.org/luci/common/logging"
	"go.chromium.org/luci/common/sync/parallel"

	"go.chromium.org/luci/server/tq/internal"
	"go.chromium.org/luci/server/tq/internal/db"
	"go.chromium.org/luci/server/tq/internal/lessor"
	"go.chromium.org/luci/server/tq/internal/partition"
	"go.chromium.org/luci/server/tq/internal/reminder"
	"go.chromium.org/luci/server/tq/internal/tqpb"
)

// Distributed implements distributed sweeping.
//
// Requires its EnqueueSweepTask callback to be configured in a way that
// enqueued tasks eventually result in an ExecSweepTask call (perhaps in a
// different process).
type Distributed struct {
	// EnqueueSweepTask submits the task for execution somewhere in the fleet.
	EnqueueSweepTask func(ctx context.Context, task *tqpb.SweepTask) error
	// Submitter is used to submit Cloud Tasks requests.
	Submitter internal.Submitter
}

// ExecSweepTask executes a previously enqueued sweep task.
//
// Note: we never want to retry a failed ExecSweepTask. These tasks fork. If we
// retry on transient errors that are not really transient, we may accidentally
// blow up with an exponential number of tasks. It is better to just wait for
// the next fresh sweep. For that reason the implementation is careful not to
// return errors marked with transient.Tag.
func (d *Distributed) ExecSweepTask(ctx context.Context, task *tqpb.SweepTask) error {
	// The corresponding DB must be registered in the process, otherwise we won't
	// know how to enumerate reminders.
	db := db.NonTxnDB(ctx, task.Db)
	if db == nil {
		return errors.Reason("no TQ db kind %q registered in the process", task.Db).Err()
	}

	// Similarly a lessor is needed for coordination.
	lessor, err := lessor.Get(ctx, task.LessorId)
	if err != nil {
		return errors.Annotate(err, "can't initialize lessor %q", task.LessorId).Err()
	}

	part, err := partition.FromString(task.Partition)
	if err != nil {
		return errors.Annotate(err, "bad task payload").Err()
	}

	// Ensure there is time to process reminders produced by the scan.
	scanTimeout := time.Minute
	if d, ok := ctx.Deadline(); ok {
		scanTimeout = d.Sub(clock.Now(ctx)) / 5
	}
	scanCtx, cancel := clock.WithTimeout(ctx, scanTimeout)
	defer cancel()

	// Discover stale reminders and a list of partitions we need to additionally
	// scan. Use the configuration passed with the task.
	reminders, followUp := Scan(scanCtx, &ScanParams{
		DB:                  db,
		Partition:           part,
		KeySpaceBytes:       int(task.KeySpaceBytes),
		TasksPerScan:        int(task.TasksPerScan),
		SecondaryScanShards: int(task.SecondaryScanShards),
		Level:               int(task.Level),
	})

	wg := sync.WaitGroup{}
	wg.Add(2)
	lerr := errors.NewLazyMultiError(2)
	go func() {
		defer wg.Done()
		lerr.Assign(0, d.enqueueFollowUp(ctx, task, followUp))
	}()
	go func() {
		defer wg.Done()
		count, err := d.processReminders(ctx, lessor, task.LeaseSectionId, db, reminders, int(task.KeySpaceBytes))
		if count > 0 { // don't spam log with zeros
			logging.Infof(ctx, "Successfully processed %d reminder(s)", count)
		}
		lerr.Assign(1, err)
	}()
	wg.Wait()

	// We don't want to return a complex error that may have transient.Tag
	// somewhere inside. See the comment above.
	if lerr.Get() != nil {
		return errors.New("the sweep finished with errors, see logs")
	}
	return nil
}

// enqueueFollowUp enqueues sweep tasks that derive from `orig`.
//
// Logs errors inside.
func (d *Distributed) enqueueFollowUp(ctx context.Context, orig *tqpb.SweepTask, parts partition.SortedPartitions) error {
	return parallel.WorkPool(16, func(work chan<- func() error) {
		for _, part := range parts {
			task := proto.Clone(orig).(*tqpb.SweepTask)
			task.Partition = part.String()
			task.Level += 1 // we need to go deeper
			work <- func() error {
				if err := d.EnqueueSweepTask(ctx, task); err != nil {
					logging.Errorf(ctx, "Failed to enqueue the follow up task %q: %s", task.Partition, err)
					return err
				}
				return nil
			}
		}
	})
}

// processReminders leases sub-ranges of the partition and processes reminders
// there.
//
// Logs errors inside. Returns the total number of successfully processed
// reminders.
func (d *Distributed) processReminders(ctx context.Context, lessor lessor.Lessor, sectionID string, db db.DB, reminders []*reminder.Reminder, keySpaceBytes int) (int, error) {
	l := len(reminders)
	if l == 0 {
		return 0, nil
	}
	desired, err := partition.SpanInclusive(reminders[0].ID, reminders[l-1].ID)
	if err != nil {
		logging.Errorf(ctx, "bug: invalid Reminder ID(s): %s", err)
		return 0, errors.Annotate(err, "invalid Reminder ID(s)").Err()
	}

	var errProcess error
	var count int
	leaseErr := lessor.WithLease(ctx, sectionID, desired, time.Minute,
		func(leaseCtx context.Context, leased partition.SortedPartitions) {
			reminders := onlyLeased(reminders, leased, keySpaceBytes)
			count, errProcess = d.processLeasedReminders(leaseCtx, db, reminders)
		})
	switch {
	case leaseErr != nil:
		logging.Errorf(ctx, "Failed to acquire the lease: %s", leaseErr)
		return 0, errors.Annotate(leaseErr, "failed to acquire the lease").Err()
	case errProcess != nil:
		return count, errors.Annotate(errProcess, "failed to process all reminders").Err()
	default:
		return count, nil
	}
}

// processLeasedReminders processes given reminders by splitting them into
// batches and calling internal.SubmitBatch for each batch.
//
// Logs errors inside. Returns the total number of successfully processed
// reminders.
func (d *Distributed) processLeasedReminders(ctx context.Context, db db.DB, reminders []*reminder.Reminder) (int, error) {
	const (
		batchWorkers = 8
		batchSize    = 50
	)
	var total int32
	err := parallel.WorkPool(batchWorkers, func(work chan<- func() error) {
		for {
			var batch []*reminder.Reminder
			switch l := len(reminders); {
			case l == 0:
				return
			case l < batchSize:
				batch, reminders = reminders, nil
			default:
				batch, reminders = reminders[:batchSize], reminders[batchSize:]
			}
			work <- func() error {
				processed, err := internal.SubmitBatch(ctx, d.Submitter, db, batch)
				atomic.AddInt32(&total, int32(processed))
				return err
			}
		}
	})
	return int(atomic.LoadInt32(&total)), err
}
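
The sketch below shows one way Distributed might be wired up, per the type's doc comment: EnqueueSweepTask must deliver the task so that some process eventually calls ExecSweepTask with the same payload, and Submitter is whatever implementation the deployment uses to submit Cloud Tasks requests. This is illustrative only; pushSweepTask is a hypothetical helper, not part of this package, and because sweep, internal, and tqpb are internal packages, such wiring can only live inside the go.chromium.org/luci module.

	package sweep_test // hypothetical location inside the luci module

	import (
		"context"

		"go.chromium.org/luci/server/tq/internal"
		"go.chromium.org/luci/server/tq/internal/sweep"
		"go.chromium.org/luci/server/tq/internal/tqpb"
	)

	// pushSweepTask is a placeholder: a real deployment would serialize the
	// task and enqueue it so that some process eventually receives it and
	// calls ExecSweepTask with the same payload.
	func pushSweepTask(ctx context.Context, t *tqpb.SweepTask) error {
		// ... enqueue t somewhere ...
		return nil
	}

	// newSweeper builds a Distributed sweeper from a Submitter implementation.
	func newSweeper(sub internal.Submitter) *sweep.Distributed {
		return &sweep.Distributed{
			EnqueueSweepTask: pushSweepTask, // must eventually lead to ExecSweepTask
			Submitter:        sub,           // used by SubmitBatch to submit Cloud Tasks
		}
	}

As ExecSweepTask's doc comment explains, failed sweep tasks are deliberately not retried (they fork, so retries could multiply), so whatever queue sits behind pushSweepTask should not need aggressive retry settings.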