go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/server/tq/inproc.go

// Copyright 2020 The LUCI Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package tq

import (
	"context"
	"sync"
	"sync/atomic"
	"time"

	"go.chromium.org/luci/common/clock"
	"go.chromium.org/luci/common/errors"
	"go.chromium.org/luci/common/logging"

	"go.chromium.org/luci/server/tq/internal/db"
	"go.chromium.org/luci/server/tq/internal/metrics"
	"go.chromium.org/luci/server/tq/internal/partition"
	"go.chromium.org/luci/server/tq/internal/sweep"
	"go.chromium.org/luci/server/tq/internal/workset"
)

// InProcSweeperOptions is configuration for the process of "sweeping" of
// transactional task reminders performed centrally in the current process.
type InProcSweeperOptions struct {
	// SweepShards defines how many concurrent sweeping jobs to run.
	//
	// Default is 16.
	SweepShards int

	// TasksPerScan caps the maximum number of tasks that a sweep job will
	// process.
	//
	// Defaults to 2048.
	TasksPerScan int

	// SecondaryScanShards caps the sharding of additional sweep scans to be
	// performed if the initial scan didn't cover the whole assigned partition.
	// In practice, this matters only when the database is slow or there is a
	// huge backlog.
	//
	// Defaults to 16.
	SecondaryScanShards int

	// SubmitBatchSize limits the size of a single processed batch.
	//
	// When processing a batch, the sweeper loads bodies of all tasks in
	// the batch, thus this setting directly affects memory usage. There will
	// be at most SubmitBatchSize*SubmitConcurrentBatches task bodies worked on
	// at any moment in time.
	//
	// Default is 512.
	SubmitBatchSize int

	// SubmitConcurrentBatches limits how many submit batches can be worked on
	// concurrently.
	//
	// Default is 8.
	SubmitConcurrentBatches int
}

// NewInProcSweeper creates a sweeper that performs sweeping in the current
// process whenever Sweep is called.
func NewInProcSweeper(opts InProcSweeperOptions) Sweeper {
	if opts.SweepShards == 0 {
		opts.SweepShards = 16
	}
	if opts.TasksPerScan == 0 {
		opts.TasksPerScan = 2048
	}
	if opts.SecondaryScanShards == 0 {
		opts.SecondaryScanShards = 16
	}
	if opts.SubmitBatchSize == 0 {
		opts.SubmitBatchSize = 512
	}
	if opts.SubmitConcurrentBatches == 0 {
		opts.SubmitConcurrentBatches = 8
	}
	return &inprocSweeper{opts: opts}
}
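
// A minimal usage sketch (illustrative only): a server wanting in-process
// sweeping could construct the sweeper with a couple of overridden limits and
// attach it to its dispatcher. The `disp` variable and the Dispatcher.Sweeper
// wiring below are assumptions about the caller's setup, not something this
// file defines; unset options fall back to the defaults above.
//
//	sweeper := tq.NewInProcSweeper(tq.InProcSweeperOptions{
//		SubmitBatchSize:         256, // smaller batches => lower peak memory
//		SubmitConcurrentBatches: 4,
//	})
//	disp.Sweeper = sweeper // disp is assumed to be a *tq.Dispatcher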

// inprocSweeper implements the Sweeper interface.
type inprocSweeper struct {
	opts    InProcSweeperOptions
	running int32 // 1 if already running a sweep
}

// sweep performs as much of the sweep as possible.
//
// Logs internal errors and carries on.
func (s *inprocSweeper) sweep(ctx context.Context, sub Submitter, reminderKeySpaceBytes int) error {
	if !atomic.CompareAndSwapInt32(&s.running, 0, 1) {
		return errors.New("a sweep is already running")
	}
	defer atomic.StoreInt32(&s.running, 0)

	// We'll sweep all known DBs and have a BatchProcessor per DB kind for
	// processing reminders in this DB.
	procs := map[string]*sweep.BatchProcessor{}
	for _, kind := range db.Kinds() {
		proc := &sweep.BatchProcessor{
			Context:           logging.SetField(ctx, "db", kind),
			DB:                db.NonTxnDB(ctx, kind),
			Submitter:         sub,
			BatchSize:         s.opts.SubmitBatchSize,
			ConcurrentBatches: s.opts.SubmitConcurrentBatches,
		}
		if err := proc.Start(); err != nil {
			for _, running := range procs {
				running.Stop()
			}
			return err
		}
		procs[kind] = proc
	}

	start := clock.Now(ctx)
	defer func() {
		dur := clock.Now(ctx).Sub(start)
		metrics.InprocSweepDurationMS.Add(ctx, float64(dur.Milliseconds()))
	}()

	// Seed all future work: SweepShards scans per DB kind.
	partitions := partition.Universe(reminderKeySpaceBytes).Split(s.opts.SweepShards)
	initial := make([]workset.Item, 0, len(procs)*len(partitions))
	for _, proc := range procs {
		for _, p := range partitions {
			initial = append(initial, &sweep.ScanParams{
				DB:                  proc.DB,
				Partition:           p,
				KeySpaceBytes:       reminderKeySpaceBytes,
				TasksPerScan:        s.opts.TasksPerScan,
				SecondaryScanShards: s.opts.SecondaryScanShards,
				Level:               0,
			})
		}
	}
	work := workset.New(initial, nil)

	// Run `SweepShards` workers (even if serving multiple DBs) that do scans of
	// whatever partitions need scanning. Each will feed produced reminders into
	// a BatchProcessor which will batch-process them.
	wg := sync.WaitGroup{}
	wg.Add(s.opts.SweepShards)
	for i := 0; i < s.opts.SweepShards; i++ {
		go func() {
			defer wg.Done()
			// Pick up some random partition we haven't scanned yet. Scan it, and
			// enqueue all follow ups. Do this until the queue is empty and all
			// scan workers are done.
			for {
				item, done := work.Pop(ctx)
				if item == nil {
					return // no more work or the context is done
				}
				params := item.(*sweep.ScanParams)
				var followUp []workset.Item
				for _, part := range s.scan(ctx, params, procs[params.DB.Kind()]) {
					params := *params
					params.Partition = part
					params.Level += 1 // we need to go deeper
					followUp = append(followUp, &params)
				}
				done(followUp)
			}
		}()
	}
	wg.Wait()

	// At this point all scanners are done, but BatchProcessors may still be
	// working. Drain them.
	for _, proc := range procs {
		if count := proc.Stop(); count != 0 {
			logging.Infof(proc.Context, "Successfully processed %d reminder(s)", count)
		}
	}

	return ctx.Err()
}
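
// To make the flow above concrete with the default options and a single DB
// kind: a sweep starts with 16 seed partitions (SweepShards) scanned by 16
// worker goroutines; each scan reads at most 2048 reminders (TasksPerScan),
// and any part of a partition the scan didn't cover comes back as up to 16
// smaller follow-up partitions (SecondaryScanShards) scanned at the next
// level, until no follow-ups remain. The numbers here just restate the
// defaults documented in InProcSweeperOptions.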

// scan scans a single partition.
//
// Enqueues discovered reminders for processing into a batch processor. Returns
// a list of partitions to scan next.
//
// Logs errors but otherwise ignores them.
func (s *inprocSweeper) scan(ctx context.Context, p *sweep.ScanParams, proc *sweep.BatchProcessor) []*partition.Partition {
	logging.Infof(ctx, "Sweeping (level %d): %s", p.Level, p.Partition)

	// Don't block for too long in a single scan.
	scanCtx, cancel := clock.WithTimeout(ctx, time.Minute)
	defer cancel()
	reminders, followUp := sweep.Scan(scanCtx, p)

	// Feed all reminders to a batching processor. This blocks if the processor
	// is already filled to the limit. This is what we want, to avoid OOMs.
	proc.Enqueue(ctx, reminders)

	return followUp
}
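
// The blocking Enqueue call above is a backpressure mechanism: once the
// processor's internal queue is full, scanners stall instead of piling up
// task bodies in memory. As a rough illustration of the general pattern (not
// how sweep.BatchProcessor is actually implemented), a bounded channel gives
// the same behavior:
//
//	queue := make(chan reminderBatch, maxPending) // hypothetical type and cap
//	queue <- batch // blocks once maxPending batches are already buffered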