go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/server/tq/inproc.go

// Copyright 2020 The LUCI Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package tq

import (
	"context"
	"sync"
	"sync/atomic"
	"time"

	"go.chromium.org/luci/common/clock"
	"go.chromium.org/luci/common/errors"
	"go.chromium.org/luci/common/logging"

	"go.chromium.org/luci/server/tq/internal/db"
	"go.chromium.org/luci/server/tq/internal/metrics"
	"go.chromium.org/luci/server/tq/internal/partition"
	"go.chromium.org/luci/server/tq/internal/sweep"
	"go.chromium.org/luci/server/tq/internal/workset"
)

// InProcSweeperOptions is configuration for the process of "sweeping" of
// transactional task reminders performed centrally in the current process.
type InProcSweeperOptions struct {
	// SweepShards defines how many concurrent sweeping jobs to run.
	//
	// Default is 16.
	SweepShards int

	// TasksPerScan caps the maximum number of tasks that a sweep job will
	// process.
	//
	// Defaults to 2048.
	TasksPerScan int

	// SecondaryScanShards caps the sharding of additional sweep scans to be
	// performed if the initial scan didn't cover the whole assigned partition.
	// In practice, this matters only when the database is slow or there is a
	// huge backlog.
	//
	// Defaults to 16.
	SecondaryScanShards int

	// SubmitBatchSize limits the size of a single processed batch.
	//
	// When processing a batch, the sweeper loads bodies of all tasks in
	// the batch, thus this setting directly affects memory usage. There will
	// be at most SubmitBatchSize*SubmitConcurrentBatches task bodies worked-on
	// at any moment in time.
	//
	// Default is 512.
	SubmitBatchSize int

	// SubmitConcurrentBatches limits how many submit batches can be worked on
	// concurrently.
	//
	// Default is 8.
	SubmitConcurrentBatches int
}
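
// For reference: with all defaults a single sweep keeps at most
// SubmitBatchSize*SubmitConcurrentBatches = 512*8 = 4096 task bodies in
// memory at once, fanned out across SweepShards = 16 scanning workers.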

// NewInProcSweeper creates a sweeper that performs sweeping in the current
// process whenever Sweep is called.
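//
// A minimal usage sketch, assuming the sweeper is installed on a Dispatcher
// via its Sweeper field and triggered through its Sweep method (e.g. from a
// cron handler); adapt this to however sweeps are initiated in your setup:
//
//	d := &Dispatcher{}
//	d.Sweeper = NewInProcSweeper(InProcSweeperOptions{
//		SweepShards:  8,    // fewer concurrent scans than the default 16
//		TasksPerScan: 1024, // smaller scans, more secondary follow-ups
//	})
//	// ...later, e.g. from a cron handler:
//	if err := d.Sweep(ctx); err != nil {
//		logging.Errorf(ctx, "Sweep failed: %s", err)
//	}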
func NewInProcSweeper(opts InProcSweeperOptions) Sweeper {
	if opts.SweepShards == 0 {
		opts.SweepShards = 16
	}
	if opts.TasksPerScan == 0 {
		opts.TasksPerScan = 2048
	}
	if opts.SecondaryScanShards == 0 {
		opts.SecondaryScanShards = 16
	}
	if opts.SubmitBatchSize == 0 {
		opts.SubmitBatchSize = 512
	}
	if opts.SubmitConcurrentBatches == 0 {
		opts.SubmitConcurrentBatches = 8
	}
	return &inprocSweeper{opts: opts}
}

// inprocSweeper implements the Sweeper interface.
type inprocSweeper struct {
	opts    InProcSweeperOptions
	running int32 // 1 if a sweep is already running
}

// sweep performs as much of the sweep as possible.
//
// Logs internal errors and carries on.
func (s *inprocSweeper) sweep(ctx context.Context, sub Submitter, reminderKeySpaceBytes int) error {
	if !atomic.CompareAndSwapInt32(&s.running, 0, 1) {
		return errors.New("a sweep is already running")
	}
	defer atomic.StoreInt32(&s.running, 0)

	// We'll sweep all known DBs and have a BatchProcessor per DB kind for
	// processing reminders in this DB.
	procs := map[string]*sweep.BatchProcessor{}
	for _, kind := range db.Kinds() {
		proc := &sweep.BatchProcessor{
			Context:           logging.SetField(ctx, "db", kind),
			DB:                db.NonTxnDB(ctx, kind),
			Submitter:         sub,
			BatchSize:         s.opts.SubmitBatchSize,
			ConcurrentBatches: s.opts.SubmitConcurrentBatches,
		}
		if err := proc.Start(); err != nil {
			for _, running := range procs {
				running.Stop()
			}
			return err
		}
		procs[kind] = proc
	}

	start := clock.Now(ctx)
	defer func() {
		dur := clock.Now(ctx).Sub(start)
		metrics.InprocSweepDurationMS.Add(ctx, float64(dur.Milliseconds()))
	}()

	// Seed all future work: SweepShards scans per DB kind.
	partitions := partition.Universe(reminderKeySpaceBytes).Split(s.opts.SweepShards)
	initial := make([]workset.Item, 0, len(procs)*len(partitions))
	for _, proc := range procs {
		for _, p := range partitions {
			initial = append(initial, &sweep.ScanParams{
				DB:                  proc.DB,
				Partition:           p,
				KeySpaceBytes:       reminderKeySpaceBytes,
				TasksPerScan:        s.opts.TasksPerScan,
				SecondaryScanShards: s.opts.SecondaryScanShards,
				Level:               0,
			})
		}
	}
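	// The workset hands these items out to workers via Pop; each done(followUp)
	// call retires an item and enqueues its follow-ups. Pop returns nil only
	// once the set is fully drained (or the context is canceled).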
	work := workset.New(initial, nil)

	// Run `SweepShards` workers (even if serving multiple DBs) that do scans of
	// whatever partitions need scanning. Each will feed produced reminders into
	// a BatchProcessor which will batch-process them.
	wg := sync.WaitGroup{}
	wg.Add(s.opts.SweepShards)
	for i := 0; i < s.opts.SweepShards; i++ {
		go func() {
			defer wg.Done()
			// Pick up some random partition we haven't scanned yet. Scan it, and
			// enqueue all follow-ups. Do this until the queue is empty and all
			// scan workers are done.
			for {
				item, done := work.Pop(ctx)
				if item == nil {
					return // no more work or the context is done
				}
				params := item.(*sweep.ScanParams)
				var followUp []workset.Item
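				// A non-empty result from scan means the partition wasn't fully
				// covered (e.g. the TasksPerScan cap was hit): re-enqueue the
				// remaining sub-partitions to be scanned one level deeper.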
				for _, part := range s.scan(ctx, params, procs[params.DB.Kind()]) {
					params := *params
					params.Partition = part
					params.Level += 1 // we need to go deeper
					followUp = append(followUp, &params)
				}
				done(followUp)
			}
		}()
	}
	wg.Wait()

	// At this point all scanners are done, but BatchProcessors may still be
	// working. Drain them.
	for _, proc := range procs {
		if count := proc.Stop(); count != 0 {
			logging.Infof(proc.Context, "Successfully processed %d reminder(s)", count)
		}
	}

	return ctx.Err()
}

// scan scans a single partition.
//
// Enqueues discovered reminders for processing into a batch processor. Returns
// a list of partitions to scan next.
//
// Logs errors but otherwise ignores them.
func (s *inprocSweeper) scan(ctx context.Context, p *sweep.ScanParams, proc *sweep.BatchProcessor) []*partition.Partition {
	logging.Infof(ctx, "Sweeping (level %d): %s", p.Level, p.Partition)

	// Don't block for too long in a single scan.
	scanCtx, cancel := clock.WithTimeout(ctx, time.Minute)
	defer cancel()
	reminders, followUp := sweep.Scan(scanCtx, p)

	// Feed all reminders to the batching processor. This blocks if the processor
	// is already filled to its limit, which is what we want in order to avoid
	// OOMs.
	proc.Enqueue(ctx, reminders)

	return followUp
}