go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/server/tq/internal/sweep/distributed.go

// Copyright 2020 The LUCI Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package sweep

import (
	"context"
	"sync"
	"sync/atomic"
	"time"

	"google.golang.org/protobuf/proto"

	"go.chromium.org/luci/common/clock"
	"go.chromium.org/luci/common/errors"
	"go.chromium.org/luci/common/logging"
	"go.chromium.org/luci/common/sync/parallel"

	"go.chromium.org/luci/server/tq/internal"
	"go.chromium.org/luci/server/tq/internal/db"
	"go.chromium.org/luci/server/tq/internal/lessor"
	"go.chromium.org/luci/server/tq/internal/partition"
	"go.chromium.org/luci/server/tq/internal/reminder"
	"go.chromium.org/luci/server/tq/internal/tqpb"
)

// Distributed implements distributed sweeping.
//
// Requires its EnqueueSweepTask callback to be configured in a way that
// enqueued tasks eventually result in an ExecSweepTask call (perhaps in a
// different process).
type Distributed struct {
	// EnqueueSweepTask submits the task for execution somewhere in the fleet.
	EnqueueSweepTask func(ctx context.Context, task *tqpb.SweepTask) error
	// Submitter is used to submit Cloud Tasks requests.
	Submitter internal.Submitter
}

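// A minimal wiring sketch (hypothetical, not part of this package): the
// callback only has to serialize the task and route it to some queue whose
// handler eventually calls ExecSweepTask with the same payload. The
// `myQueue.Enqueue` call below is an assumed stand-in for that mechanism:
//
//	d := &Distributed{
//		Submitter: submitter,
//		EnqueueSweepTask: func(ctx context.Context, task *tqpb.SweepTask) error {
//			blob, err := proto.Marshal(task)
//			if err != nil {
//				return err
//			}
//			// Delivery must eventually invoke d.ExecSweepTask(ctx, task).
//			return myQueue.Enqueue(ctx, "tq-sweep", blob)
//		},
//	}
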
// ExecSweepTask executes a previously enqueued sweep task.
//
// Note: we never want to retry a failed ExecSweepTask. These tasks fork. If we
// retry on transient errors that are not really transient we may accidentally
// blow up with an exponential number of tasks. Better to just wait for the
// next fresh sweep. For that reason the implementation is careful not to
// return errors marked with transient.Tag.
func (d *Distributed) ExecSweepTask(ctx context.Context, task *tqpb.SweepTask) error {
	// The corresponding DB must be registered in the process, otherwise we won't
	// know how to enumerate reminders.
	db := db.NonTxnDB(ctx, task.Db)
	if db == nil {
		return errors.Reason("no TQ db kind %q registered in the process", task.Db).Err()
	}

	// Similarly a lessor is needed for coordination.
	lessor, err := lessor.Get(ctx, task.LessorId)
	if err != nil {
		return errors.Annotate(err, "can't initialize lessor %q", task.LessorId).Err()
	}

	part, err := partition.FromString(task.Partition)
	if err != nil {
		return errors.Annotate(err, "bad task payload").Err()
	}

	// Ensure there is time to process reminders produced by the scan.
	scanTimeout := time.Minute
	if d, ok := ctx.Deadline(); ok {
		scanTimeout = d.Sub(clock.Now(ctx)) / 5
	}
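	// E.g. with 5 min left until the deadline the scan gets 1 min, leaving
	// ~4 min for enqueueing follow-up tasks and submitting reminders.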
	scanCtx, cancel := clock.WithTimeout(ctx, scanTimeout)
	defer cancel()

	// Discover stale reminders and a list of partitions we need to additionally
	// scan. Use the configuration passed with the task.
	reminders, followUp := Scan(scanCtx, &ScanParams{
		DB:                  db,
		Partition:           part,
		KeySpaceBytes:       int(task.KeySpaceBytes),
		TasksPerScan:        int(task.TasksPerScan),
		SecondaryScanShards: int(task.SecondaryScanShards),
		Level:               int(task.Level),
	})

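	// Fork: one goroutine enqueues follow-up sweeps of the discovered
	// sub-partitions, the other processes the reminders found by this scan.
	// Both must finish before we can report the overall result.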
	wg := sync.WaitGroup{}
	wg.Add(2)
	lerr := errors.NewLazyMultiError(2)
	go func() {
		defer wg.Done()
		lerr.Assign(0, d.enqueueFollowUp(ctx, task, followUp))
	}()
	go func() {
		defer wg.Done()
		count, err := d.processReminders(ctx, lessor, task.LeaseSectionId, db, reminders, int(task.KeySpaceBytes))
		if count > 0 { // don't spam the log with zeros
			logging.Infof(ctx, "Successfully processed %d reminder(s)", count)
		}
		lerr.Assign(1, err)
	}()
	wg.Wait()

	// We don't want to return a complex error that may have transient.Tag
	// somewhere inside. See the comment above.
	if lerr.Get() != nil {
		return errors.New("the sweep finished with errors, see logs")
	}
	return nil
}

// enqueueFollowUp enqueues sweep tasks that derive from `orig`.
//
// Logs errors inside.
func (d *Distributed) enqueueFollowUp(ctx context.Context, orig *tqpb.SweepTask, parts partition.SortedPartitions) error {
	return parallel.WorkPool(16, func(work chan<- func() error) {
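		// Enqueue up to 16 follow-up tasks concurrently. Each failure is
		// logged here; the caller gets a single aggregated error.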
		for _, part := range parts {
			task := proto.Clone(orig).(*tqpb.SweepTask)
			task.Partition = part.String()
			task.Level += 1 // we need to go deeper
			work <- func() error {
				if err := d.EnqueueSweepTask(ctx, task); err != nil {
					logging.Errorf(ctx, "Failed to enqueue the follow up task %q: %s", task.Partition, err)
					return err
				}
				return nil
			}
		}
	})
}

// processReminders leases sub-ranges of the partition and processes reminders
// there.
//
// Logs errors inside. Returns the total number of successfully processed
// reminders.
func (d *Distributed) processReminders(ctx context.Context, lessor lessor.Lessor, sectionID string, db db.DB, reminders []*reminder.Reminder, keySpaceBytes int) (int, error) {
	l := len(reminders)
	if l == 0 {
		return 0, nil
	}
	desired, err := partition.SpanInclusive(reminders[0].ID, reminders[l-1].ID)
	if err != nil {
		logging.Errorf(ctx, "bug: invalid Reminder ID(s): %s", err)
		return 0, errors.Annotate(err, "invalid Reminder ID(s)").Err()
	}

	var errProcess error
	var count int
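	// Only reminders that fall into sub-partitions we actually managed to
	// lease are processed; the rest are left for a future sweep to pick up.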
	leaseErr := lessor.WithLease(ctx, sectionID, desired, time.Minute,
		func(leaseCtx context.Context, leased partition.SortedPartitions) {
			reminders := onlyLeased(reminders, leased, keySpaceBytes)
			count, errProcess = d.processLeasedReminders(leaseCtx, db, reminders)
		})
	switch {
	case leaseErr != nil:
		logging.Errorf(ctx, "Failed to acquire the lease: %s", leaseErr)
		return 0, errors.Annotate(leaseErr, "failed to acquire the lease").Err()
	case errProcess != nil:
		return count, errors.Annotate(errProcess, "failed to process all reminders").Err()
	default:
		return count, nil
	}
}

// processLeasedReminders processes given reminders by splitting them into
// batches and calling internal.SubmitBatch for each batch.
//
// Logs errors inside. Returns the total number of successfully processed
// reminders.
func (d *Distributed) processLeasedReminders(ctx context.Context, db db.DB, reminders []*reminder.Reminder) (int, error) {
	const (
		batchWorkers = 8
		batchSize    = 50
	)
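	// E.g. 120 leased reminders become batches of 50, 50 and 20, submitted by
	// up to 8 workers running concurrently.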
	var total int32
	err := parallel.WorkPool(batchWorkers, func(work chan<- func() error) {
		for {
			var batch []*reminder.Reminder
			switch l := len(reminders); {
			case l == 0:
				return
			case l < batchSize:
				batch, reminders = reminders, nil
			default:
				batch, reminders = reminders[:batchSize], reminders[batchSize:]
			}
			work <- func() error {
				processed, err := internal.SubmitBatch(ctx, d.Submitter, db, batch)
				atomic.AddInt32(&total, int32(processed))
				return err
			}
		}
	})
	return int(atomic.LoadInt32(&total)), err
}