go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/cv/internal/common/pubsub/pump.go (about)

     1  // Copyright 2021 The LUCI Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //      http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  // Package pubsub provides a generic way to batch pubsub pull
    16  // notifications.
    17  package pubsub
    18  
    19  import (
    20  	"context"
    21  	"sync"
    22  	"sync/atomic"
    23  	"time"
    24  
    25  	"cloud.google.com/go/pubsub"
    26  
    27  	"go.chromium.org/luci/common/clock"
    28  	"go.chromium.org/luci/common/errors"
    29  	"go.chromium.org/luci/common/logging"
    30  	"go.chromium.org/luci/common/retry/transient"
    31  	"go.chromium.org/luci/cv/internal/common"
    32  )
    33  
    34  // PullingBatchProcessor batches notifications pulled from a pubsub
    35  // subscription, and calls a custom process function on each batch.
    36  //
    37  // Provides an endpoint to be called by e.g. a cron job that starts the message
    38  // pulling and processing cycle for the specified time. (See .Process())
    39  type PullingBatchProcessor struct {
    40  	// ProcessBatch is a function to handle one batch of messages.
    41  	//
    42  	// The messages aren't yet ack-ed when they are passed to this func.
    43  	// The func is allowed to Ack or Nack them as it sees fit.
    44  	// This can be useful, for example, if processing some messages in a batch
    45  	// succeeds and fails on others.
    46  	//
    47  	// As a fail-safe, PullingBatchProcessor will **always** call Nack() or
    48  	// Ack() on all the messages after ProcessBatch completes.
    49  	// This is fine because PubSub client ignores any subsequent calls to Nack
    50  	// or Ack, thus the fail-safe won't override any Nack/Ack previously issued
    51  	// by the ProcessBatch.
    52  	//
    53  	// The fail-safe uses Nack() if the error returned by the ProcessBatch is
    54  	// transient, thus asking for re-delivery and an eventual retry.
    55  	// Ack() is used otherwise, which prevents retries on permanent errors.
    56  	ProcessBatch ProcessBatchFunc
    57  
    58  	// ProjectID is the project id for the subscription below.
    59  	ProjectID string
    60  
    61  	// SubID is the id of the pubsub subscription to pull messages from.
    62  	// It must exist.
    63  	SubID string
    64  
    65  	// Options are optional. Only nonzero fields will be applied.
    66  	Options Options
    67  }
    68  
    69  // Options control the operation of a PullingBatchProcessor.
    70  type Options struct {
    71  	// ReceiveDuration limits the duration of the Process() execution.
    72  	//
    73  	// It actually determines how long to receive pubsub messages for.
    74  	ReceiveDuration time.Duration
    75  
    76  	// MaxBatchSize limits how many messages to process in a single batch.
    77  	MaxBatchSize int
    78  
    79  	// ConcurrentBatches controls the number of batches being processed
    80  	// concurrently.
    81  	ConcurrentBatches int
    82  }
    83  
    84  // ProcessBatchFunc is the signature that the batch processing function needs
    85  // to have.
    86  type ProcessBatchFunc func(context.Context, []*pubsub.Message) error
    87  
    88  func defaultOptions() Options {
    89  	return Options{
    90  		// 5 minutes is reasonable because e.g. in AppEngine with auto scaling
    91  		// all request handlers can run for up to 10 minutes with the golang
    92  		// runtime.
    93  		//
    94  		// It also isn't too long to wait for old requests to complete when
    95  		// deploying a new version.
    96  		ReceiveDuration: 5 * time.Minute,
    97  
    98  		// 20 is large enough that the advantages of batching are clear
    99  		// e.g. a 95% reduction in fixed overhead per message
   100  		// (only true if the overhead is independent of the batch size),
   101  		// but also not so large that a list of build ids of this size is
   102  		// unwieldy to visually parse, or process manually.
   103  		MaxBatchSize: 20,
   104  
   105  		// Canonical cardinality concerning classic concurrency conundrum.
   106  		// See EWD-310 p.20
   107  		ConcurrentBatches: 5,
   108  	}
   109  }
   110  
   111  // batchErrKind is to communicate to Process() about transient/non-transient errors
   112  // in a processed batch.
   113  type batchErrKind int
   114  
   115  const (
   116  	ok    batchErrKind = iota // No errors occurred in the batch.
   117  	trans                     // Transient errors occurred in the batch, retry the whole batch.
   118  	perm                      // Permanent errors occurred in the batch, drop the whole batch.
   119  )
   120  
   121  // Process is the endpoint that (e.g. by cron job) should be periodically hit
   122  // to operate the PullingBatchProcessor.
   123  //
   124  // It creates the pubsub client and processes notifications for up to
   125  // Options.CronRunTime.
   126  func (pbp *PullingBatchProcessor) Process(ctx context.Context) error {
   127  	client, err := pubsub.NewClient(ctx, pbp.ProjectID)
   128  	if err != nil {
   129  		return err
   130  	}
   131  	defer func() {
   132  		if err := client.Close(); err != nil {
   133  			logging.Errorf(ctx, "failed to close PubSub client: %s", err)
   134  		}
   135  	}()
   136  	return pbp.process(ctx, client)
   137  }
   138  
   139  // process actually does what Process advertises, modulo creation of the
   140  // pubsub client.
   141  //
   142  // Unit tests can call this directly with a mock client.
   143  func (pbp *PullingBatchProcessor) process(ctx context.Context, client *pubsub.Client) error {
   144  	if client == nil {
   145  		return errors.New("cannot run process() without an initialized client")
   146  	}
   147  
   148  	// These are atomically incremented when batch processing results in either
   149  	// kind of error.
   150  	//
   151  	// Process() will return an error if there's one or more permanent errors.
   152  	var permanentErrorCount, transientErrorCount int32
   153  
   154  	sub := client.Subscription(pbp.SubID)
   155  	sub.ReceiveSettings.Synchronous = true
   156  	// Only lease as many messages from pubsub as we can concurrently send.
   157  	sub.ReceiveSettings.MaxOutstandingMessages = pbp.Options.MaxBatchSize * pbp.Options.ConcurrentBatches
   158  
   159  	// Get the first permanent error for surfacing details.
   160  	var firstPermErr error
   161  
   162  	workItems := make(chan *pubsub.Message)
   163  	wg := sync.WaitGroup{}
   164  	wg.Add(pbp.Options.ConcurrentBatches)
   165  	for i := 0; i < pbp.Options.ConcurrentBatches; i++ {
   166  		go func() {
   167  			defer wg.Done()
   168  			for {
   169  				batch := nextBatch(workItems, pbp.Options.MaxBatchSize)
   170  				if batch == nil {
   171  					return
   172  				}
   173  				switch status, err := pbp.onBatch(ctx, batch); status {
   174  				case perm:
   175  					if atomic.AddInt32(&permanentErrorCount, 1) == 1 {
   176  						firstPermErr = err
   177  					}
   178  				case trans:
   179  					atomic.AddInt32(&transientErrorCount, 1)
   180  				}
   181  			}
   182  		}()
   183  	}
   184  
   185  	receiveCtx, receiveCancel := clock.WithTimeout(ctx, pbp.Options.ReceiveDuration)
   186  	defer receiveCancel()
   187  	err := sub.Receive(receiveCtx, func(ctx context.Context, msg *pubsub.Message) {
   188  		workItems <- msg
   189  	})
   190  	close(workItems)
   191  	wg.Wait()
   192  	logging.Debugf(ctx, "Processed: %d batches with transient and %d with permanent errors", transientErrorCount, permanentErrorCount)
   193  
   194  	// Check receive error _after_ worker pool is done to avoid leakages.
   195  	if err != nil {
   196  		// Receive exitted due to something other than timeout or cancellation, i.e. non-retryable service error.
   197  		return errors.Annotate(err, "failed call to pubsub receive").Err()
   198  	}
   199  	if permanentErrorCount > 0 {
   200  		return errors.Reason("Process had non-transient errors. E.g. %q. Review logs for more details.", firstPermErr).Err()
   201  	}
   202  	return nil
   203  }
   204  
   205  func (opts *Options) normalize() {
   206  	defaults := defaultOptions()
   207  	if opts.ReceiveDuration == 0 {
   208  		opts.ReceiveDuration = defaults.ReceiveDuration
   209  	}
   210  	if opts.MaxBatchSize == 0 {
   211  		opts.MaxBatchSize = defaults.MaxBatchSize
   212  	}
   213  	if opts.ConcurrentBatches == 0 {
   214  		opts.ConcurrentBatches = defaults.ConcurrentBatches
   215  	}
   216  }
   217  
   218  // Validate checks missing required fields and normalizes options.
   219  func (pbp *PullingBatchProcessor) Validate() error {
   220  	if pbp.ProjectID == "" {
   221  		return errors.Reason("PullingBatchProcessor.ProjectID is required").Err()
   222  	}
   223  	if pbp.SubID == "" {
   224  		return errors.Reason("PullingBatchProcessor.SubID is required").Err()
   225  	}
   226  	if pbp.ProcessBatch == nil {
   227  		return errors.Reason("PullingBatchProcessor.ProcessBatch is required").Err()
   228  	}
   229  	if pbp.Options.ReceiveDuration < 0 {
   230  		return errors.Reason("Options.ReceiveDuration cannot be negative").Err()
   231  	}
   232  	if pbp.Options.ConcurrentBatches < 0 {
   233  		return errors.Reason("Options.ConcurrentBatches cannot be negative").Err()
   234  	}
   235  	if pbp.Options.MaxBatchSize < 0 {
   236  		return errors.Reason("Options.MaxBatchSize cannot be negative").Err()
   237  	}
   238  	pbp.Options.normalize()
   239  	return nil
   240  }
   241  
   242  func (pbp *PullingBatchProcessor) onBatch(ctx context.Context, msgs []*pubsub.Message) (batchErrKind, error) {
   243  	// Make a copy of the messages slice to prevent losing access to messages
   244  	// (and thus, the ability to ack/nack them) if ProcessBatch were to change
   245  	// the contents of the slice.
   246  	msgsCopy := append(make([]*pubsub.Message, 0, len(msgs)), msgs...)
   247  	err := pbp.ProcessBatch(ctx, msgsCopy)
   248  	// Note that ProcessBatch is allowed to ack/nack messages itself,
   249  	// for those messages, our acking/nacking below will have no effect.
   250  	switch {
   251  	case transient.Tag.In(err):
   252  		// Ask for re-delivery later.
   253  		common.LogError(ctx, errors.Annotate(err, "NACKing for redelivery").Err())
   254  		nackAll(msgs)
   255  		return trans, err
   256  	case err != nil:
   257  		common.LogError(ctx, errors.Annotate(err, "ACKing to avoid retries").Err())
   258  		ackAll(msgs)
   259  		return perm, err
   260  	default:
   261  		ackAll(msgs)
   262  		return ok, err
   263  	}
   264  }
   265  
   266  // nextBatch pulls up to n immediately available items from c.
   267  //
   268  // It blocks until at least one item is available.
   269  // If called with a closed channel, will return nil.
   270  func nextBatch(c <-chan *pubsub.Message, n int) []*pubsub.Message {
   271  	msg, stillOpen := <-c
   272  	if !stillOpen {
   273  		return nil
   274  	}
   275  	out := append(make([]*pubsub.Message, 0, n), msg)
   276  	for len(out) < n {
   277  		select {
   278  		case msg, stillOpen := <-c:
   279  			if !stillOpen {
   280  				return out
   281  			}
   282  			out = append(out, msg)
   283  		default:
   284  			return out
   285  		}
   286  	}
   287  	return out
   288  }
   289  
   290  func ackAll(msgs []*pubsub.Message) {
   291  	for _, msg := range msgs {
   292  		msg.Ack()
   293  	}
   294  }
   295  
   296  func nackAll(msgs []*pubsub.Message) {
   297  	for _, msg := range msgs {
   298  		msg.Nack()
   299  	}
   300  }