go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/cv/internal/common/pubsub/pump.go

// Copyright 2021 The LUCI Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package pubsub provides a generic way to batch pubsub pull
// notifications.
package pubsub

import (
	"context"
	"sync"
	"sync/atomic"
	"time"

	"cloud.google.com/go/pubsub"

	"go.chromium.org/luci/common/clock"
	"go.chromium.org/luci/common/errors"
	"go.chromium.org/luci/common/logging"
	"go.chromium.org/luci/common/retry/transient"
	"go.chromium.org/luci/cv/internal/common"
)

// PullingBatchProcessor batches notifications pulled from a pubsub
// subscription, and calls a custom process function on each batch.
//
// It provides an endpoint, meant to be hit e.g. by a cron job, that starts the
// message pulling and processing cycle for the configured duration.
// (See Process().)
type PullingBatchProcessor struct {
	// ProcessBatch is a function to handle one batch of messages.
	//
	// The messages aren't yet ack-ed when they are passed to this func.
	// The func is allowed to Ack or Nack them as it sees fit.
	// This can be useful, for example, if processing succeeds for some
	// messages in a batch and fails for others.
	//
	// As a fail-safe, PullingBatchProcessor will **always** call Nack() or
	// Ack() on all the messages after ProcessBatch completes.
	// This is fine because the PubSub client ignores any subsequent calls to
	// Nack or Ack, so the fail-safe won't override any Nack/Ack previously
	// issued by ProcessBatch.
	//
	// The fail-safe uses Nack() if the error returned by ProcessBatch is
	// transient, thus asking for re-delivery and an eventual retry.
	// Ack() is used otherwise, which prevents retries on permanent errors.
	ProcessBatch ProcessBatchFunc

	// ProjectID is the project id for the subscription below.
	ProjectID string

	// SubID is the id of the pubsub subscription to pull messages from.
	// It must exist.
	SubID string

	// Options are optional. Only nonzero fields will be applied; zero fields
	// fall back to the defaults (see defaultOptions()).
	Options Options
}

// Options control the operation of a PullingBatchProcessor.
type Options struct {
	// ReceiveDuration limits the duration of the Process() execution.
	//
	// It actually determines how long to receive pubsub messages for.
	ReceiveDuration time.Duration

	// MaxBatchSize limits how many messages to process in a single batch.
	MaxBatchSize int

	// ConcurrentBatches controls the number of batches being processed
	// concurrently.
	ConcurrentBatches int
}

// ProcessBatchFunc is the signature that the batch processing function needs
// to have.
type ProcessBatchFunc func(context.Context, []*pubsub.Message) error
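
// What follows is an illustrative sketch, not part of the original file: a
// ProcessBatchFunc that acks each message it manages to handle and returns a
// transient error so that the fail-safe described above nacks the remaining
// messages for re-delivery. handleOne is a hypothetical per-message handler;
// only the Ack/Nack and transient.Tag semantics come from the documentation
// above.
//
//	func processBuilds(ctx context.Context, msgs []*pubsub.Message) error {
//		var firstErr error
//		for _, m := range msgs {
//			if err := handleOne(ctx, m); err != nil {
//				if firstErr == nil {
//					firstErr = err
//				}
//				continue // leave un-acked; the fail-safe will Nack it.
//			}
//			m.Ack() // handled; later fail-safe calls on this message are no-ops.
//		}
//		if firstErr != nil {
//			// Tag as transient so the fail-safe Nacks (rather than Acks and
//			// drops) the messages that weren't acked above.
//			return transient.Tag.Apply(firstErr)
//		}
//		return nil
//	}
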
func defaultOptions() Options {
	return Options{
		// 5 minutes is reasonable because e.g. on AppEngine with automatic
		// scaling, request handlers can run for up to 10 minutes with the Go
		// runtime.
		//
		// It also isn't too long to wait for old requests to complete when
		// deploying a new version.
		ReceiveDuration: 5 * time.Minute,

		// 20 is large enough that the advantages of batching are clear,
		// e.g. a 95% reduction in fixed overhead per message
		// (only true if the overhead is independent of the batch size),
		// but also not so large that a list of build ids of this size is
		// unwieldy to visually parse, or to process manually.
		MaxBatchSize: 20,

		// Canonical cardinality concerning classic concurrency conundrum.
		// See EWD-310 p.20.
		ConcurrentBatches: 5,
	}
}

// batchErrKind communicates to Process() whether a processed batch had
// transient or permanent errors.
type batchErrKind int

const (
	ok    batchErrKind = iota // No errors occurred in the batch.
	trans                     // Transient errors occurred in the batch, retry the whole batch.
	perm                      // Permanent errors occurred in the batch, drop the whole batch.
)

// Process is the endpoint that should be hit periodically (e.g. by a cron job)
// to operate the PullingBatchProcessor.
//
// It creates the pubsub client and processes notifications for up to
// Options.ReceiveDuration.
func (pbp *PullingBatchProcessor) Process(ctx context.Context) error {
	client, err := pubsub.NewClient(ctx, pbp.ProjectID)
	if err != nil {
		return err
	}
	defer func() {
		if err := client.Close(); err != nil {
			logging.Errorf(ctx, "failed to close PubSub client: %s", err)
		}
	}()
	return pbp.process(ctx, client)
}
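
// The wiring below is an illustrative sketch, not part of the original file:
// construct and validate the processor once at startup, then call Process from
// whatever periodic trigger the server exposes (a cron handler, a ticker,
// etc.). The project and subscription names are hypothetical; processBuilds is
// the sketch above.
//
//	pbp := &PullingBatchProcessor{
//		ProcessBatch: processBuilds,
//		ProjectID:    "example-project",
//		SubID:        "example-subscription",
//		// Zero-valued Options fields are filled with defaults by Validate().
//	}
//	if err := pbp.Validate(); err != nil {
//		return err
//	}
//	// On every cron tick: pull and process messages for up to
//	// Options.ReceiveDuration (5 minutes by default).
//	if err := pbp.Process(ctx); err != nil {
//		logging.Errorf(ctx, "pubsub pump failed: %s", err)
//	}
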
// process actually does what Process advertises, modulo creation of the
// pubsub client.
//
// Unit tests can call this directly with a mock client.
func (pbp *PullingBatchProcessor) process(ctx context.Context, client *pubsub.Client) error {
	if client == nil {
		return errors.New("cannot run process() without an initialized client")
	}

	// These are atomically incremented when batch processing results in either
	// kind of error.
	//
	// Process() will return an error if there are one or more permanent errors.
	var permanentErrorCount, transientErrorCount int32

	sub := client.Subscription(pbp.SubID)
	sub.ReceiveSettings.Synchronous = true
	// Only lease as many messages from pubsub as we can concurrently process.
	sub.ReceiveSettings.MaxOutstandingMessages = pbp.Options.MaxBatchSize * pbp.Options.ConcurrentBatches

	// Keep the first permanent error for surfacing details.
	var firstPermErr error

	workItems := make(chan *pubsub.Message)
	wg := sync.WaitGroup{}
	wg.Add(pbp.Options.ConcurrentBatches)
	for i := 0; i < pbp.Options.ConcurrentBatches; i++ {
		go func() {
			defer wg.Done()
			for {
				batch := nextBatch(workItems, pbp.Options.MaxBatchSize)
				if batch == nil {
					return
				}
				switch status, err := pbp.onBatch(ctx, batch); status {
				case perm:
					if atomic.AddInt32(&permanentErrorCount, 1) == 1 {
						firstPermErr = err
					}
				case trans:
					atomic.AddInt32(&transientErrorCount, 1)
				}
			}
		}()
	}

	receiveCtx, receiveCancel := clock.WithTimeout(ctx, pbp.Options.ReceiveDuration)
	defer receiveCancel()
	err := sub.Receive(receiveCtx, func(ctx context.Context, msg *pubsub.Message) {
		workItems <- msg
	})
	close(workItems)
	wg.Wait()
	logging.Debugf(ctx, "Processed %d batches with transient errors and %d with permanent errors", transientErrorCount, permanentErrorCount)

	// Check the receive error _after_ the worker pool is done to avoid leaks.
	if err != nil {
		// Receive exited due to something other than timeout or cancellation,
		// i.e. a non-retryable service error.
		return errors.Annotate(err, "failed call to pubsub receive").Err()
	}
	if permanentErrorCount > 0 {
		return errors.Reason("Process had non-transient errors. E.g. %q. Review logs for more details.", firstPermErr).Err()
	}
	return nil
}

func (opts *Options) normalize() {
	defaults := defaultOptions()
	if opts.ReceiveDuration == 0 {
		opts.ReceiveDuration = defaults.ReceiveDuration
	}
	if opts.MaxBatchSize == 0 {
		opts.MaxBatchSize = defaults.MaxBatchSize
	}
	if opts.ConcurrentBatches == 0 {
		opts.ConcurrentBatches = defaults.ConcurrentBatches
	}
}

// Validate checks for missing required fields and normalizes options.
func (pbp *PullingBatchProcessor) Validate() error {
	if pbp.ProjectID == "" {
		return errors.Reason("PullingBatchProcessor.ProjectID is required").Err()
	}
	if pbp.SubID == "" {
		return errors.Reason("PullingBatchProcessor.SubID is required").Err()
	}
	if pbp.ProcessBatch == nil {
		return errors.Reason("PullingBatchProcessor.ProcessBatch is required").Err()
	}
	if pbp.Options.ReceiveDuration < 0 {
		return errors.Reason("Options.ReceiveDuration cannot be negative").Err()
	}
	if pbp.Options.ConcurrentBatches < 0 {
		return errors.Reason("Options.ConcurrentBatches cannot be negative").Err()
	}
	if pbp.Options.MaxBatchSize < 0 {
		return errors.Reason("Options.MaxBatchSize cannot be negative").Err()
	}
	pbp.Options.normalize()
	return nil
}
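
// A minimal test-style sketch, not part of the original file, of how Validate
// and normalize interact: required fields are checked, then zero-valued
// Options are replaced with the defaults from defaultOptions().
//
//	pbp := &PullingBatchProcessor{
//		ProcessBatch: func(context.Context, []*pubsub.Message) error { return nil },
//		ProjectID:    "p",
//		SubID:        "s",
//	}
//	if err := pbp.Validate(); err != nil {
//		t.Fatal(err)
//	}
//	// After Validate():
//	//   pbp.Options.ReceiveDuration   == 5 * time.Minute
//	//   pbp.Options.MaxBatchSize      == 20
//	//   pbp.Options.ConcurrentBatches == 5
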
func (pbp *PullingBatchProcessor) onBatch(ctx context.Context, msgs []*pubsub.Message) (batchErrKind, error) {
	// Make a copy of the messages slice to prevent losing access to messages
	// (and thus, the ability to ack/nack them) if ProcessBatch were to change
	// the contents of the slice.
	msgsCopy := append(make([]*pubsub.Message, 0, len(msgs)), msgs...)
	err := pbp.ProcessBatch(ctx, msgsCopy)
	// Note that ProcessBatch is allowed to ack/nack messages itself; for those
	// messages, our acking/nacking below will have no effect.
	switch {
	case transient.Tag.In(err):
		// Ask for re-delivery later.
		common.LogError(ctx, errors.Annotate(err, "NACKing for redelivery").Err())
		nackAll(msgs)
		return trans, err
	case err != nil:
		common.LogError(ctx, errors.Annotate(err, "ACKing to avoid retries").Err())
		ackAll(msgs)
		return perm, err
	default:
		ackAll(msgs)
		return ok, err
	}
}

// nextBatch pulls up to n immediately available items from c.
//
// It blocks until at least one item is available.
// If called with a closed channel, it returns nil.
func nextBatch(c <-chan *pubsub.Message, n int) []*pubsub.Message {
	msg, stillOpen := <-c
	if !stillOpen {
		return nil
	}
	out := append(make([]*pubsub.Message, 0, n), msg)
	for len(out) < n {
		select {
		case msg, stillOpen := <-c:
			if !stillOpen {
				return out
			}
			out = append(out, msg)
		default:
			return out
		}
	}
	return out
}

func ackAll(msgs []*pubsub.Message) {
	for _, msg := range msgs {
		msg.Ack()
	}
}

func nackAll(msgs []*pubsub.Message) {
	for _, msg := range msgs {
		msg.Nack()
	}
}
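
// An illustrative note, not part of the original file, on nextBatch above: it
// blocks for the first message, then greedily drains whatever is immediately
// available, up to n. For example, with n = 3 and five messages already
// buffered on a closed channel, consecutive calls return batches of 3 and 2,
// and then nil, which is what makes the worker goroutines in process() exit
// once Receive finishes and the channel is closed.
//
//	c := make(chan *pubsub.Message, 5)
//	for i := 0; i < 5; i++ {
//		c <- &pubsub.Message{}
//	}
//	close(c)
//	b := nextBatch(c, 3) // len(b) == 3
//	b = nextBatch(c, 3)  // len(b) == 2
//	b = nextBatch(c, 3)  // b == nil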