go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/common/sync/dispatcher/coordinator.go

// Copyright 2019 The LUCI Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package dispatcher

import (
	"context"
	"time"

	"go.chromium.org/luci/common/clock"
	"go.chromium.org/luci/common/errors"
	"go.chromium.org/luci/common/sync/dispatcher/buffer"
)

type coordinatorState struct {
	// opts holds the Options this Channel was configured with.
	opts Options
	// buf is the underlying buffer which collects work items into batches.
	buf *buffer.Buffer

	// itemCh receives individual work items from our client (the external
	// Channel object); its closure signals that no more items are coming.
	itemCh <-chan any
	// drainCh is closed when the coordinator goroutine exits.
	drainCh chan<- struct{}

	// resultCh receives the outcome of each SendFn invocation from the worker
	// goroutines spawned by sendBatches.
	resultCh chan workerResult

	// Used as a wake-up timer for the coordinator to wake itself up when the
	// buffer will have a batch available due to buffer timeout and/or qps limiter.
	timer clock.Timer

	// true if itemCh is closed
	closed bool

	// true if our context is canceled
	canceled bool
}

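// workerResult is the outcome of a single SendFn invocation, reported back to
// the coordinator over resultCh.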
type workerResult struct {
	batch *buffer.Batch
	err   error
}

func (state *coordinatorState) dbg(msg string, args ...any) {
	if state.opts.testingDbg != nil {
		state.opts.testingDbg(msg, args...)
	}
}

// sendBatches sends the batches in the buffer, or a nil batch if the minimum
// send frequency interval has been reached.
//
// It returns the timestamp when the last SendFn was invoked, and a delay if we
// need to wait for the next send token.
//
// TODO(chanli@): Currently we assume sendBatches is very fast, so we use the
// same now value throughout sendBatches. If that assumption turns out to be
// false, we may have the following issues:
//   - it prevents the QPSLimit from replenishing tokens during sendBatches;
//   - it may cause sendBatches to send an additional nil batch after sending
//     batches, while sendBatches should only try to send a nil batch if it
//     doesn't have any batch to send.
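//
// A sketch of each loop iteration below, for orientation:
//   - if the rate limiter imposes a delay, the reservation is canceled and the
//     delay is returned so the caller can wait via getNextTimingEvent;
//   - if a batch can be leased, it is sent on its own goroutine, which reports
//     the result over resultCh;
//   - otherwise, a nil batch is sent if the MinQPS interval has elapsed, and
//     the loop exits.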
func (state *coordinatorState) sendBatches(ctx context.Context, now, prevLastSend time.Time, send SendFn) (lastSend time.Time, delay time.Duration) {
	lastSend = prevLastSend
	if state.canceled {
		for _, batch := range state.buf.ForceLeaseAll() {
			state.dbg("  >dropping batch: canceled")
			state.opts.DropFn(batch, false)
			state.buf.ACK(batch)
		}
		return
	}

	// While the context is not canceled, send the batches we're able to send.
	for ctx.Err() == nil {
		// See if we're permitted to send.
		res := state.opts.QPSLimit.ReserveN(now, 1)
		if !res.OK() {
			panic(errors.New(
				"impossible; Options.QPSLimit is guaranteed to have Inf rate or burst >= 1"))
		}
		if delay = res.DelayFrom(now); delay != 0 {
			// We have to wait until the next send token is available. Cancel the
			// reservation for now, since we're going to wait via getNextTimingEvent.
			res.CancelAt(now)
			return
		}

		// We're allowed to send, see if there's actually anything to send.
		if batchToSend := state.buf.LeaseOne(now); batchToSend != nil {
			// got a batch! Send it.
			state.dbg("  >sending batch")
			lastSend = now
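			// Deliver the batch on its own goroutine so the coordinator's select
			// loop keeps running while SendFn is in flight; the outcome comes back
			// through resultCh and is ACK'd/NACK'd by handleResult.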
			go func() {
				state.resultCh <- workerResult{
					batch: batchToSend,
					err:   send(batchToSend),
				}
			}()
		} else {
			// No more batches.

			// If there will be no more batches in the future, break.
			if state.closed {
				res.CancelAt(now)
				break
			}

			// Otherwise, check whether the minimum frequency interval has elapsed;
			// if so, we need to send a nil batch.
			minInterval := durationFromLimit(state.opts.MinQPS)
			if minInterval > 0 && now.Sub(lastSend) >= minInterval {
				// Send a nil batch.
				state.dbg("  >sending nil batch")
				lastSend = now
				go func() {
					state.resultCh <- workerResult{
						batch: nil,
						err:   send(nil),
					}
				}()
			} else {
				// Cancel the reservation, since we can't use it.
				res.CancelAt(now)
			}
			break
		}
	}

	return
}

// getNextTimingEvent returns a clock.Timer channel which will activate when
// the later of the following happens:
//   - buffer.NextSendTime or the MinQPS interval, whichever is earlier;
//   - nextQPSToken.
//
// That is, resetDuration = max(min(MinQPS interval, nextSendTime), nextQPSToken).
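//
// For example (illustrative numbers only): if the next batch cut is 50ms away,
// the MinQPS interval is 200ms, and the next QPS token is 80ms away, then
// resetDuration = max(min(200ms, 50ms), 80ms) = 80ms, so the timer fires when
// the send token becomes available.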
func (state *coordinatorState) getNextTimingEvent(now time.Time, nextQPSToken time.Duration) <-chan clock.TimerResult {
	var resetDuration time.Duration
	var msg string
	nextSendReached := false

	if nextSend := state.buf.NextSendTime(); !nextSend.IsZero() {
		if nextSend.After(now) {
			resetDuration = nextSend.Sub(now)
			msg = "waiting on batch.NextSendTime"
		} else {
			nextSendReached = true
		}
	}

	minInterval := durationFromLimit(state.opts.MinQPS)
	if !nextSendReached && minInterval > 0 && (resetDuration == 0 || minInterval < resetDuration) {
		resetDuration = minInterval
		msg = "waiting on MinQPS"
	}

	if nextQPSToken > resetDuration {
		resetDuration = nextQPSToken
		msg = "waiting on QPS limit"
	}

	if resetDuration > 0 {
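		// Stop the timer and drain any tick that already fired before Reset, so
		// the returned channel only reports the new deadline. The non-blocking
		// receive is needed because the main loop may have consumed the tick
		// already.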
		if !state.timer.Stop() {
			select {
			case <-state.timer.GetC():
			default:
				// The timer was already drained in the main loop.
			}
		}
		state.timer.Reset(resetDuration)
		state.dbg("  |%s (%s)", msg, resetDuration)
		return state.timer.GetC()
	}
	return nil
}

// getWorkChannel returns a channel to receive an individual work item on (from
// our client) if our buffer is willing to accept additional work items.
//
// Otherwise returns nil.
func (state *coordinatorState) getWorkChannel() <-chan any {
	if !state.closed && state.buf.CanAddItem() {
		state.dbg("  |waiting on new data")
		return state.itemCh
	}
	return nil
}

// handleResult is invoked once for each workerResult returned to the
// coordinator from a worker.
//
// This will ACK/NACK the Batch (once).
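//
// Concretely: a nil error ACKs the batch; a non-nil error drops and ACKs it
// when ErrorFn declines the retry or the context is already canceled, and
// NACKs it (for a later retry) otherwise.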
func (state *coordinatorState) handleResult(ctx context.Context, result workerResult) {
	state.dbg("  GOT RESULT")

	if result.err == nil {
		state.dbg("    ACK")
		state.buf.ACK(result.batch)
		return
	}

	state.dbg("    ERR(%s)", result.err)
	if retry := state.opts.ErrorFn(result.batch, result.err); !retry {
		state.dbg("    NO RETRY (dropping batch)")
		state.opts.DropFn(result.batch, false)
		state.buf.ACK(result.batch)
		return
	}

	if state.canceled {
		state.dbg("    NO RETRY (dropping batch: canceled context)")
		state.opts.DropFn(result.batch, false)
		state.buf.ACK(result.batch)
		return
	}

	state.dbg("    NACK")
	state.buf.NACK(ctx, result.err, result.batch)
}

// run is the coordinator: the main goroutine managing the state of the
// Channel. Exactly one run() executes per Channel; it coordinates (!!) all of
// the internal channels of the external Channel object in one big select loop.
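//
// Each loop iteration sends whatever batches it can, then blocks on the first
// of: context cancelation, a worker result on resultCh, a new work item on
// itemCh (or its closure, signaling drain), or the wake-up timer from
// getNextTimingEvent.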
func (state *coordinatorState) run(ctx context.Context, send SendFn) {
	defer close(state.drainCh)
	if state.opts.DrainedFn != nil {
		defer state.opts.DrainedFn()
	}
	defer state.opts.DropFn(nil, true)
	defer close(state.resultCh)
	defer state.timer.Stop()

	var lastSend time.Time
loop:
	for {
		state.dbg("LOOP (closed: %t, canceled: %t): buf.Stats[%+v]",
			state.closed, state.canceled, state.buf.Stats())

		now := clock.Now(ctx)
		if lastSend.IsZero() {
			// Initialize lastSend to now; otherwise sendBatches would immediately
			// send a nil batch.
			lastSend = now
		}

		var resDelay time.Duration
		lastSend, resDelay = state.sendBatches(ctx, now, lastSend, send)

		// sendBatches may drain the buf if we're in the canceled state, so check
		// its stats again to see if it's empty.
		if state.closed && state.buf.Stats().Empty() {
			break loop
		}

		// Only select on ctx.Done if we haven't observed its cancelation yet.
		var doneCh <-chan struct{}
		if !state.canceled {
			doneCh = ctx.Done()
		}

		select {
		case <-doneCh:
			state.dbg("  GOT CANCEL (via context)")
			state.canceled = true
			state.buf.Flush(now)

		case result := <-state.resultCh:
			state.handleResult(ctx, result)

		case itm, ok := <-state.getWorkChannel():
			if !ok {
				state.dbg("  GOT DRAIN")
				state.closed = true
				state.buf.Flush(now)
				continue
			}

			var itemSize int
			if state.opts.ItemSizeFunc != nil {
				itemSize = state.opts.ItemSizeFunc(itm)
			}
			state.dbg("  GOT NEW DATA")
			if state.canceled {
				state.dbg("    dropped item (canceled)")
				state.opts.DropFn(&buffer.Batch{
					Data: []buffer.BatchItem{{Item: itm, Size: itemSize}},
				}, false)
				continue
			}

			dropped, err := state.buf.AddNoBlock(now, itm, itemSize)
			switch err {
			case nil:
			case buffer.ErrItemTooLarge:
				state.dbg("    dropped item (too large)")
			case buffer.ErrItemTooSmall:
				state.dbg("    dropped item (too small)")
			default:
				// "impossible", since the only other possible error is ErrBufferFull,
				// which we should have protected against in getWorkChannel.
				panic(errors.Annotate(err, "unaccounted error from AddNoBlock").Err())
			}
			if err != nil {
				state.opts.ErrorFn(&buffer.Batch{
					Data: []buffer.BatchItem{{Item: itm, Size: itemSize}},
				}, err)
				continue
			}
			if dropped != nil {
				state.dbg("    dropped batch")
				state.opts.DropFn(dropped, false)
			}

		case result := <-state.getNextTimingEvent(now, resDelay):
			if result.Incomplete() {
				state.dbg("  GOT CANCEL (via timer)")
				state.canceled = true
				state.buf.Flush(now)
				continue
			}
			state.dbg("  GOT TIMER WAKEUP")
			// opportunistically attempt to send batches; either a new batch is ready
			// to be cut or the qps timer is up. This lowers the upper bound variance
			// and gets a bit closer to the QPS target.
			lastSend, _ = state.sendBatches(ctx, result.Time, lastSend, send)
		}
	}

	state.dbg("DONE")
}