go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/common/sync/dispatcher/coordinator.go

// Copyright 2019 The LUCI Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package dispatcher

import (
	"context"
	"time"

	"go.chromium.org/luci/common/clock"
	"go.chromium.org/luci/common/errors"
	"go.chromium.org/luci/common/sync/dispatcher/buffer"
)

type coordinatorState struct {
	opts Options
	buf  *buffer.Buffer

	itemCh  <-chan any
	drainCh chan<- struct{}

	resultCh chan workerResult

	// Used as a wake-up timer for the coordinator to wake itself up when the
	// buffer will have a batch available due to buffer timeout and/or qps limiter.
	timer clock.Timer

	// true if itemCh is closed
	closed bool

	// true if our context is canceled
	canceled bool
}

type workerResult struct {
	batch *buffer.Batch
	err   error
}

func (state *coordinatorState) dbg(msg string, args ...any) {
	if state.opts.testingDbg != nil {
		state.opts.testingDbg(msg, args...)
	}
}

// sendBatches sends the batches in the buffer, or a nil batch if the minimum
// send frequency has been reached.
//
// It returns the timestamp at which the last SendFn was invoked, and a delay if
// we need to wait for the next send token.
//
// TODO(chanli@): Currently we assume sendBatches is very fast, so we use the
// same `now` value throughout sendBatches. If that assumption turns out to be
// false, we may have the following issues:
//   - it prevents the QPSLimit from replenishing tokens during sendBatches;
//   - it may cause sendBatches to send an additional nil batch after sending
//     batches, while sendBatches should only try to send a nil batch if it
//     doesn't have any batch to send.
func (state *coordinatorState) sendBatches(ctx context.Context, now, prevLastSend time.Time, send SendFn) (lastSend time.Time, delay time.Duration) {
	lastSend = prevLastSend
	if state.canceled {
		for _, batch := range state.buf.ForceLeaseAll() {
			state.dbg(" >dropping batch: canceled")
			state.opts.DropFn(batch, false)
			state.buf.ACK(batch)
		}
		return
	}

	// While the context is not canceled, send whatever batches we're able to send.
	for ctx.Err() == nil {
		// See if we're permitted to send.
		res := state.opts.QPSLimit.ReserveN(now, 1)
		if !res.OK() {
			panic(errors.New(
				"impossible; Options.QPSLimit is guaranteed to have Inf rate or burst >= 1"))
		}
		if delay = res.DelayFrom(now); delay != 0 {
			// We have to wait until the next send token is available. Cancel the
			// reservation for now, since we're going to wait via getNextTimingEvent.
			res.CancelAt(now)
			return
		}

		// We're allowed to send; see if there's actually anything to send.
		if batchToSend := state.buf.LeaseOne(now); batchToSend != nil {
			// Got a batch! Send it.
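			// The send runs on its own goroutine so this loop never blocks on
			// SendFn; the result comes back on resultCh and is ACKed/NACKed by
			// handleResult in the main run loop.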
			state.dbg(" >sending batch")
			lastSend = now
			go func() {
				state.resultCh <- workerResult{
					batch: batchToSend,
					err:   send(batchToSend),
				}
			}()
		} else {
			// No more batches.

			// If there will be no more batches in the future, break.
			if state.closed {
				res.CancelAt(now)
				break
			}

			// Otherwise, check whether the minimum send frequency (MinQPS) has been
			// reached; if so, we need to send a nil batch.
			minInterval := durationFromLimit(state.opts.MinQPS)
			if minInterval > 0 && now.Sub(lastSend) >= minInterval {
				// Send a nil batch.
				state.dbg(" >sending nil batch")
				lastSend = now
				go func() {
					state.resultCh <- workerResult{
						batch: nil,
						err:   send(nil),
					}
				}()
			} else {
				// Cancel the reservation, since we can't use it.
				res.CancelAt(now)
			}
			break
		}
	}

	return
}

// getNextTimingEvent returns a clock.Timer channel which will activate when the
// later of the following happens:
//   - buffer.NextSendTime or MinQPS, whichever is earlier
//   - nextQPSToken
//
// So resetDuration = max(min(MinQPS, nextSendTime), nextQPSToken)
func (state *coordinatorState) getNextTimingEvent(now time.Time, nextQPSToken time.Duration) <-chan clock.TimerResult {
	var resetDuration time.Duration
	var msg string
	nextSendReached := false

	if nextSend := state.buf.NextSendTime(); !nextSend.IsZero() {
		if nextSend.After(now) {
			resetDuration = nextSend.Sub(now)
			msg = "waiting on batch.NextSendTime"
		} else {
			nextSendReached = true
		}
	}

	minInterval := durationFromLimit(state.opts.MinQPS)
	if !nextSendReached && minInterval > 0 && (resetDuration == 0 || minInterval < resetDuration) {
		resetDuration = minInterval
		msg = "waiting on MinQPS"
	}

	if nextQPSToken > resetDuration {
		resetDuration = nextQPSToken
		msg = "waiting on QPS limit"
	}

	if resetDuration > 0 {
		if !state.timer.Stop() {
			select {
			case <-state.timer.GetC():
			default:
				// The timer was already drained in the main loop.
			}
		}
		state.timer.Reset(resetDuration)
		state.dbg(" |%s (%s)", msg, resetDuration)
		return state.timer.GetC()
	}
	return nil
}

// getWorkChannel returns a channel to receive an individual work item on (from
// our client) if our buffer is willing to accept additional work items.
//
// Otherwise returns nil.
func (state *coordinatorState) getWorkChannel() <-chan any {
	if !state.closed && state.buf.CanAddItem() {
		state.dbg(" |waiting on new data")
		return state.itemCh
	}
	return nil
}

// handleResult is invoked once for each workerResult returned to the
// coordinator from a worker.
//
// This will ACK/NACK the Batch (once).
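//
// Specifically: a successful send is ACKed; a failed send whose ErrorFn says
// not to retry, or which arrives after the coordinator was canceled, is handed
// to DropFn and then ACKed; anything else is NACKed so the buffer can retry it.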
func (state *coordinatorState) handleResult(ctx context.Context, result workerResult) {
	state.dbg(" GOT RESULT")

	if result.err == nil {
		state.dbg(" ACK")
		state.buf.ACK(result.batch)
		return
	}

	state.dbg(" ERR(%s)", result.err)
	if retry := state.opts.ErrorFn(result.batch, result.err); !retry {
		state.dbg(" NO RETRY (dropping batch)")
		state.opts.DropFn(result.batch, false)
		state.buf.ACK(result.batch)
		return
	}

	if state.canceled {
		state.dbg(" NO RETRY (dropping batch: canceled context)")
		state.opts.DropFn(result.batch, false)
		state.buf.ACK(result.batch)
		return
	}

	state.dbg(" NACK")
	state.buf.NACK(ctx, result.err, result.batch)
	return
}

// run is the main coordinator goroutine for managing the state of the Channel.
// Exactly one run() invocation executes per Channel. It coordinates (!!) all of
// the internal channels of the external Channel object in one big select loop.
func (state *coordinatorState) run(ctx context.Context, send SendFn) {
	defer close(state.drainCh)
	if state.opts.DrainedFn != nil {
		defer state.opts.DrainedFn()
	}
	defer state.opts.DropFn(nil, true)
	defer close(state.resultCh)
	defer state.timer.Stop()

	var lastSend time.Time
loop:
	for {
		state.dbg("LOOP (closed: %t, canceled: %t): buf.Stats[%+v]",
			state.closed, state.canceled, state.buf.Stats())

		now := clock.Now(ctx)
		if lastSend.IsZero() {
			// Initialize lastSend to now, otherwise sendBatches will immediately
			// send a nil batch.
			lastSend = now
		}

		var resDelay time.Duration
		lastSend, resDelay = state.sendBatches(ctx, now, lastSend, send)

		// sendBatches may drain the buf if we're in the canceled state, so pull it
		// again to see if it's empty.
		if state.closed && state.buf.Stats().Empty() {
			break loop
		}

		// Only select on ctx.Done if we haven't observed its cancelation yet.
		var doneCh <-chan struct{}
		if !state.canceled {
			doneCh = ctx.Done()
		}

		select {
		case <-doneCh:
			state.dbg(" GOT CANCEL (via context)")
			state.canceled = true
			state.buf.Flush(now)

		case result := <-state.resultCh:
			state.handleResult(ctx, result)

		case itm, ok := <-state.getWorkChannel():
			if !ok {
				state.dbg(" GOT DRAIN")
				state.closed = true
				state.buf.Flush(now)
				continue
			}

			var itemSize int
			if state.opts.ItemSizeFunc != nil {
				itemSize = state.opts.ItemSizeFunc(itm)
			}
			state.dbg(" GOT NEW DATA")
			if state.canceled {
				state.dbg(" dropped item (canceled)")
				state.opts.DropFn(&buffer.Batch{
					Data: []buffer.BatchItem{{Item: itm, Size: itemSize}},
				}, false)
				continue
			}

			dropped, err := state.buf.AddNoBlock(now, itm, itemSize)
			switch err {
			case nil:
			case buffer.ErrItemTooLarge:
				state.dbg(" dropped item (too large)")
			case buffer.ErrItemTooSmall:
				state.dbg(" dropped item (too small)")
			default:
				// "impossible", since the only other possible error is ErrBufferFull,
				// which we should have protected against in getWorkChannel.
				panic(errors.Annotate(err, "unaccounted error from AddNoBlock").Err())
			}
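			// A rejected item never made it into the buffer, so there is nothing to
			// ACK or retry; the single-item Batch constructed below exists only to
			// report the rejection through ErrorFn.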
			if err != nil {
				state.opts.ErrorFn(&buffer.Batch{
					Data: []buffer.BatchItem{{Item: itm, Size: itemSize}},
				}, err)
				continue
			}
			if dropped != nil {
				state.dbg(" dropped batch")
				state.opts.DropFn(dropped, false)
			}

		case result := <-state.getNextTimingEvent(now, resDelay):
			if result.Incomplete() {
				state.dbg(" GOT CANCEL (via timer)")
				state.canceled = true
				state.buf.Flush(now)
				continue
			}
			state.dbg(" GOT TIMER WAKEUP")
			// opportunistically attempt to send batches; either a new batch is ready
			// to be cut or the qps timer is up. This lowers the upper bound variance
			// and gets a bit closer to the QPS target.
			lastSend, _ = state.sendBatches(ctx, result.Time, lastSend, send)
		}
	}

	state.dbg("DONE")
}
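// Illustrative sketch (not part of this file): how a caller typically reaches
// this coordinator indirectly, via the package's Channel constructor. The names
// used below (NewChannel, Channel.C, CloseAndDrain) are assumptions about the
// surrounding package at this version; see the rest of the package for the
// actual signatures.
//
//	ch, err := dispatcher.NewChannel(ctx, &dispatcher.Options{}, func(b *buffer.Batch) error {
//		// Deliver b here; b may be nil when MinQPS forces a heartbeat send.
//		return nil
//	})
//	if err != nil {
//		return err
//	}
//	ch.C <- "some item"   // arrives on itemCh ("GOT NEW DATA" above)
//	ch.CloseAndDrain(ctx) // closes itemCh ("GOT DRAIN") and waits on drainCh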