github.com/kaleido-io/firefly@v0.0.0-20210622132723-8b4b6aacb971/internal/batch/batch_manager.go (about)

// Copyright © 2021 Kaleido, Inc.
//
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package batch

import (
	"context"
	"fmt"
	"sync"
	"time"

	"github.com/kaleido-io/firefly/internal/config"
	"github.com/kaleido-io/firefly/internal/data"
	"github.com/kaleido-io/firefly/internal/i18n"
	"github.com/kaleido-io/firefly/internal/log"
	"github.com/kaleido-io/firefly/internal/retry"
	"github.com/kaleido-io/firefly/pkg/database"
	"github.com/kaleido-io/firefly/pkg/fftypes"
)

const (
	msgBatchOffsetName = "ff_msgbatch"
)

func NewBatchManager(ctx context.Context, di database.Plugin, dm data.Manager) (Manager, error) {
	if di == nil || dm == nil {
		return nil, i18n.NewError(ctx, i18n.MsgInitializationNilDepError)
	}
	readPageSize := config.GetUint(config.BatchManagerReadPageSize)
	bm := &batchManager{
		ctx:                        log.WithLogField(ctx, "role", "batchmgr"),
		database:                   di,
		data:                       dm,
		readPageSize:               uint64(readPageSize),
		messagePollTimeout:         config.GetDuration(config.BatchManagerReadPollTimeout),
		startupOffsetRetryAttempts: config.GetInt(config.OrchestratorStartupAttempts),
		dispatchers:                make(map[fftypes.MessageType]*dispatcher),
		shoulderTap:                make(chan bool, 1),
		newMessages:                make(chan int64, readPageSize),
		sequencerClosed:            make(chan struct{}),
		retry: &retry.Retry{
			InitialDelay: config.GetDuration(config.BatchRetryInitDelay),
			MaximumDelay: config.GetDuration(config.BatchRetryMaxDelay),
			Factor:       config.GetFloat64(config.BatchRetryFactor),
		},
	}
	return bm, nil
}

type Manager interface {
	RegisterDispatcher(msgTypes []fftypes.MessageType, handler DispatchHandler, batchOptions Options)
	NewMessages() chan<- int64
	Start() error
	Close()
	WaitStop()
}

type batchManager struct {
	ctx                        context.Context
	database                   database.Plugin
	data                       data.Manager
	dispatchers                map[fftypes.MessageType]*dispatcher
	shoulderTap                chan bool
	newMessages                chan int64
	sequencerClosed            chan struct{}
	retry                      *retry.Retry
	offsetID                   *fftypes.UUID
	offset                     int64
	closed                     bool
	readPageSize               uint64
	messagePollTimeout         time.Duration
	startupOffsetRetryAttempts int
}

type DispatchHandler func(context.Context, *fftypes.Batch, []*fftypes.Bytes32) error

type Options struct {
	BatchMaxSize   uint
	BatchTimeout   time.Duration
	DisposeTimeout time.Duration
}

type dispatcher struct {
	handler      DispatchHandler
	mux          sync.Mutex
	processors   map[string]*batchProcessor
	batchOptions Options
}

func (bm *batchManager) RegisterDispatcher(msgTypes []fftypes.MessageType, handler DispatchHandler, batchOptions Options) {
	dispatcher := &dispatcher{
		handler:      handler,
		batchOptions: batchOptions,
		processors:   make(map[string]*batchProcessor),
	}
	for _, msgType := range msgTypes {
		bm.dispatchers[msgType] = dispatcher
	}
}
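// Illustrative sketch (not part of the original file): registering a dispatch
// handler for broadcast messages. The fftypes.MessageTypeBroadcast constant
// and the option values are assumptions for the example; the
// RegisterDispatcher signature itself is taken from this file.
func exampleRegisterBroadcastDispatcher(bm Manager) {
	bm.RegisterDispatcher(
		[]fftypes.MessageType{fftypes.MessageTypeBroadcast},
		func(ctx context.Context, b *fftypes.Batch, contexts []*fftypes.Bytes32) error {
			// Hand the sealed batch, plus the context hashes to pin, to the
			// transport layer here
			log.L(ctx).Infof("dispatching batch %s", b.ID)
			return nil
		},
		Options{
			BatchMaxSize:   500,
			BatchTimeout:   500 * time.Millisecond,
			DisposeTimeout: time.Minute,
		},
	)
}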
func (bm *batchManager) Start() error {
	if err := bm.restoreOffset(); err != nil {
		return err
	}
	go bm.newEventNotifications()
	go bm.messageSequencer()
	return nil
}

func (bm *batchManager) NewMessages() chan<- int64 {
	return bm.newMessages
}

func (bm *batchManager) restoreOffset() (err error) {
	var offset *fftypes.Offset
	for offset == nil {
		offset, err = bm.database.GetOffset(bm.ctx, fftypes.OffsetTypeBatch, fftypes.SystemNamespace, msgBatchOffsetName)
		if err != nil {
			return err
		}
		if offset == nil {
			// Error deliberately ignored - the loop re-reads the offset and retries the upsert
			_ = bm.database.UpsertOffset(bm.ctx, &fftypes.Offset{
				ID:        fftypes.NewUUID(),
				Type:      fftypes.OffsetTypeBatch,
				Namespace: fftypes.SystemNamespace,
				Name:      msgBatchOffsetName,
				Current:   0,
			}, false)
		}
	}
	bm.offsetID = offset.ID
	bm.offset = offset.Current
	log.L(bm.ctx).Infof("Batch manager restored offset %d", bm.offset)
	return nil
}

func (bm *batchManager) removeProcessor(dispatcher *dispatcher, key string) {
	dispatcher.mux.Lock()
	delete(dispatcher.processors, key)
	dispatcher.mux.Unlock()
}

func (bm *batchManager) getProcessor(batchType fftypes.MessageType, group *fftypes.Bytes32, namespace, author string) (*batchProcessor, error) {
	dispatcher, ok := bm.dispatchers[batchType]
	if !ok {
		return nil, i18n.NewError(bm.ctx, i18n.MsgUnregisteredBatchType, batchType)
	}
	dispatcher.mux.Lock()
	key := fmt.Sprintf("%s:%s[group=%v]", namespace, author, group)
	processor, ok := dispatcher.processors[key]
	if !ok {
		processor = newBatchProcessor(
			bm.ctx, // Background context, not the call context
			bm.database,
			&batchProcessorConf{
				Options:   dispatcher.batchOptions,
				namespace: namespace,
				author:    author,
				group:     group,
				dispatch:  dispatcher.handler,
				processorQuiescing: func() {
					bm.removeProcessor(dispatcher, key)
				},
			},
			bm.retry,
		)
		dispatcher.processors[key] = processor
		log.L(bm.ctx).Debugf("Created new processor: %s", key)
	}
	dispatcher.mux.Unlock()
	return processor, nil
}
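// Illustrative note (not in the original source): the key built in
// getProcessor means a separate batchProcessor - and therefore a separate
// batch - is kept per namespace/author/group combination. For example, a
// message in namespace "ns1" from author "0x1234" with a nil group is keyed
// as "ns1:0x1234[group=<nil>]".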
func (bm *batchManager) Close() {
	if bm != nil && !bm.closed {
		for _, d := range bm.dispatchers {
			d.mux.Lock()
			for _, p := range d.processors {
				p.close()
			}
			d.mux.Unlock()
		}
		bm.closed = true
		close(bm.newMessages)
	}
	bm = nil
}

func (bm *batchManager) assembleMessageData(msg *fftypes.Message) (data []*fftypes.Data, err error) {
	var foundAll = false
	err = bm.retry.Do(bm.ctx, fmt.Sprintf("assemble message %s data", msg.Header.ID), func(attempt int) (retry bool, err error) {
		data, foundAll, err = bm.data.GetMessageData(bm.ctx, msg, true)
		// continual retry for persistence error (distinct from not-found)
		return err != nil && !bm.closed, err
	})
	if err != nil {
		return nil, err
	}
	if !foundAll {
		return nil, i18n.NewError(bm.ctx, i18n.MsgDataNotFound, msg.Header.ID)
	}
	log.L(bm.ctx).Infof("Retrieved data for message %s", msg.Header.ID)
	return data, nil
}

func (bm *batchManager) readPage() ([]*fftypes.Message, error) {
	var msgs []*fftypes.Message
	err := bm.retry.Do(bm.ctx, "retrieve messages", func(attempt int) (retry bool, err error) {
		fb := database.MessageQueryFactory.NewFilterLimit(bm.ctx, bm.readPageSize)
		msgs, err = bm.database.GetMessages(bm.ctx, fb.And(
			fb.Gt("sequence", bm.offset),
			fb.Eq("local", true),
		).Sort("sequence").Limit(bm.readPageSize))
		if err != nil {
			return !bm.closed, err // Retry indefinitely, until closed (or context cancelled)
		}
		return false, nil
	})
	return msgs, err
}

func (bm *batchManager) messageSequencer() {
	l := log.L(bm.ctx)
	l.Debugf("Started batch assembly message sequencer")
	defer close(bm.sequencerClosed)

	dispatched := make(chan *batchDispatch, bm.readPageSize)

	for !bm.closed {
		// Read messages from the DB - in an error condition we retry until success, or a closed context
		msgs, err := bm.readPage()
		if err != nil {
			l.Debugf("Exiting: %s", err) // errors logged in readPage
			return
		}
		batchWasFull := false

		if len(msgs) > 0 {
			batchWasFull = (uint64(len(msgs)) == bm.readPageSize)
			var dispatchCount int
			for _, msg := range msgs {
				data, err := bm.assembleMessageData(msg)
				if err != nil {
					l.Errorf("Failed to retrieve message data for %s: %s", msg.Header.ID, err)
					continue
				}

				err = bm.dispatchMessage(dispatched, msg, data...)
				if err != nil {
					l.Errorf("Failed to dispatch message %s: %s", msg.Header.ID, err)
					continue
				}
				dispatchCount++
			}

			for i := 0; i < dispatchCount; i++ {
				select {
				case d := <-dispatched:
					l.Debugf("Dispatched message %s to batch %s", d.msg.Header.ID, d.batchID)
				case <-bm.ctx.Done():
					l.Debugf("Message sequencer exiting (context closed)")
					bm.Close()
					return
				}
			}

			if !bm.closed {
				_ = bm.updateOffset(true, msgs[len(msgs)-1].Sequence)
			}
		}

		// Wait to be woken again
		if !bm.closed && !batchWasFull {
			bm.waitForShoulderTapOrPollTimeout()
		}
	}
}
// newEventNotifications just consumes new messages, logs them, then ensures there's a shoulderTap
// in the channel - without blocking. This is important as we must not block the notifier
func (bm *batchManager) newEventNotifications() {
	l := log.L(bm.ctx).WithField("role", "batch-newmessages")
	for {
		select {
		case m, ok := <-bm.newMessages:
			if !ok {
				l.Debugf("Exiting due to close")
				return
			}
			l.Debugf("New message sequence notification: %d", m)
		case <-bm.ctx.Done():
			l.Debugf("Exiting due to cancelled context")
			return
		}
		// Do not block sending to the shoulderTap - as it can only contain one
		select {
		case bm.shoulderTap <- true:
		default:
		}
	}
}

func (bm *batchManager) waitForShoulderTapOrPollTimeout() {
	l := log.L(bm.ctx)
	timeout := time.NewTimer(bm.messagePollTimeout)
	select {
	case <-timeout.C:
		l.Debugf("Woken after poll timeout")
	case <-bm.shoulderTap:
		l.Debugf("Woken by shoulder tap for new messages")
	case <-bm.ctx.Done():
		l.Debugf("Exiting due to cancelled context")
		bm.Close()
		return
	}
}

func (bm *batchManager) updateOffset(infiniteRetry bool, newOffset int64) (err error) {
	l := log.L(bm.ctx)
	return bm.retry.Do(bm.ctx, "update offset", func(attempt int) (retry bool, err error) {
		bm.offset = newOffset
		u := database.OffsetQueryFactory.NewUpdate(bm.ctx).Set("current", bm.offset)
		err = bm.database.UpdateOffset(bm.ctx, bm.offsetID, u)
		if err != nil {
			l.Errorf("Batch persist attempt %d failed: %s", attempt, err)
			stillRetrying := infiniteRetry || (attempt <= bm.startupOffsetRetryAttempts)
			return !bm.closed && stillRetrying, err
		}
		l.Infof("Batch manager committed offset %d", newOffset)
		return false, nil
	})
}

func (bm *batchManager) dispatchMessage(dispatched chan *batchDispatch, msg *fftypes.Message, data ...*fftypes.Data) error {
	l := log.L(bm.ctx)
	processor, err := bm.getProcessor(msg.Header.Type, msg.Header.Group, msg.Header.Namespace, msg.Header.Author)
	if err != nil {
		return err
	}
	l.Debugf("Dispatching message %s to %s batch", msg.Header.ID, msg.Header.Type)
	work := &batchWork{
		msg:        msg,
		data:       data,
		dispatched: dispatched,
	}
	processor.newWork <- work
	return nil
}

func (bm *batchManager) WaitStop() {
	<-bm.sequencerClosed
	var processors []*batchProcessor
	for _, d := range bm.dispatchers {
		d.mux.Lock()
		for _, p := range d.processors {
			processors = append(processors, p)
		}
		d.mux.Unlock()
	}
	for _, p := range processors {
		p.waitClosed()
	}
}
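// Illustrative sketch (not part of the original file): the typical lifecycle
// of the batch manager as driven by its caller. The dbPlugin/dataManager
// arguments and the newSequence value are assumptions for the example; the
// calls use only the Manager interface defined above.
func exampleBatchManagerLifecycle(ctx context.Context, dbPlugin database.Plugin, dataManager data.Manager) error {
	bm, err := NewBatchManager(ctx, dbPlugin, dataManager)
	if err != nil {
		return err
	}
	// Dispatchers are registered before Start(), so the sequencer can route
	// every message type it reads (see RegisterDispatcher above):
	//   bm.RegisterDispatcher(...)
	if err := bm.Start(); err != nil {
		return err
	}
	// Whenever a new message is persisted, push its sequence into NewMessages()
	// so the sequencer wakes ahead of the poll timeout
	var newSequence int64 = 12345
	bm.NewMessages() <- newSequence
	// On shutdown: Close() stops the processors and marks the manager closed;
	// WaitStop() blocks until the sequencer and all processors have finished
	bm.Close()
	bm.WaitStop()
	return nil
}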