github.com/kaleido-io/firefly@v0.0.0-20210622132723-8b4b6aacb971/internal/batch/batch_manager.go

// Copyright © 2021 Kaleido, Inc.
//
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package batch

import (
	"context"
	"fmt"
	"sync"
	"time"

	"github.com/kaleido-io/firefly/internal/config"
	"github.com/kaleido-io/firefly/internal/data"
	"github.com/kaleido-io/firefly/internal/i18n"
	"github.com/kaleido-io/firefly/internal/log"
	"github.com/kaleido-io/firefly/internal/retry"
	"github.com/kaleido-io/firefly/pkg/database"
	"github.com/kaleido-io/firefly/pkg/fftypes"
)

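// msgBatchOffsetName is the well-known name of the offset record the batch
// manager persists (via the database plugin) to track the highest message
// sequence it has dispatched into batches.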
const (
	msgBatchOffsetName = "ff_msgbatch"
)

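// NewBatchManager constructs a Manager, wiring in the database plugin and data
// manager it depends on, and reading its page size, poll timeout and retry
// settings from configuration. Both dependencies must be non-nil.
//
// A minimal wiring sketch (illustrative only - databasePlugin, dataManager and
// persistBatch are assumed to exist in the caller, and the message type and
// option values are examples, not recommendations):
//
//	bm, err := batch.NewBatchManager(ctx, databasePlugin, dataManager)
//	if err != nil {
//		return err
//	}
//	bm.RegisterDispatcher([]fftypes.MessageType{fftypes.MessageTypeBroadcast}, persistBatch, batch.Options{
//		BatchMaxSize: 100,
//		BatchTimeout: 500 * time.Millisecond,
//	})
//	if err := bm.Start(); err != nil {
//		return err
//	}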
func NewBatchManager(ctx context.Context, di database.Plugin, dm data.Manager) (Manager, error) {
	if di == nil || dm == nil {
		return nil, i18n.NewError(ctx, i18n.MsgInitializationNilDepError)
	}
	readPageSize := config.GetUint(config.BatchManagerReadPageSize)
	bm := &batchManager{
		ctx:                        log.WithLogField(ctx, "role", "batchmgr"),
		database:                   di,
		data:                       dm,
		readPageSize:               uint64(readPageSize),
		messagePollTimeout:         config.GetDuration(config.BatchManagerReadPollTimeout),
		startupOffsetRetryAttempts: config.GetInt(config.OrchestratorStartupAttempts),
		dispatchers:                make(map[fftypes.MessageType]*dispatcher),
		shoulderTap:                make(chan bool, 1),
		newMessages:                make(chan int64, readPageSize),
		sequencerClosed:            make(chan struct{}),
		retry: &retry.Retry{
			InitialDelay: config.GetDuration(config.BatchRetryInitDelay),
			MaximumDelay: config.GetDuration(config.BatchRetryMaxDelay),
			Factor:       config.GetFloat64(config.BatchRetryFactor),
		},
	}
	return bm, nil
}

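// Manager is the public interface of the batch manager. Dispatchers should be
// registered before Start is called; Close begins shutdown, and WaitStop
// blocks until the sequencer and all batch processors have finished.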
type Manager interface {
	RegisterDispatcher(msgTypes []fftypes.MessageType, handler DispatchHandler, batchOptions Options)
	NewMessages() chan<- int64
	Start() error
	Close()
	WaitStop()
}

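// batchManager is the implementation of Manager. It polls the database for
// newly sequenced local messages, routes them to per-namespace/author/group
// batch processors, and persists its read offset as pages are dispatched.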
type batchManager struct {
	ctx                        context.Context
	database                   database.Plugin
	data                       data.Manager
	dispatchers                map[fftypes.MessageType]*dispatcher
	shoulderTap                chan bool
	newMessages                chan int64
	sequencerClosed            chan struct{}
	retry                      *retry.Retry
	offsetID                   *fftypes.UUID
	offset                     int64
	closed                     bool
	readPageSize               uint64
	messagePollTimeout         time.Duration
	startupOffsetRetryAttempts int
}

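// DispatchHandler is the callback a dispatcher invokes with a sealed batch,
// together with the list of hashes associated with it, once the batch is
// ready for delivery.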
type DispatchHandler func(context.Context, *fftypes.Batch, []*fftypes.Bytes32) error

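// Options configures the batching behavior of the processors created for a
// dispatcher: the maximum number of messages per batch, how long to wait
// before sealing a partial batch, and how long an idle processor lingers
// before quiescing.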
type Options struct {
	BatchMaxSize   uint
	BatchTimeout   time.Duration
	DisposeTimeout time.Duration
}

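// dispatcher pairs a DispatchHandler with the set of live batch processors it
// has spawned, keyed by namespace/author/group. The mutex guards the
// processors map.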
type dispatcher struct {
	handler      DispatchHandler
	mux          sync.Mutex
	processors   map[string]*batchProcessor
	batchOptions Options
}

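// RegisterDispatcher routes all of the given message types to a single
// dispatcher. It must be called before Start, as the dispatchers map is read
// by the sequencer goroutine without locking.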
func (bm *batchManager) RegisterDispatcher(msgTypes []fftypes.MessageType, handler DispatchHandler, batchOptions Options) {
	dispatcher := &dispatcher{
		handler:      handler,
		batchOptions: batchOptions,
		processors:   make(map[string]*batchProcessor),
	}
	for _, msgType := range msgTypes {
		bm.dispatchers[msgType] = dispatcher
	}
}

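// Start restores the persisted offset, then launches the notification
// listener and the message sequencer goroutines.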
func (bm *batchManager) Start() error {
	if err := bm.restoreOffset(); err != nil {
		return err
	}
	go bm.newEventNotifications()
	go bm.messageSequencer()
	return nil
}

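// NewMessages returns the write-only channel on which other components notify
// the batch manager of newly sequenced messages. Notifications are drained by
// newEventNotifications, so senders are not blocked by a busy sequencer.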
func (bm *batchManager) NewMessages() chan<- int64 {
	return bm.newMessages
}

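// restoreOffset reads the batch manager's offset from the database, creating
// a zero offset record on first startup, and loops until a record is read.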
func (bm *batchManager) restoreOffset() (err error) {
	var offset *fftypes.Offset
	for offset == nil {
		offset, err = bm.database.GetOffset(bm.ctx, fftypes.OffsetTypeBatch, fftypes.SystemNamespace, msgBatchOffsetName)
		if err != nil {
			return err
		}
		if offset == nil {
			// Error deliberately ignored - a failed upsert (such as a
			// concurrent insert) is handled by re-reading on the next pass
			_ = bm.database.UpsertOffset(bm.ctx, &fftypes.Offset{
				ID:        fftypes.NewUUID(),
				Type:      fftypes.OffsetTypeBatch,
				Namespace: fftypes.SystemNamespace,
				Name:      msgBatchOffsetName,
				Current:   0,
			}, false)
		}
	}
	bm.offsetID = offset.ID
	bm.offset = offset.Current
	log.L(bm.ctx).Infof("Batch manager restored offset %d", bm.offset)
	return nil
}

func (bm *batchManager) removeProcessor(dispatcher *dispatcher, key string) {
	dispatcher.mux.Lock()
	delete(dispatcher.processors, key)
	dispatcher.mux.Unlock()
}

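// getProcessor looks up (or lazily creates) the batch processor for a given
// message type, group, namespace and author combination. It returns an error
// for message types with no registered dispatcher.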
func (bm *batchManager) getProcessor(batchType fftypes.MessageType, group *fftypes.Bytes32, namespace, author string) (*batchProcessor, error) {
	dispatcher, ok := bm.dispatchers[batchType]
	if !ok {
		return nil, i18n.NewError(bm.ctx, i18n.MsgUnregisteredBatchType, batchType)
	}
	dispatcher.mux.Lock()
	key := fmt.Sprintf("%s:%s[group=%v]", namespace, author, group)
	processor, ok := dispatcher.processors[key]
	if !ok {
		processor = newBatchProcessor(
			bm.ctx, // Background context, not the call context
			bm.database,
			&batchProcessorConf{
				Options:   dispatcher.batchOptions,
				namespace: namespace,
				author:    author,
				group:     group,
				dispatch:  dispatcher.handler,
				processorQuiescing: func() {
					bm.removeProcessor(dispatcher, key)
				},
			},
			bm.retry,
		)
		dispatcher.processors[key] = processor
		log.L(bm.ctx).Debugf("Created new processor: %s", key) // Only log when we actually create one
	}
	dispatcher.mux.Unlock()
	return processor, nil
}

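// Close marks the manager closed, closes every batch processor, and closes
// the newMessages channel so the notification listener exits.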
func (bm *batchManager) Close() {
	if bm != nil && !bm.closed {
		for _, d := range bm.dispatchers {
			d.mux.Lock()
			for _, p := range d.processors {
				p.close()
			}
			d.mux.Unlock()
		}
		bm.closed = true
		close(bm.newMessages)
	}
}

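// assembleMessageData fetches all the data attached to a message, retrying
// indefinitely on persistence errors (until the manager is closed). A
// not-found result is an error, not a retry.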
func (bm *batchManager) assembleMessageData(msg *fftypes.Message) (data []*fftypes.Data, err error) {
	var foundAll = false
	err = bm.retry.Do(bm.ctx, fmt.Sprintf("assemble message %s data", msg.Header.ID), func(attempt int) (retry bool, err error) {
		data, foundAll, err = bm.data.GetMessageData(bm.ctx, msg, true)
		// continual retry for persistence error (distinct from not-found)
		return err != nil && !bm.closed, err
	})
	if err != nil {
		return nil, err
	}
	if !foundAll {
		return nil, i18n.NewError(bm.ctx, i18n.MsgDataNotFound, msg.Header.ID)
	}
	log.L(bm.ctx).Infof("Assembled all data for message %s", msg.Header.ID)
	return data, nil
}

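// readPage queries the next page of locally-sequenced messages above the
// current offset, in sequence order, up to readPageSize messages.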
func (bm *batchManager) readPage() ([]*fftypes.Message, error) {
	var msgs []*fftypes.Message
	err := bm.retry.Do(bm.ctx, "retrieve messages", func(attempt int) (retry bool, err error) {
		fb := database.MessageQueryFactory.NewFilterLimit(bm.ctx, bm.readPageSize)
		msgs, err = bm.database.GetMessages(bm.ctx, fb.And(
			fb.Gt("sequence", bm.offset),
			fb.Eq("local", true),
		).Sort("sequence").Limit(bm.readPageSize))
		if err != nil {
			return !bm.closed, err // Retry indefinitely, until closed (or context cancelled)
		}
		return false, nil
	})
	return msgs, err
}

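// messageSequencer is the main loop of the batch manager: it pages through
// newly sequenced messages, assembles their data, hands each to the right
// processor, waits for every dispatch in the page to be acknowledged, then
// commits the new offset. If the page was full it loops immediately;
// otherwise it sleeps until tapped or the poll timeout fires.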
func (bm *batchManager) messageSequencer() {
	l := log.L(bm.ctx)
	l.Debugf("Started batch assembly message sequencer")
	defer close(bm.sequencerClosed)

	dispatched := make(chan *batchDispatch, bm.readPageSize)

	for !bm.closed {
		// Read messages from the DB - in an error condition we retry until success, or a closed context
		msgs, err := bm.readPage()
		if err != nil {
			l.Debugf("Exiting: %s", err) // errors logged in readPage
			return
		}
		batchWasFull := false

		if len(msgs) > 0 {
			batchWasFull = (uint64(len(msgs)) == bm.readPageSize)
			var dispatchCount int
			for _, msg := range msgs {
				data, err := bm.assembleMessageData(msg)
				if err != nil {
					l.Errorf("Failed to retrieve message data for %s: %s", msg.Header.ID, err)
					continue
				}

				err = bm.dispatchMessage(dispatched, msg, data...)
				if err != nil {
					l.Errorf("Failed to dispatch message %s: %s", msg.Header.ID, err)
					continue
				}
				dispatchCount++
			}

			for i := 0; i < dispatchCount; i++ {
				select {
				case d := <-dispatched:
					l.Debugf("Dispatched message %s to batch %s", d.msg.Header.ID, d.batchID)
				case <-bm.ctx.Done():
					l.Debugf("Message sequencer exiting (context closed)")
					bm.Close()
					return
				}
			}

			if !bm.closed {
				_ = bm.updateOffset(true, msgs[len(msgs)-1].Sequence)
			}
		}

		// Wait to be woken again - unless the page was full, in which case we poll again immediately
		if !bm.closed && !batchWasFull {
			bm.waitForShoulderTapOrPollTimeout()
		}
	}
}

// newEventNotifications just consumes new messages, logs them, then ensures there's a shoulderTap
// in the channel - without blocking. This is important as we must not block the notifier
func (bm *batchManager) newEventNotifications() {
	l := log.L(bm.ctx).WithField("role", "batch-newmessages")
	for {
		select {
		case m, ok := <-bm.newMessages:
			if !ok {
				l.Debugf("Exiting due to close")
				return
			}
			l.Debugf("New message sequence notification: %d", m)
		case <-bm.ctx.Done():
			l.Debugf("Exiting due to cancelled context")
			return
		}
		// Do not block sending to the shoulderTap - as it can only contain one
		select {
		case bm.shoulderTap <- true:
		default:
		}
	}
}

func (bm *batchManager) waitForShoulderTapOrPollTimeout() {
	l := log.L(bm.ctx)
	timeout := time.NewTimer(bm.messagePollTimeout)
	defer timeout.Stop() // Release the timer promptly if we are woken before it fires
	select {
	case <-timeout.C:
		l.Debugf("Woken after poll timeout")
	case <-bm.shoulderTap:
		l.Debugf("Woken by shoulder tap for new messages")
	case <-bm.ctx.Done():
		l.Debugf("Exiting due to cancelled context")
		bm.Close()
		return
	}
}

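// updateOffset persists the new highest-dispatched message sequence. With
// infiniteRetry it keeps retrying until the manager is closed; otherwise it
// gives up after startupOffsetRetryAttempts.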
func (bm *batchManager) updateOffset(infiniteRetry bool, newOffset int64) (err error) {
	l := log.L(bm.ctx)
	return bm.retry.Do(bm.ctx, "update offset", func(attempt int) (retry bool, err error) {
		bm.offset = newOffset
		u := database.OffsetQueryFactory.NewUpdate(bm.ctx).Set("current", bm.offset)
		err = bm.database.UpdateOffset(bm.ctx, bm.offsetID, u)
		if err != nil {
			l.Errorf("Batch persist attempt %d failed: %s", attempt, err)
			stillRetrying := infiniteRetry || (attempt <= bm.startupOffsetRetryAttempts)
			return !bm.closed && stillRetrying, err
		}
		l.Infof("Batch manager committed offset %d", newOffset)
		return false, nil
	})
}

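// dispatchMessage hands a message (with its data) to the appropriate batch
// processor. The processor acknowledges on the dispatched channel once the
// message has been assigned to a batch.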
func (bm *batchManager) dispatchMessage(dispatched chan *batchDispatch, msg *fftypes.Message, data ...*fftypes.Data) error {
	l := log.L(bm.ctx)
	processor, err := bm.getProcessor(msg.Header.Type, msg.Header.Group, msg.Header.Namespace, msg.Header.Author)
	if err != nil {
		return err
	}
	l.Debugf("Dispatching message %s to %s batch", msg.Header.ID, msg.Header.Type)
	work := &batchWork{
		msg:        msg,
		data:       data,
		dispatched: dispatched,
	}
	processor.newWork <- work
	return nil
}

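// WaitStop blocks until the message sequencer has exited, then waits for
// every remaining batch processor to finish closing.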
func (bm *batchManager) WaitStop() {
	<-bm.sequencerClosed
	var processors []*batchProcessor
	for _, d := range bm.dispatchers {
		d.mux.Lock()
		for _, p := range d.processors {
			processors = append(processors, p)
		}
		d.mux.Unlock()
	}
	for _, p := range processors {
		p.waitClosed()
	}
}