github.com/kaituanwang/hyperledger@v2.0.1+incompatible/orderer/consensus/kafka/chain.go

/*
Copyright IBM Corp. All Rights Reserved.

SPDX-License-Identifier: Apache-2.0
*/

package kafka

import (
	"context"
	"fmt"
	"strconv"
	"sync"
	"time"

	"github.com/Shopify/sarama"
	"github.com/golang/protobuf/proto"
	cb "github.com/hyperledger/fabric-protos-go/common"
	ab "github.com/hyperledger/fabric-protos-go/orderer"
	"github.com/hyperledger/fabric/orderer/common/localconfig"
	"github.com/hyperledger/fabric/orderer/common/msgprocessor"
	"github.com/hyperledger/fabric/orderer/consensus"
	"github.com/hyperledger/fabric/protoutil"
	"github.com/pkg/errors"
)

// Used for capturing metrics -- see processMessagesToBlocks
const (
	indexRecvError = iota
	indexUnmarshalError
	indexRecvPass
	indexProcessConnectPass
	indexProcessTimeToCutError
	indexProcessTimeToCutPass
	indexProcessRegularError
	indexProcessRegularPass
	indexSendTimeToCutError
	indexSendTimeToCutPass
	indexExitChanPass
)

func newChain(
	consenter commonConsenter,
	support consensus.ConsenterSupport,
	lastOffsetPersisted int64,
	lastOriginalOffsetProcessed int64,
	lastResubmittedConfigOffset int64,
) (*chainImpl, error) {
	lastCutBlockNumber := getLastCutBlockNumber(support.Height())
	logger.Infof("[channel: %s] Starting chain with last persisted offset %d and last recorded block [%d]",
		support.ChannelID(), lastOffsetPersisted, lastCutBlockNumber)

	doneReprocessingMsgInFlight := make(chan struct{})
	// In any one of the following cases, we should unblock ingress messages:
	// - lastResubmittedConfigOffset == 0, where we've never resubmitted any config messages
	// - lastResubmittedConfigOffset == lastOriginalOffsetProcessed, where the latest config message we resubmitted
	//   has been processed already
	// - lastResubmittedConfigOffset < lastOriginalOffsetProcessed, where we've processed one or more resubmitted
	//   normal messages after the latest resubmitted config message. (we advance `lastResubmittedConfigOffset` for
	//   config messages, but not normal messages)
	if lastResubmittedConfigOffset == 0 || lastResubmittedConfigOffset <= lastOriginalOffsetProcessed {
		// If we've already caught up with the reprocessing of resubmitted messages, close the channel to unblock broadcast
		close(doneReprocessingMsgInFlight)
	}

	consenter.Metrics().LastOffsetPersisted.With("channel", support.ChannelID()).Set(float64(lastOffsetPersisted))

	return &chainImpl{
		consenter:                   consenter,
		ConsenterSupport:            support,
		channel:                     newChannel(support.ChannelID(), defaultPartition),
		lastOffsetPersisted:         lastOffsetPersisted,
		lastOriginalOffsetProcessed: lastOriginalOffsetProcessed,
		lastResubmittedConfigOffset: lastResubmittedConfigOffset,
		lastCutBlockNumber:          lastCutBlockNumber,

		haltChan:                    make(chan struct{}),
		startChan:                   make(chan struct{}),
		doneReprocessingMsgInFlight: doneReprocessingMsgInFlight,
	}, nil
}
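// shouldUnblockIngress is an illustrative sketch (not part of the original
// file) of the predicate newChain applies above: ingress is unblocked when no
// config message was ever resubmitted (offset 0), or when the latest
// resubmitted config message has already been reprocessed.
func shouldUnblockIngress(lastResubmittedConfigOffset, lastOriginalOffsetProcessed int64) bool {
	// With the default offsets returned by getOffsets (both zero on a fresh
	// channel) the first clause is subsumed by the second, but newChain
	// spells both cases out for clarity.
	return lastResubmittedConfigOffset == 0 ||
		lastResubmittedConfigOffset <= lastOriginalOffsetProcessed
}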
//go:generate counterfeiter -o mock/sync_producer.go --fake-name SyncProducer . syncProducer

type syncProducer interface {
	SendMessage(msg *sarama.ProducerMessage) (partition int32, offset int64, err error)
	SendMessages(msgs []*sarama.ProducerMessage) error
	Close() error
}

type chainImpl struct {
	consenter commonConsenter
	consensus.ConsenterSupport

	channel                     channel
	lastOffsetPersisted         int64
	lastOriginalOffsetProcessed int64
	lastResubmittedConfigOffset int64
	lastCutBlockNumber          uint64

	producer        syncProducer
	parentConsumer  sarama.Consumer
	channelConsumer sarama.PartitionConsumer

	// mutex used when changing the doneReprocessingMsgInFlight
	doneReprocessingMutex sync.Mutex
	// notification that there are in-flight messages we need to wait for
	doneReprocessingMsgInFlight chan struct{}

	// When the partition consumer errors, close the channel. Otherwise, make
	// this an open, unbuffered channel.
	errorChan chan struct{}
	// When a Halt() request comes, close the channel. Unlike errorChan, this
	// channel never re-opens when closed. Its closing triggers the exit of the
	// processMessagesToBlocks loop.
	haltChan chan struct{}
	// notification that the chain has stopped processing messages into blocks
	doneProcessingMessagesToBlocks chan struct{}
	// Closed when the retriable steps in Start have completed.
	startChan chan struct{}
	// timer controls the batch timeout of cutting pending messages into a block
	timer <-chan time.Time

	replicaIDs []int32
}

// Errored returns a channel which will close when a partition consumer error
// has occurred. Checked by Deliver().
func (chain *chainImpl) Errored() <-chan struct{} {
	select {
	case <-chain.startChan:
		return chain.errorChan
	default:
		// While the consenter is starting, always return an error
		dummyError := make(chan struct{})
		close(dummyError)
		return dummyError
	}
}

// Start allocates the necessary resources for staying up to date with this
// Chain. Implements the consensus.Chain interface. Called by
// consensus.NewManagerImpl() which is invoked when the ordering process is
// launched, before the call to NewServer(). Launches a goroutine so as not to
// block the consensus.Manager.
func (chain *chainImpl) Start() {
	go startThread(chain)
}
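// closedChan is an illustrative sketch (not part of the original file) of
// the idiom Errored relies on above: a receive from a closed channel
// completes immediately, so handing callers an already-closed channel
// reports "currently errored" without tracking any extra state.
func closedChan() <-chan struct{} {
	ch := make(chan struct{})
	close(ch) // every subsequent receive returns the zero value at once
	return ch
}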
// Halt frees the resources which were allocated for this Chain. Implements the
// consensus.Chain interface.
func (chain *chainImpl) Halt() {
	select {
	case <-chain.startChan:
		// chain finished starting, so we can halt it
		select {
		case <-chain.haltChan:
			// This construct is useful because it allows Halt() to be called
			// multiple times (by a single thread) w/o panicking. Recall that a
			// receive from a closed channel returns (the zero value) immediately.
			logger.Warningf("[channel: %s] Halting of chain requested again", chain.ChannelID())
		default:
			logger.Criticalf("[channel: %s] Halting of chain requested", chain.ChannelID())
			// start shutdown of chain
			close(chain.haltChan)
			// wait for processing of messages to blocks to finish shutting down
			<-chain.doneProcessingMessagesToBlocks
			// close the kafka producer and the consumer
			chain.closeKafkaObjects()
			logger.Debugf("[channel: %s] Closed the haltChan", chain.ChannelID())
		}
	default:
		logger.Warningf("[channel: %s] Waiting for chain to finish starting before halting", chain.ChannelID())
		<-chain.startChan
		chain.Halt()
	}
}

func (chain *chainImpl) WaitReady() error {
	select {
	case <-chain.startChan: // The Start phase has completed
		select {
		case <-chain.haltChan: // The chain has been halted, stop here
			return fmt.Errorf("consenter for this channel has been halted")
		case <-chain.doneReprocessing(): // Block waiting for all re-submitted messages to be reprocessed
			return nil
		}
	default: // Not ready yet
		return fmt.Errorf("backing Kafka cluster has not completed booting; try again later")
	}
}

func (chain *chainImpl) doneReprocessing() <-chan struct{} {
	chain.doneReprocessingMutex.Lock()
	defer chain.doneReprocessingMutex.Unlock()
	return chain.doneReprocessingMsgInFlight
}

func (chain *chainImpl) reprocessConfigComplete() {
	chain.doneReprocessingMutex.Lock()
	defer chain.doneReprocessingMutex.Unlock()
	close(chain.doneReprocessingMsgInFlight)
}

func (chain *chainImpl) reprocessConfigPending() {
	chain.doneReprocessingMutex.Lock()
	defer chain.doneReprocessingMutex.Unlock()
	chain.doneReprocessingMsgInFlight = make(chan struct{})
}

// Implements the consensus.Chain interface. Called by Broadcast().
func (chain *chainImpl) Order(env *cb.Envelope, configSeq uint64) error {
	return chain.order(env, configSeq, int64(0))
}

func (chain *chainImpl) order(env *cb.Envelope, configSeq uint64, originalOffset int64) error {
	marshaledEnv, err := protoutil.Marshal(env)
	if err != nil {
		return errors.Errorf("cannot enqueue, unable to marshal envelope: %s", err)
	}
	if !chain.enqueue(newNormalMessage(marshaledEnv, configSeq, originalOffset)) {
		return errors.Errorf("cannot enqueue")
	}
	return nil
}

// Implements the consensus.Chain interface. Called by Broadcast().
func (chain *chainImpl) Configure(config *cb.Envelope, configSeq uint64) error {
	return chain.configure(config, configSeq, int64(0))
}

func (chain *chainImpl) configure(config *cb.Envelope, configSeq uint64, originalOffset int64) error {
	marshaledConfig, err := protoutil.Marshal(config)
	if err != nil {
		return fmt.Errorf("cannot enqueue, unable to marshal config because %s", err)
	}
	if !chain.enqueue(newConfigMessage(marshaledConfig, configSeq, originalOffset)) {
		return fmt.Errorf("cannot enqueue")
	}
	return nil
}
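// reprocessGate is an illustrative sketch (not part of the original file) of
// the resettable-gate pattern implemented by doneReprocessing,
// reprocessConfigComplete and reprocessConfigPending above: closing the
// channel opens the gate for all current and future waiters, and swapping in
// a fresh channel shuts it again. As in the original, open must not be
// called twice without an intervening shut, since closing a closed channel
// panics.
type reprocessGate struct {
	mu sync.Mutex
	ch chan struct{}
}

func (g *reprocessGate) wait() <-chan struct{} {
	g.mu.Lock()
	defer g.mu.Unlock()
	return g.ch
}

func (g *reprocessGate) open() {
	g.mu.Lock()
	defer g.mu.Unlock()
	close(g.ch) // unblocks everyone selecting on wait()
}

func (g *reprocessGate) shut() {
	g.mu.Lock()
	defer g.mu.Unlock()
	g.ch = make(chan struct{}) // future wait() calls block again
}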
// enqueue accepts a message and returns true on acceptance, or false otherwise.
func (chain *chainImpl) enqueue(kafkaMsg *ab.KafkaMessage) bool {
	logger.Debugf("[channel: %s] Enqueueing envelope...", chain.ChannelID())
	select {
	case <-chain.startChan: // The Start phase has completed
		select {
		case <-chain.haltChan: // The chain has been halted, stop here
			logger.Warningf("[channel: %s] consenter for this channel has been halted", chain.ChannelID())
			return false
		default: // The post path
			payload, err := protoutil.Marshal(kafkaMsg)
			if err != nil {
				logger.Errorf("[channel: %s] unable to marshal Kafka message because = %s", chain.ChannelID(), err)
				return false
			}
			message := newProducerMessage(chain.channel, payload)
			if _, _, err = chain.producer.SendMessage(message); err != nil {
				logger.Errorf("[channel: %s] cannot enqueue envelope because = %s", chain.ChannelID(), err)
				return false
			}
			logger.Debugf("[channel: %s] Envelope enqueued successfully", chain.ChannelID())
			return true
		}
	default: // Not ready yet
		logger.Warningf("[channel: %s] Will not enqueue, consenter for this channel hasn't started yet", chain.ChannelID())
		return false
	}
}

func (chain *chainImpl) HealthCheck(ctx context.Context) error {
	var err error

	payload := protoutil.MarshalOrPanic(newConnectMessage())
	message := newProducerMessage(chain.channel, payload)

	_, _, err = chain.producer.SendMessage(message)
	if err != nil {
		logger.Warnf("[channel: %s] Cannot post CONNECT message = %s", chain.channel.topic(), err)
		if err == sarama.ErrNotEnoughReplicas {
			errMsg := fmt.Sprintf("[replica ids: %d]", chain.replicaIDs)
			return errors.WithMessage(err, errMsg)
		}
	}
	return nil
}
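// healthCheckErrIsReplicaRelated is an illustrative sketch (not part of the
// original file) showing how a caller might recognize the replica-annotated
// error HealthCheck returns above: errors.WithMessage keeps the original
// error as the cause, so errors.Cause recovers the underlying sarama error
// even after the replica-ID context has been prepended.
func healthCheckErrIsReplicaRelated(err error) bool {
	return errors.Cause(err) == sarama.ErrNotEnoughReplicas
}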
// Called by Start().
func startThread(chain *chainImpl) {
	var err error

	// Create the topic if it does not exist (requires Kafka v0.10.1.0)
	err = setupTopicForChannel(chain.consenter.retryOptions(), chain.haltChan, chain.SharedConfig().KafkaBrokers(), chain.consenter.brokerConfig(), chain.consenter.topicDetail(), chain.channel)
	if err != nil {
		// log for now and fall back to the broker's auto-create-topics setting
		logger.Infof("[channel: %s]: failed to create Kafka topic = %s", chain.channel.topic(), err)
	}

	// Set up the producer
	chain.producer, err = setupProducerForChannel(chain.consenter.retryOptions(), chain.haltChan, chain.SharedConfig().KafkaBrokers(), chain.consenter.brokerConfig(), chain.channel)
	if err != nil {
		logger.Panicf("[channel: %s] Cannot set up producer = %s", chain.channel.topic(), err)
	}
	logger.Infof("[channel: %s] Producer set up successfully", chain.ChannelID())

	// Have the producer post the CONNECT message
	if err = sendConnectMessage(chain.consenter.retryOptions(), chain.haltChan, chain.producer, chain.channel); err != nil {
		logger.Panicf("[channel: %s] Cannot post CONNECT message = %s", chain.channel.topic(), err)
	}
	logger.Infof("[channel: %s] CONNECT message posted successfully", chain.channel.topic())

	// Set up the parent consumer
	chain.parentConsumer, err = setupParentConsumerForChannel(chain.consenter.retryOptions(), chain.haltChan, chain.SharedConfig().KafkaBrokers(), chain.consenter.brokerConfig(), chain.channel)
	if err != nil {
		logger.Panicf("[channel: %s] Cannot set up parent consumer = %s", chain.channel.topic(), err)
	}
	logger.Infof("[channel: %s] Parent consumer set up successfully", chain.channel.topic())

	// Set up the channel consumer
	chain.channelConsumer, err = setupChannelConsumerForChannel(chain.consenter.retryOptions(), chain.haltChan, chain.parentConsumer, chain.channel, chain.lastOffsetPersisted+1)
	if err != nil {
		logger.Panicf("[channel: %s] Cannot set up channel consumer = %s", chain.channel.topic(), err)
	}
	logger.Infof("[channel: %s] Channel consumer set up successfully", chain.channel.topic())

	chain.replicaIDs, err = getHealthyClusterReplicaInfo(chain.consenter.retryOptions(), chain.haltChan, chain.SharedConfig().KafkaBrokers(), chain.consenter.brokerConfig(), chain.channel)
	if err != nil {
		logger.Panicf("[channel: %s] failed to get replica IDs = %s", chain.channel.topic(), err)
	}

	chain.doneProcessingMessagesToBlocks = make(chan struct{})

	chain.errorChan = make(chan struct{}) // Deliver requests will also go through
	close(chain.startChan)                // Broadcast requests will now go through

	logger.Infof("[channel: %s] Start phase completed successfully", chain.channel.topic())

	chain.processMessagesToBlocks() // Keep up to date with the channel
}
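// nextConsumedOffset is an illustrative sketch (not part of the original
// file) of the replay rule startThread applies above when it sets up the
// channel consumer: consumption resumes at the offset immediately after the
// last one whose envelope was persisted into the ledger, so no message is
// processed twice and none is skipped across restarts.
func nextConsumedOffset(lastOffsetPersisted int64) int64 {
	// On a fresh channel, getOffsets returns sarama.OffsetOldest-1, so this
	// yields sarama.OffsetOldest, i.e. consume from the beginning.
	return lastOffsetPersisted + 1
}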
// processMessagesToBlocks drains the Kafka consumer for the given channel, and
// takes care of converting the stream of ordered messages into blocks for the
// channel's ledger.
func (chain *chainImpl) processMessagesToBlocks() ([]uint64, error) {
	counts := make([]uint64, 11) // For metrics and tests
	msg := new(ab.KafkaMessage)

	defer func() {
		// notify that we are no longer processing messages to blocks
		close(chain.doneProcessingMessagesToBlocks)
	}()

	defer func() { // When Halt() is called
		select {
		case <-chain.errorChan: // If already closed, don't do anything
		default:
			close(chain.errorChan)
		}
	}()

	subscription := fmt.Sprintf("added subscription to %s/%d", chain.channel.topic(), chain.channel.partition())
	var topicPartitionSubscriptionResumed <-chan string
	var deliverSessionTimer *time.Timer
	var deliverSessionTimedOut <-chan time.Time

	for {
		select {
		case <-chain.haltChan:
			logger.Warningf("[channel: %s] Consenter for channel exiting", chain.ChannelID())
			counts[indexExitChanPass]++
			return counts, nil
		case kafkaErr := <-chain.channelConsumer.Errors():
			logger.Errorf("[channel: %s] Error during consumption: %s", chain.ChannelID(), kafkaErr)
			counts[indexRecvError]++
			select {
			case <-chain.errorChan: // If already closed, don't do anything
			default:
				switch kafkaErr.Err {
				case sarama.ErrOffsetOutOfRange:
					// the Kafka consumer will auto-retry for all errors except for ErrOffsetOutOfRange
					logger.Errorf("[channel: %s] Unrecoverable error during consumption: %s", chain.ChannelID(), kafkaErr)
					close(chain.errorChan)
				default:
					if topicPartitionSubscriptionResumed == nil {
						// register listener
						topicPartitionSubscriptionResumed = saramaLogger.NewListener(subscription)
						// start session timeout timer
						deliverSessionTimer = time.NewTimer(chain.consenter.retryOptions().NetworkTimeouts.ReadTimeout)
						deliverSessionTimedOut = deliverSessionTimer.C
					}
				}
			}
			select {
			case <-chain.errorChan: // we are not ignoring the error
				logger.Warningf("[channel: %s] Closed the errorChan", chain.ChannelID())
				// This covers the edge case where (1) a consumption error has
				// closed the errorChan and thus rendered the chain unavailable to
				// deliver clients, (2) we're already at the newest offset, and (3)
				// there are no new Broadcast requests coming in. In this case,
				// there is no trigger that can recreate the errorChan again and
				// mark the chain as available, so we have to force that trigger via
				// the emission of a CONNECT message. TODO Consider rate limiting
				go sendConnectMessage(chain.consenter.retryOptions(), chain.haltChan, chain.producer, chain.channel)
			default: // we are ignoring the error
				logger.Warningf("[channel: %s] Deliver sessions will be dropped if consumption errors continue.", chain.ChannelID())
			}
		case <-topicPartitionSubscriptionResumed:
			// stop listening for subscription message
			saramaLogger.RemoveListener(subscription, topicPartitionSubscriptionResumed)
			// disable subscription event chan
			topicPartitionSubscriptionResumed = nil

			// stop timeout timer
			if !deliverSessionTimer.Stop() {
				<-deliverSessionTimer.C
			}
			logger.Warningf("[channel: %s] Consumption will resume.", chain.ChannelID())

		case <-deliverSessionTimedOut:
			// stop listening for subscription message
			saramaLogger.RemoveListener(subscription, topicPartitionSubscriptionResumed)
			// disable subscription event chan
			topicPartitionSubscriptionResumed = nil

			close(chain.errorChan)
			logger.Warningf("[channel: %s] Closed the errorChan", chain.ChannelID())

			// make chain available again via CONNECT message trigger
			go sendConnectMessage(chain.consenter.retryOptions(), chain.haltChan, chain.producer, chain.channel)

		case in, ok := <-chain.channelConsumer.Messages():
			if !ok {
				logger.Criticalf("[channel: %s] Kafka consumer closed.", chain.ChannelID())
				return counts, nil
			}

			// catch the possibility that we missed a topic subscription event before
			// we registered the event listener
			if topicPartitionSubscriptionResumed != nil {
				// stop listening for subscription message
				saramaLogger.RemoveListener(subscription, topicPartitionSubscriptionResumed)
				// disable subscription event chan
				topicPartitionSubscriptionResumed = nil
				// stop timeout timer
				if !deliverSessionTimer.Stop() {
					<-deliverSessionTimer.C
				}
			}

			select {
			case <-chain.errorChan: // If this channel was closed...
				chain.errorChan = make(chan struct{}) // ...make a new one.
				logger.Infof("[channel: %s] Marked consenter as available again", chain.ChannelID())
			default:
			}
			if err := proto.Unmarshal(in.Value, msg); err != nil {
				// This shouldn't happen, it should be filtered at ingress
				logger.Criticalf("[channel: %s] Unable to unmarshal consumed message = %s", chain.ChannelID(), err)
				counts[indexUnmarshalError]++
				continue
			} else {
				logger.Debugf("[channel: %s] Successfully unmarshalled consumed message, offset is %d. Inspecting type...", chain.ChannelID(), in.Offset)
				counts[indexRecvPass]++
			}
			switch msg.Type.(type) {
			case *ab.KafkaMessage_Connect:
				_ = chain.processConnect(chain.ChannelID())
				counts[indexProcessConnectPass]++
			case *ab.KafkaMessage_TimeToCut:
				if err := chain.processTimeToCut(msg.GetTimeToCut(), in.Offset); err != nil {
					logger.Warningf("[channel: %s] %s", chain.ChannelID(), err)
					logger.Criticalf("[channel: %s] Consenter for channel exiting", chain.ChannelID())
					counts[indexProcessTimeToCutError]++
					return counts, err // TODO Revisit whether we should indeed stop processing the chain at this point
				}
				counts[indexProcessTimeToCutPass]++
			case *ab.KafkaMessage_Regular:
				if err := chain.processRegular(msg.GetRegular(), in.Offset); err != nil {
					logger.Warningf("[channel: %s] Error when processing incoming message of type REGULAR = %s", chain.ChannelID(), err)
					counts[indexProcessRegularError]++
				} else {
					counts[indexProcessRegularPass]++
				}
			}
		case <-chain.timer:
			if err := sendTimeToCut(chain.producer, chain.channel, chain.lastCutBlockNumber+1, &chain.timer); err != nil {
				logger.Errorf("[channel: %s] cannot post time-to-cut message = %s", chain.ChannelID(), err)
				// Do not return though
				counts[indexSendTimeToCutError]++
			} else {
				counts[indexSendTimeToCutPass]++
			}
		}
	}
}

func (chain *chainImpl) closeKafkaObjects() []error {
	var errs []error

	err := chain.channelConsumer.Close()
	if err != nil {
		logger.Errorf("[channel: %s] could not close channelConsumer cleanly = %s", chain.ChannelID(), err)
		errs = append(errs, err)
	} else {
		logger.Debugf("[channel: %s] Closed the channel consumer", chain.ChannelID())
	}

	err = chain.parentConsumer.Close()
	if err != nil {
		logger.Errorf("[channel: %s] could not close parentConsumer cleanly = %s", chain.ChannelID(), err)
		errs = append(errs, err)
	} else {
		logger.Debugf("[channel: %s] Closed the parent consumer", chain.ChannelID())
	}

	err = chain.producer.Close()
	if err != nil {
		logger.Errorf("[channel: %s] could not close producer cleanly = %s", chain.ChannelID(), err)
		errs = append(errs, err)
	} else {
		logger.Debugf("[channel: %s] Closed the producer", chain.ChannelID())
	}

	return errs
}

// Helper functions

func getLastCutBlockNumber(blockchainHeight uint64) uint64 {
	return blockchainHeight - 1
}

func getOffsets(metadataValue []byte, chainID string) (persisted int64, processed int64, resubmitted int64) {
	if metadataValue != nil {
		// Extract orderer-related metadata from the tip of the ledger first
		kafkaMetadata := &ab.KafkaMetadata{}
		if err := proto.Unmarshal(metadataValue, kafkaMetadata); err != nil {
			logger.Panicf("[channel: %s] Ledger may be corrupted: "+
				"cannot unmarshal orderer metadata in most recent block", chainID)
		}
		return kafkaMetadata.LastOffsetPersisted,
			kafkaMetadata.LastOriginalOffsetProcessed,
			kafkaMetadata.LastResubmittedConfigOffset
	}
	return sarama.OffsetOldest - 1, int64(0), int64(0) // default
}

func newConnectMessage() *ab.KafkaMessage {
	return &ab.KafkaMessage{
		Type: &ab.KafkaMessage_Connect{
			Connect: &ab.KafkaMessageConnect{
				Payload: nil,
			},
		},
	}
}
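// encodeKafkaMetadata is an illustrative sketch (not part of the original
// file): the inverse of getOffsets above. Marshalling a KafkaMetadata with
// these three offsets and feeding the bytes back through getOffsets yields
// the same triple, which is exactly the round trip that WriteBlock and
// newChain perform across an orderer restart.
func encodeKafkaMetadata(persisted, processed, resubmitted int64) []byte {
	return protoutil.MarshalOrPanic(&ab.KafkaMetadata{
		LastOffsetPersisted:         persisted,
		LastOriginalOffsetProcessed: processed,
		LastResubmittedConfigOffset: resubmitted,
	})
}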
func newNormalMessage(payload []byte, configSeq uint64, originalOffset int64) *ab.KafkaMessage {
	return &ab.KafkaMessage{
		Type: &ab.KafkaMessage_Regular{
			Regular: &ab.KafkaMessageRegular{
				Payload:        payload,
				ConfigSeq:      configSeq,
				Class:          ab.KafkaMessageRegular_NORMAL,
				OriginalOffset: originalOffset,
			},
		},
	}
}

func newConfigMessage(config []byte, configSeq uint64, originalOffset int64) *ab.KafkaMessage {
	return &ab.KafkaMessage{
		Type: &ab.KafkaMessage_Regular{
			Regular: &ab.KafkaMessageRegular{
				Payload:        config,
				ConfigSeq:      configSeq,
				Class:          ab.KafkaMessageRegular_CONFIG,
				OriginalOffset: originalOffset,
			},
		},
	}
}

func newTimeToCutMessage(blockNumber uint64) *ab.KafkaMessage {
	return &ab.KafkaMessage{
		Type: &ab.KafkaMessage_TimeToCut{
			TimeToCut: &ab.KafkaMessageTimeToCut{
				BlockNumber: blockNumber,
			},
		},
	}
}

func newProducerMessage(channel channel, pld []byte) *sarama.ProducerMessage {
	return &sarama.ProducerMessage{
		Topic: channel.topic(),
		Key:   sarama.StringEncoder(strconv.Itoa(int(channel.partition()))), // TODO Consider writing an IntEncoder?
		Value: sarama.ByteEncoder(pld),
	}
}

func (chain *chainImpl) processConnect(channelName string) error {
	logger.Debugf("[channel: %s] It's a connect message - ignoring", channelName)
	return nil
}
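// int32Encoder is an illustrative sketch (not part of the original file) of
// the IntEncoder that the TODO in newProducerMessage alludes to: sarama's
// Encoder interface only requires Encode and Length, so a fixed-width
// big-endian key would suffice. Note the wire format differs from the current
// decimal-string key, so switching encoders on a live topic would change the
// key bytes of subsequently produced messages.
type int32Encoder int32

var _ sarama.Encoder = int32Encoder(0) // compile-time interface check

func (e int32Encoder) Encode() ([]byte, error) {
	return []byte{byte(e >> 24), byte(e >> 16), byte(e >> 8), byte(e)}, nil
}

func (e int32Encoder) Length() int { return 4 }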
func (chain *chainImpl) processRegular(regularMessage *ab.KafkaMessageRegular, receivedOffset int64) error {
	// When committing a normal message, we also update `lastOriginalOffsetProcessed` with `newOffset`.
	// It is the caller's responsibility to deduce the correct value of `newOffset` based on the following rules:
	// - if Resubmission is switched off, it should always be zero
	// - if the message is committed on the first pass, meaning it's not re-validated and re-ordered, this value
	//   should be the same as the current `lastOriginalOffsetProcessed`
	// - if the message is re-validated and re-ordered, this value should be the `OriginalOffset` of that
	//   Kafka message, so that `lastOriginalOffsetProcessed` is advanced
	commitNormalMsg := func(message *cb.Envelope, newOffset int64) {
		batches, pending := chain.BlockCutter().Ordered(message)
		logger.Debugf("[channel: %s] Ordering results: items in batch = %d, pending = %v", chain.ChannelID(), len(batches), pending)

		switch {
		case chain.timer != nil && !pending:
			// Timer is already running but there are no messages pending, stop the timer
			chain.timer = nil
		case chain.timer == nil && pending:
			// Timer is not already running and there are messages pending, so start it
			chain.timer = time.After(chain.SharedConfig().BatchTimeout())
			logger.Debugf("[channel: %s] Just began %s batch timer", chain.ChannelID(), chain.SharedConfig().BatchTimeout().String())
		default:
			// Do nothing when:
			// 1. Timer is already running and there are messages pending
			// 2. Timer is not set and there are no messages pending
		}

		if len(batches) == 0 {
			// If no block is cut, we update `lastOriginalOffsetProcessed`, start the timer if necessary and return
			chain.lastOriginalOffsetProcessed = newOffset
			return
		}

		offset := receivedOffset
		if pending || len(batches) == 2 {
			// If the newest envelope is not encapsulated into the first batch,
			// the `LastOffsetPersisted` should be `receivedOffset` - 1.
			offset--
		} else {
			// We are just cutting exactly one block, so it is safe to update
			// `lastOriginalOffsetProcessed` with `newOffset` here, and then
			// encapsulate it into this block. Otherwise, if we are cutting two
			// blocks, the first one should use the current `lastOriginalOffsetProcessed`
			// and the second one should use `newOffset`, which is also used to
			// update `lastOriginalOffsetProcessed`
			chain.lastOriginalOffsetProcessed = newOffset
		}

		// Commit the first block
		block := chain.CreateNextBlock(batches[0])
		metadata := &ab.KafkaMetadata{
			LastOffsetPersisted:         offset,
			LastOriginalOffsetProcessed: chain.lastOriginalOffsetProcessed,
			LastResubmittedConfigOffset: chain.lastResubmittedConfigOffset,
		}
		chain.WriteBlock(block, metadata)
		chain.lastCutBlockNumber++
		logger.Debugf("[channel: %s] Batch filled, just cut block [%d] - last persisted offset is now %d", chain.ChannelID(), chain.lastCutBlockNumber, offset)

		// Commit the second block if it exists
		if len(batches) == 2 {
			chain.lastOriginalOffsetProcessed = newOffset
			offset++

			block := chain.CreateNextBlock(batches[1])
			metadata := &ab.KafkaMetadata{
				LastOffsetPersisted:         offset,
				LastOriginalOffsetProcessed: newOffset,
				LastResubmittedConfigOffset: chain.lastResubmittedConfigOffset,
			}
			chain.WriteBlock(block, metadata)
			chain.lastCutBlockNumber++
			logger.Debugf("[channel: %s] Batch filled, just cut block [%d] - last persisted offset is now %d", chain.ChannelID(), chain.lastCutBlockNumber, offset)
		}
	}

	// When committing a config message, we also update `lastOriginalOffsetProcessed` with `newOffset`.
	// It is the caller's responsibility to deduce the correct value of `newOffset` based on the following rules:
	// - if Resubmission is switched off, it should always be zero
	// - if the message is committed on the first pass, meaning it's not re-validated and re-ordered, this value
	//   should be the same as the current `lastOriginalOffsetProcessed`
	// - if the message is re-validated and re-ordered, this value should be the `OriginalOffset` of that
	//   Kafka message, so that `lastOriginalOffsetProcessed` is advanced
	commitConfigMsg := func(message *cb.Envelope, newOffset int64) {
		logger.Debugf("[channel: %s] Received config message", chain.ChannelID())
		batch := chain.BlockCutter().Cut()

		if batch != nil {
			logger.Debugf("[channel: %s] Cut pending messages into block", chain.ChannelID())
			block := chain.CreateNextBlock(batch)
			metadata := &ab.KafkaMetadata{
				LastOffsetPersisted:         receivedOffset - 1,
				LastOriginalOffsetProcessed: chain.lastOriginalOffsetProcessed,
				LastResubmittedConfigOffset: chain.lastResubmittedConfigOffset,
			}
			chain.WriteBlock(block, metadata)
			chain.lastCutBlockNumber++
		}

		logger.Debugf("[channel: %s] Creating isolated block for config message", chain.ChannelID())
		chain.lastOriginalOffsetProcessed = newOffset
		block := chain.CreateNextBlock([]*cb.Envelope{message})
		metadata := &ab.KafkaMetadata{
			LastOffsetPersisted:         receivedOffset,
			LastOriginalOffsetProcessed: chain.lastOriginalOffsetProcessed,
			LastResubmittedConfigOffset: chain.lastResubmittedConfigOffset,
		}
		chain.WriteConfigBlock(block, metadata)
		chain.lastCutBlockNumber++
		chain.timer = nil
	}

	seq := chain.Sequence()

	env := &cb.Envelope{}
	if err := proto.Unmarshal(regularMessage.Payload, env); err != nil {
		// This shouldn't happen, it should be filtered at ingress
		return fmt.Errorf("failed to unmarshal payload of regular message because = %s", err)
	}

	logger.Debugf("[channel: %s] Processing regular Kafka message of type %s", chain.ChannelID(), regularMessage.Class.String())

	// If we receive a message from a pre-v1.1 orderer, or resubmission is explicitly disabled, every orderer
	// should operate as the pre-v1.1 ones: validate again and not attempt to reorder. That is because the
	// pre-v1.1 orderers cannot identify re-ordered messages and resubmissions could lead to committing
	// the same message twice.
	//
	// The implicit assumption here is that the resubmission capability flag is set only when there are no more
	// pre-v1.1 orderers on the network. Otherwise it is unset, and this is what we call a compatibility mode.
	if regularMessage.Class == ab.KafkaMessageRegular_UNKNOWN || !chain.SharedConfig().Capabilities().Resubmission() {
		// Received a regular message of type UNKNOWN or resubmission is off, indicating an OSN network with v1.0.x orderers
		logger.Warningf("[channel: %s] This orderer is running in compatibility mode", chain.ChannelID())

		chdr, err := protoutil.ChannelHeader(env)
		if err != nil {
			return fmt.Errorf("discarding bad config message because of channel header unmarshalling error = %s", err)
		}

		class := chain.ClassifyMsg(chdr)
		switch class {
		case msgprocessor.ConfigMsg:
			if _, _, err := chain.ProcessConfigMsg(env); err != nil {
				return fmt.Errorf("discarding bad config message because = %s", err)
			}

			commitConfigMsg(env, chain.lastOriginalOffsetProcessed)

		case msgprocessor.NormalMsg:
			if _, err := chain.ProcessNormalMsg(env); err != nil {
				return fmt.Errorf("discarding bad normal message because = %s", err)
			}

			commitNormalMsg(env, chain.lastOriginalOffsetProcessed)

		case msgprocessor.ConfigUpdateMsg:
			return fmt.Errorf("not expecting message of type ConfigUpdate")

		default:
			logger.Panicf("[channel: %s] Unsupported message classification: %v", chain.ChannelID(), class)
		}

		return nil
	}

	switch regularMessage.Class {
	case ab.KafkaMessageRegular_UNKNOWN:
		logger.Panicf("[channel: %s] Kafka message of type UNKNOWN should have been processed already", chain.ChannelID())

	case ab.KafkaMessageRegular_NORMAL:
		// This is a message that is re-validated and re-ordered
		if regularMessage.OriginalOffset != 0 {
			logger.Debugf("[channel: %s] Received re-submitted normal message with original offset %d", chain.ChannelID(), regularMessage.OriginalOffset)

			// But we've reprocessed it already
			if regularMessage.OriginalOffset <= chain.lastOriginalOffsetProcessed {
				logger.Debugf(
					"[channel: %s] OriginalOffset(%d) <= LastOriginalOffsetProcessed(%d), message has been consumed already, discard",
					chain.ChannelID(), regularMessage.OriginalOffset, chain.lastOriginalOffsetProcessed)
				return nil
			}

			logger.Debugf(
				"[channel: %s] OriginalOffset(%d) > LastOriginalOffsetProcessed(%d), "+
					"this is the first time we receive this re-submitted normal message",
				chain.ChannelID(), regularMessage.OriginalOffset, chain.lastOriginalOffsetProcessed)

			// In case we haven't reprocessed the message, there's no need to differentiate it from those
			// messages that will be processed for the first time.
		}

		// The config sequence has advanced
		if regularMessage.ConfigSeq < seq {
			logger.Debugf("[channel: %s] Config sequence has advanced since this normal message got validated, re-validating", chain.ChannelID())
			configSeq, err := chain.ProcessNormalMsg(env)
			if err != nil {
				return fmt.Errorf("discarding bad normal message because = %s", err)
			}

			logger.Debugf("[channel: %s] Normal message is still valid, re-submit", chain.ChannelID())

			// For messages ordered for the first time as well as re-ordered ones, we set the original offset
			// to the current received offset and re-order the message.
			if err := chain.order(env, configSeq, receivedOffset); err != nil {
				return fmt.Errorf("error re-submitting normal message because = %s", err)
			}

			return nil
		}

		// Any messages coming in here may or may not have been re-validated
		// and re-ordered, BUT they are definitely valid here

		// advance lastOriginalOffsetProcessed if the message was re-validated and re-ordered
		offset := regularMessage.OriginalOffset
		if offset == 0 {
			offset = chain.lastOriginalOffsetProcessed
		}

		commitNormalMsg(env, offset)

	case ab.KafkaMessageRegular_CONFIG:
		// This is a message that is re-validated and re-ordered
		if regularMessage.OriginalOffset != 0 {
			logger.Debugf("[channel: %s] Received re-submitted config message with original offset %d", chain.ChannelID(), regularMessage.OriginalOffset)

			// But we've reprocessed it already
			if regularMessage.OriginalOffset <= chain.lastOriginalOffsetProcessed {
				logger.Debugf(
					"[channel: %s] OriginalOffset(%d) <= LastOriginalOffsetProcessed(%d), message has been consumed already, discard",
					chain.ChannelID(), regularMessage.OriginalOffset, chain.lastOriginalOffsetProcessed)
				return nil
			}

			logger.Debugf(
				"[channel: %s] OriginalOffset(%d) > LastOriginalOffsetProcessed(%d), "+
					"this is the first time we receive this re-submitted config message",
				chain.ChannelID(), regularMessage.OriginalOffset, chain.lastOriginalOffsetProcessed)

			if regularMessage.OriginalOffset == chain.lastResubmittedConfigOffset && // This is the very last resubmitted config message
				regularMessage.ConfigSeq == seq { // AND we don't need to resubmit it again
				logger.Debugf("[channel: %s] Config message with original offset %d is the last in-flight resubmitted message "+
					"and it does not require revalidation, unblock ingress messages now", chain.ChannelID(), regularMessage.OriginalOffset)
				chain.reprocessConfigComplete() // Therefore, we can finally unblock broadcast
			}

			// Somebody resubmitted a message at offset X, whereas we didn't. This is due to non-determinism where
			// that message was considered invalid by us during re-validation, however somebody else deemed it to
			// be valid, and resubmitted it. We need to advance lastResubmittedConfigOffset in this case in order
			// to enforce consistency across the network.
			if chain.lastResubmittedConfigOffset < regularMessage.OriginalOffset {
				chain.lastResubmittedConfigOffset = regularMessage.OriginalOffset
			}
		}

		// The config sequence has advanced
		if regularMessage.ConfigSeq < seq {
			logger.Debugf("[channel: %s] Config sequence has advanced since this config message got validated, re-validating", chain.ChannelID())
			configEnv, configSeq, err := chain.ProcessConfigMsg(env)
			if err != nil {
				return fmt.Errorf("rejecting config message because = %s", err)
			}

			// For messages ordered for the first time as well as re-ordered ones, we set the original offset
			// to the current received offset and re-order the message.
			if err := chain.configure(configEnv, configSeq, receivedOffset); err != nil {
				return fmt.Errorf("error re-submitting config message because = %s", err)
			}

			logger.Debugf("[channel: %s] Resubmitted config message with offset %d, block ingress messages", chain.ChannelID(), receivedOffset)
			chain.lastResubmittedConfigOffset = receivedOffset // Keep track of the last resubmitted message offset
			chain.reprocessConfigPending()                     // Begin blocking ingress messages

			return nil
		}

		// Any messages coming in here may or may not have been re-validated
		// and re-ordered, BUT they are definitely valid here

		// advance lastOriginalOffsetProcessed if the message was re-validated and re-ordered
		offset := regularMessage.OriginalOffset
		if offset == 0 {
			offset = chain.lastOriginalOffsetProcessed
		}

		commitConfigMsg(env, offset)

	default:
		return errors.Errorf("unsupported regular kafka message type: %v", regularMessage.Class.String())
	}

	return nil
}

func (chain *chainImpl) processTimeToCut(ttcMessage *ab.KafkaMessageTimeToCut, receivedOffset int64) error {
	ttcNumber := ttcMessage.GetBlockNumber()
	logger.Debugf("[channel: %s] It's a time-to-cut message for block [%d]", chain.ChannelID(), ttcNumber)
	if ttcNumber == chain.lastCutBlockNumber+1 {
		chain.timer = nil
		logger.Debugf("[channel: %s] Nil'd the timer", chain.ChannelID())
		batch := chain.BlockCutter().Cut()
		if len(batch) == 0 {
			return fmt.Errorf("got right time-to-cut message (for block [%d]),"+
				" no pending requests though; this might indicate a bug", chain.lastCutBlockNumber+1)
		}
		block := chain.CreateNextBlock(batch)
		metadata := &ab.KafkaMetadata{
			LastOffsetPersisted:         receivedOffset,
			LastOriginalOffsetProcessed: chain.lastOriginalOffsetProcessed,
		}
		chain.WriteBlock(block, metadata)
		chain.lastCutBlockNumber++
		logger.Debugf("[channel: %s] Proper time-to-cut received, just cut block [%d]", chain.ChannelID(), chain.lastCutBlockNumber)
		return nil
	} else if ttcNumber > chain.lastCutBlockNumber+1 {
		return fmt.Errorf("got larger time-to-cut message (%d) than allowed/expected (%d)"+
			" - this might indicate a bug", ttcNumber, chain.lastCutBlockNumber+1)
	}
	logger.Debugf("[channel: %s] Ignoring stale time-to-cut message for block [%d]", chain.ChannelID(), ttcNumber)
	return nil
}
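// classifyTimeToCut is an illustrative sketch (not part of the original file)
// of the three-way decision processTimeToCut makes above: exactly the next
// block number cuts a block, anything larger indicates a bug, and anything
// smaller is a stale duplicate (several orderers race to post the same
// time-to-cut message, and only the first one has an effect).
func classifyTimeToCut(ttcNumber, lastCutBlockNumber uint64) string {
	switch {
	case ttcNumber == lastCutBlockNumber+1:
		return "cut" // the expected time-to-cut: cut a block now
	case ttcNumber > lastCutBlockNumber+1:
		return "error" // larger than allowed/expected
	default:
		return "stale" // already cut; ignore
	}
}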
// WriteBlock acts as a wrapper around the consenter support WriteBlock, encoding the metadata,
// and updating the metrics.
func (chain *chainImpl) WriteBlock(block *cb.Block, metadata *ab.KafkaMetadata) {
	chain.ConsenterSupport.WriteBlock(block, protoutil.MarshalOrPanic(metadata))
	chain.consenter.Metrics().LastOffsetPersisted.With("channel", chain.ChannelID()).Set(float64(metadata.LastOffsetPersisted))
}

// WriteConfigBlock acts as a wrapper around the consenter support WriteConfigBlock, encoding the metadata,
// and updating the metrics.
func (chain *chainImpl) WriteConfigBlock(block *cb.Block, metadata *ab.KafkaMetadata) {
	chain.ConsenterSupport.WriteConfigBlock(block, protoutil.MarshalOrPanic(metadata))
	chain.consenter.Metrics().LastOffsetPersisted.With("channel", chain.ChannelID()).Set(float64(metadata.LastOffsetPersisted))
}

// Post a CONNECT message to the channel using the given retry options. This
// prevents the panicking that would occur if we were to set up a consumer and
// seek on a partition that hadn't been written to yet.
func sendConnectMessage(retryOptions localconfig.Retry, exitChan chan struct{}, producer sarama.SyncProducer, channel channel) error {
	logger.Infof("[channel: %s] About to post the CONNECT message...", channel.topic())

	payload := protoutil.MarshalOrPanic(newConnectMessage())
	message := newProducerMessage(channel, payload)

	retryMsg := "Attempting to post the CONNECT message..."
	postConnect := newRetryProcess(retryOptions, exitChan, channel, retryMsg, func() error {
		select {
		case <-exitChan:
			logger.Debugf("[channel: %s] Consenter for channel exiting, aborting retry", channel)
			return nil
		default:
			_, _, err := producer.SendMessage(message)
			return err
		}
	})

	return postConnect.retry()
}

func sendTimeToCut(producer sarama.SyncProducer, channel channel, timeToCutBlockNumber uint64, timer *<-chan time.Time) error {
	logger.Debugf("[channel: %s] Time-to-cut block [%d] timer expired", channel.topic(), timeToCutBlockNumber)
	*timer = nil
	payload := protoutil.MarshalOrPanic(newTimeToCutMessage(timeToCutBlockNumber))
	message := newProducerMessage(channel, payload)
	_, _, err := producer.SendMessage(message)
	return err
}

// Sets up the partition consumer for a channel using the given retry options.
func setupChannelConsumerForChannel(retryOptions localconfig.Retry, haltChan chan struct{}, parentConsumer sarama.Consumer, channel channel, startFrom int64) (sarama.PartitionConsumer, error) {
	var err error
	var channelConsumer sarama.PartitionConsumer

	logger.Infof("[channel: %s] Setting up the channel consumer for this channel (start offset: %d)...", channel.topic(), startFrom)

	retryMsg := "Connecting to the Kafka cluster"
	setupChannelConsumer := newRetryProcess(retryOptions, haltChan, channel, retryMsg, func() error {
		channelConsumer, err = parentConsumer.ConsumePartition(channel.topic(), channel.partition(), startFrom)
		return err
	})

	return channelConsumer, setupChannelConsumer.retry()
}
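// Illustrative sketch (not part of the original file) of the shape shared by
// the setup helpers above and below: capture the result in a closure
// variable, let newRetryProcess (defined elsewhere in this package) drive the
// closure until it succeeds, the retry budget is exhausted, or haltChan
// closes, and then return the captured value alongside the retry outcome:
//
//	var conn sarama.Consumer
//	attempt := func() error {
//		var err error
//		conn, err = sarama.NewConsumer(brokers, brokerConfig)
//		return err
//	}
//	return conn, newRetryProcess(retryOptions, haltChan, channel, retryMsg, attempt).retry()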
// Sets up the parent consumer for a channel using the given retry options.
func setupParentConsumerForChannel(retryOptions localconfig.Retry, haltChan chan struct{}, brokers []string, brokerConfig *sarama.Config, channel channel) (sarama.Consumer, error) {
	var err error
	var parentConsumer sarama.Consumer

	logger.Infof("[channel: %s] Setting up the parent consumer for this channel...", channel.topic())

	retryMsg := "Connecting to the Kafka cluster"
	setupParentConsumer := newRetryProcess(retryOptions, haltChan, channel, retryMsg, func() error {
		parentConsumer, err = sarama.NewConsumer(brokers, brokerConfig)
		return err
	})

	return parentConsumer, setupParentConsumer.retry()
}

// Sets up the writer/producer for a channel using the given retry options.
func setupProducerForChannel(retryOptions localconfig.Retry, haltChan chan struct{}, brokers []string, brokerConfig *sarama.Config, channel channel) (sarama.SyncProducer, error) {
	var err error
	var producer sarama.SyncProducer

	logger.Infof("[channel: %s] Setting up the producer for this channel...", channel.topic())

	retryMsg := "Connecting to the Kafka cluster"
	setupProducer := newRetryProcess(retryOptions, haltChan, channel, retryMsg, func() error {
		producer, err = sarama.NewSyncProducer(brokers, brokerConfig)
		return err
	})

	return producer, setupProducer.retry()
}

// Creates the Kafka topic for the channel if it does not already exist
func setupTopicForChannel(retryOptions localconfig.Retry, haltChan chan struct{}, brokers []string, brokerConfig *sarama.Config, topicDetail *sarama.TopicDetail, channel channel) error {
	// requires Kafka v0.10.1.0 or higher
	if !brokerConfig.Version.IsAtLeast(sarama.V0_10_1_0) {
		return nil
	}

	logger.Infof("[channel: %s] Setting up the topic for this channel...",
		channel.topic())

	retryMsg := fmt.Sprintf("Creating Kafka topic [%s] for channel [%s]",
		channel.topic(), channel.String())

	setupTopic := newRetryProcess(
		retryOptions,
		haltChan,
		channel,
		retryMsg,
		func() error {
			var err error
			clusterMembers := map[int32]*sarama.Broker{}
			var controllerId int32

			// loop through brokers to access metadata
			for _, address := range brokers {
				broker := sarama.NewBroker(address)
				err = broker.Open(brokerConfig)

				if err != nil {
					continue
				}

				var ok bool
				ok, err = broker.Connected()
				if !ok {
					continue
				}
				defer broker.Close()

				// metadata request which includes the topic
				var apiVersion int16
				if brokerConfig.Version.IsAtLeast(sarama.V0_11_0_0) {
					// use API version 4 to disable auto topic creation for
					// metadata requests
					apiVersion = 4
				} else {
					apiVersion = 1
				}
				metadata, err := broker.GetMetadata(&sarama.MetadataRequest{
					Version:                apiVersion,
					Topics:                 []string{channel.topic()},
					AllowAutoTopicCreation: false})

				if err != nil {
					continue
				}

				controllerId = metadata.ControllerID
				for _, broker := range metadata.Brokers {
					clusterMembers[broker.ID()] = broker
				}

				for _, topic := range metadata.Topics {
					if topic.Name == channel.topic() {
						if topic.Err != sarama.ErrUnknownTopicOrPartition {
							// auto create topics must be enabled so return
							return nil
						}
					}
				}
				break
			}

			// check to see if we got any metadata from any of the brokers in the list
			if len(clusterMembers) == 0 {
				return fmt.Errorf(
					"error creating topic [%s]; failed to retrieve metadata for the cluster",
					channel.topic())
			}

			// get the controller
			controller := clusterMembers[controllerId]
			err = controller.Open(brokerConfig)

			if err != nil {
				return err
			}

			var ok bool
			ok, err = controller.Connected()
			if !ok {
				return err
			}
			defer controller.Close()

			// create the topic
			req := &sarama.CreateTopicsRequest{
				Version: 0,
				TopicDetails: map[string]*sarama.TopicDetail{
					channel.topic(): topicDetail},
				Timeout: 3 * time.Second}
			resp := &sarama.CreateTopicsResponse{}
			resp, err = controller.CreateTopics(req)
			if err != nil {
				return err
			}

			// check the response
			if topicErr, ok := resp.TopicErrors[channel.topic()]; ok {
				// treat no error and topic exists error as success
				if topicErr.Err == sarama.ErrNoError ||
					topicErr.Err == sarama.ErrTopicAlreadyExists {
					return nil
				}
				if topicErr.Err == sarama.ErrInvalidTopic {
					// topic is invalid so abort
					logger.Warningf("[channel: %s] Failed to set up topic = %s",
						channel.topic(), topicErr.Err.Error())
					go func() {
						haltChan <- struct{}{}
					}()
				}
				return fmt.Errorf("error creating topic: [%s]",
					topicErr.Err.Error())
			}

			return nil
		})

	return setupTopic.retry()
}

// Replica ID information can accurately be retrieved only when the cluster
// is healthy. Otherwise, the replica request does not return the full set
// of initial replicas. This information is needed to provide context when
// a health check returns an error.
func getHealthyClusterReplicaInfo(retryOptions localconfig.Retry, haltChan chan struct{}, brokers []string, brokerConfig *sarama.Config, channel channel) ([]int32, error) {
	var replicaIDs []int32

	retryMsg := "Getting list of Kafka brokers replicating the channel"
	getReplicaInfo := newRetryProcess(retryOptions, haltChan, channel, retryMsg, func() error {
		client, err := sarama.NewClient(brokers, brokerConfig)
		if err != nil {
			return err
		}
		defer client.Close()

		replicaIDs, err = client.Replicas(channel.topic(), channel.partition())
		if err != nil {
			return err
		}
		return nil
	})

	return replicaIDs, getReplicaInfo.retry()
}
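// exampleChainLifecycle is an illustrative sketch (not part of the original
// file) of how the consensus framework drives a chain built by newChain:
// Start boots the Kafka plumbing asynchronously, WaitReady gates Broadcast
// until startup and any message reprocessing finish, Order enqueues
// envelopes, and Halt tears everything down exactly once.
func exampleChainLifecycle(chain *chainImpl, env *cb.Envelope, configSeq uint64) error {
	chain.Start()
	defer chain.Halt()
	// Note that WaitReady does not block while the Start phase is still
	// running; it returns a retryable error instead, so real callers retry.
	if err := chain.WaitReady(); err != nil {
		return err // e.g. still booting ("try again later"), or already halted
	}
	return chain.Order(env, configSeq)
}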