github.com/kaituanwang/hyperledger@v2.0.1+incompatible/orderer/consensus/kafka/chain.go

/*
Copyright IBM Corp. All Rights Reserved.

SPDX-License-Identifier: Apache-2.0
*/

package kafka

import (
	"context"
	"fmt"
	"strconv"
	"sync"
	"time"

	"github.com/Shopify/sarama"
	"github.com/golang/protobuf/proto"
	cb "github.com/hyperledger/fabric-protos-go/common"
	ab "github.com/hyperledger/fabric-protos-go/orderer"
	"github.com/hyperledger/fabric/orderer/common/localconfig"
	"github.com/hyperledger/fabric/orderer/common/msgprocessor"
	"github.com/hyperledger/fabric/orderer/consensus"
	"github.com/hyperledger/fabric/protoutil"
	"github.com/pkg/errors"
)

// Used for capturing metrics -- see processMessagesToBlocks
const (
	indexRecvError = iota
	indexUnmarshalError
	indexRecvPass
	indexProcessConnectPass
	indexProcessTimeToCutError
	indexProcessTimeToCutPass
	indexProcessRegularError
	indexProcessRegularPass
	indexSendTimeToCutError
	indexSendTimeToCutPass
	indexExitChanPass
)

func newChain(
	consenter commonConsenter,
	support consensus.ConsenterSupport,
	lastOffsetPersisted int64,
	lastOriginalOffsetProcessed int64,
	lastResubmittedConfigOffset int64,
) (*chainImpl, error) {
	lastCutBlockNumber := getLastCutBlockNumber(support.Height())
	logger.Infof("[channel: %s] Starting chain with last persisted offset %d and last recorded block [%d]",
		support.ChannelID(), lastOffsetPersisted, lastCutBlockNumber)

	doneReprocessingMsgInFlight := make(chan struct{})
	// In any one of the following cases, we should unblock ingress messages:
	// - lastResubmittedConfigOffset == 0, where we've never resubmitted any config messages
	// - lastResubmittedConfigOffset == lastOriginalOffsetProcessed, where the latest config message we resubmitted
	//   has been processed already
	// - lastResubmittedConfigOffset < lastOriginalOffsetProcessed, where we've processed one or more resubmitted
	//   normal messages after the latest resubmitted config message. (we advance `lastResubmittedConfigOffset` for
	//   config messages, but not normal messages)
	if lastResubmittedConfigOffset == 0 || lastResubmittedConfigOffset <= lastOriginalOffsetProcessed {
		// If we've already caught up with the reprocessing of resubmitted messages, close the channel to unblock broadcast
		close(doneReprocessingMsgInFlight)
	}

	consenter.Metrics().LastOffsetPersisted.With("channel", support.ChannelID()).Set(float64(lastOffsetPersisted))

	return &chainImpl{
		consenter:                   consenter,
		ConsenterSupport:            support,
		channel:                     newChannel(support.ChannelID(), defaultPartition),
		lastOffsetPersisted:         lastOffsetPersisted,
		lastOriginalOffsetProcessed: lastOriginalOffsetProcessed,
		lastResubmittedConfigOffset: lastResubmittedConfigOffset,
		lastCutBlockNumber:          lastCutBlockNumber,

		haltChan:                    make(chan struct{}),
		startChan:                   make(chan struct{}),
		doneReprocessingMsgInFlight: doneReprocessingMsgInFlight,
	}, nil
}
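// shouldUnblockIngress is an illustrative sketch (not part of the original
// file) of the predicate newChain applies above: ingress is unblocked when no
// config message was ever resubmitted (offset 0), or when the latest
// resubmitted config message has already been reprocessed.
func shouldUnblockIngress(lastResubmittedConfigOffset, lastOriginalOffsetProcessed int64) bool {
	// With the default offsets returned by getOffsets (both zero on a fresh
	// channel) the first clause is subsumed by the second, but newChain
	// spells both cases out for clarity.
	return lastResubmittedConfigOffset == 0 ||
		lastResubmittedConfigOffset <= lastOriginalOffsetProcessed
}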
//go:generate counterfeiter -o mock/sync_producer.go --fake-name SyncProducer . syncProducer

type syncProducer interface {
	SendMessage(msg *sarama.ProducerMessage) (partition int32, offset int64, err error)
	SendMessages(msgs []*sarama.ProducerMessage) error
	Close() error
}

type chainImpl struct {
	consenter commonConsenter
	consensus.ConsenterSupport

	channel                     channel
	lastOffsetPersisted         int64
	lastOriginalOffsetProcessed int64
	lastResubmittedConfigOffset int64
	lastCutBlockNumber          uint64

	producer        syncProducer
	parentConsumer  sarama.Consumer
	channelConsumer sarama.PartitionConsumer

	// mutex used when changing the doneReprocessingMsgInFlight
	doneReprocessingMutex sync.Mutex
	// notification that there are in-flight messages we need to wait for
	doneReprocessingMsgInFlight chan struct{}

	// When the partition consumer errors, close the channel. Otherwise, make
	// this an open, unbuffered channel.
	errorChan chan struct{}
	// When a Halt() request comes, close the channel. Unlike errorChan, this
	// channel never re-opens when closed. Its closing triggers the exit of the
	// processMessagesToBlocks loop.
	haltChan chan struct{}
	// notification that the chain has stopped processing messages into blocks
	doneProcessingMessagesToBlocks chan struct{}
	// Closed when the retriable steps in Start have completed.
	startChan chan struct{}
	// timer controls the batch timeout of cutting pending messages into a block
	timer <-chan time.Time

	replicaIDs []int32
}

// Errored returns a channel which will close when a partition consumer error
// has occurred. Checked by Deliver().
func (chain *chainImpl) Errored() <-chan struct{} {
	select {
	case <-chain.startChan:
		return chain.errorChan
	default:
		// While the consenter is starting, always return an error
		dummyError := make(chan struct{})
		close(dummyError)
		return dummyError
	}
}

// Start allocates the necessary resources for staying up to date with this
// Chain. Implements the consensus.Chain interface. Called by
// consensus.NewManagerImpl() which is invoked when the ordering process is
// launched, before the call to NewServer(). Launches a goroutine so as not to
// block the consensus.Manager.
func (chain *chainImpl) Start() {
	go startThread(chain)
}
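// closedChan is an illustrative sketch (not part of the original file) of
// the idiom Errored relies on above: a receive from a closed channel
// completes immediately, so handing callers an already-closed channel
// reports "currently errored" without tracking any extra state.
func closedChan() <-chan struct{} {
	ch := make(chan struct{})
	close(ch) // every subsequent receive returns the zero value at once
	return ch
}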
// Halt frees the resources which were allocated for this Chain. Implements the
// consensus.Chain interface.
func (chain *chainImpl) Halt() {
	select {
	case <-chain.startChan:
		// chain finished starting, so we can halt it
		select {
		case <-chain.haltChan:
			// This construct is useful because it allows Halt() to be called
			// multiple times (by a single thread) w/o panicking. Recall that a
			// receive from a closed channel returns (the zero value) immediately.
			logger.Warningf("[channel: %s] Halting of chain requested again", chain.ChannelID())
		default:
			logger.Criticalf("[channel: %s] Halting of chain requested", chain.ChannelID())
			// start shutdown of chain
			close(chain.haltChan)
			// wait for processing of messages to blocks to finish shutting down
			<-chain.doneProcessingMessagesToBlocks
			// close the kafka producer and the consumer
			chain.closeKafkaObjects()
			logger.Debugf("[channel: %s] Closed the haltChan", chain.ChannelID())
		}
	default:
		logger.Warningf("[channel: %s] Waiting for chain to finish starting before halting", chain.ChannelID())
		<-chain.startChan
		chain.Halt()
	}
}

func (chain *chainImpl) WaitReady() error {
	select {
	case <-chain.startChan: // The Start phase has completed
		select {
		case <-chain.haltChan: // The chain has been halted, stop here
			return fmt.Errorf("consenter for this channel has been halted")
		case <-chain.doneReprocessing(): // Block waiting for all re-submitted messages to be reprocessed
			return nil
		}
	default: // Not ready yet
		return fmt.Errorf("backing Kafka cluster has not completed booting; try again later")
	}
}

func (chain *chainImpl) doneReprocessing() <-chan struct{} {
	chain.doneReprocessingMutex.Lock()
	defer chain.doneReprocessingMutex.Unlock()
	return chain.doneReprocessingMsgInFlight
}

func (chain *chainImpl) reprocessConfigComplete() {
	chain.doneReprocessingMutex.Lock()
	defer chain.doneReprocessingMutex.Unlock()
	close(chain.doneReprocessingMsgInFlight)
}

func (chain *chainImpl) reprocessConfigPending() {
	chain.doneReprocessingMutex.Lock()
	defer chain.doneReprocessingMutex.Unlock()
	chain.doneReprocessingMsgInFlight = make(chan struct{})
}

// Implements the consensus.Chain interface. Called by Broadcast().
func (chain *chainImpl) Order(env *cb.Envelope, configSeq uint64) error {
	return chain.order(env, configSeq, int64(0))
}

func (chain *chainImpl) order(env *cb.Envelope, configSeq uint64, originalOffset int64) error {
	marshaledEnv, err := protoutil.Marshal(env)
	if err != nil {
		return errors.Errorf("cannot enqueue, unable to marshal envelope: %s", err)
	}
	if !chain.enqueue(newNormalMessage(marshaledEnv, configSeq, originalOffset)) {
		return errors.Errorf("cannot enqueue")
	}
	return nil
}

// Implements the consensus.Chain interface. Called by Broadcast().
func (chain *chainImpl) Configure(config *cb.Envelope, configSeq uint64) error {
	return chain.configure(config, configSeq, int64(0))
}

func (chain *chainImpl) configure(config *cb.Envelope, configSeq uint64, originalOffset int64) error {
	marshaledConfig, err := protoutil.Marshal(config)
	if err != nil {
		return fmt.Errorf("cannot enqueue, unable to marshal config because %s", err)
	}
	if !chain.enqueue(newConfigMessage(marshaledConfig, configSeq, originalOffset)) {
		return fmt.Errorf("cannot enqueue")
	}
	return nil
}
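// reprocessGate is an illustrative sketch (not part of the original file) of
// the resettable-gate pattern implemented by doneReprocessing,
// reprocessConfigComplete and reprocessConfigPending above: closing the
// channel opens the gate for all current and future waiters, and swapping in
// a fresh channel shuts it again. As in the original, open must not be
// called twice without an intervening shut, since closing a closed channel
// panics.
type reprocessGate struct {
	mu sync.Mutex
	ch chan struct{}
}

func (g *reprocessGate) wait() <-chan struct{} {
	g.mu.Lock()
	defer g.mu.Unlock()
	return g.ch
}

func (g *reprocessGate) open() {
	g.mu.Lock()
	defer g.mu.Unlock()
	close(g.ch) // unblocks everyone selecting on wait()
}

func (g *reprocessGate) shut() {
	g.mu.Lock()
	defer g.mu.Unlock()
	g.ch = make(chan struct{}) // future wait() calls block again
}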
// enqueue accepts a message and returns true on acceptance, or false otherwise.
func (chain *chainImpl) enqueue(kafkaMsg *ab.KafkaMessage) bool {
	logger.Debugf("[channel: %s] Enqueueing envelope...", chain.ChannelID())
	select {
	case <-chain.startChan: // The Start phase has completed
		select {
		case <-chain.haltChan: // The chain has been halted, stop here
			logger.Warningf("[channel: %s] consenter for this channel has been halted", chain.ChannelID())
			return false
		default: // The post path
			payload, err := protoutil.Marshal(kafkaMsg)
			if err != nil {
				logger.Errorf("[channel: %s] unable to marshal Kafka message because = %s", chain.ChannelID(), err)
				return false
			}
			message := newProducerMessage(chain.channel, payload)
			if _, _, err = chain.producer.SendMessage(message); err != nil {
				logger.Errorf("[channel: %s] cannot enqueue envelope because = %s", chain.ChannelID(), err)
				return false
			}
			logger.Debugf("[channel: %s] Envelope enqueued successfully", chain.ChannelID())
			return true
		}
	default: // Not ready yet
		logger.Warningf("[channel: %s] Will not enqueue, consenter for this channel hasn't started yet", chain.ChannelID())
		return false
	}
}

func (chain *chainImpl) HealthCheck(ctx context.Context) error {
	var err error

	payload := protoutil.MarshalOrPanic(newConnectMessage())
	message := newProducerMessage(chain.channel, payload)

	_, _, err = chain.producer.SendMessage(message)
	if err != nil {
		logger.Warnf("[channel: %s] Cannot post CONNECT message = %s", chain.channel.topic(), err)
		if err == sarama.ErrNotEnoughReplicas {
			errMsg := fmt.Sprintf("[replica ids: %d]", chain.replicaIDs)
			return errors.WithMessage(err, errMsg)
		}
	}
	return nil
}
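// healthCheckErrIsReplicaRelated is an illustrative sketch (not part of the
// original file) showing how a caller might recognize the replica-annotated
// error HealthCheck returns above: errors.WithMessage keeps the original
// error as the cause, so errors.Cause recovers the underlying sarama error
// even after the replica-ID context has been prepended.
func healthCheckErrIsReplicaRelated(err error) bool {
	return errors.Cause(err) == sarama.ErrNotEnoughReplicas
}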
// Called by Start().
func startThread(chain *chainImpl) {
	var err error

	// Create the topic if it does not exist (requires Kafka v0.10.1.0)
	err = setupTopicForChannel(chain.consenter.retryOptions(), chain.haltChan, chain.SharedConfig().KafkaBrokers(), chain.consenter.brokerConfig(), chain.consenter.topicDetail(), chain.channel)
	if err != nil {
		// log for now and fall back to the broker's auto-create-topics setting
		logger.Infof("[channel: %s]: failed to create Kafka topic = %s", chain.channel.topic(), err)
	}

	// Set up the producer
	chain.producer, err = setupProducerForChannel(chain.consenter.retryOptions(), chain.haltChan, chain.SharedConfig().KafkaBrokers(), chain.consenter.brokerConfig(), chain.channel)
	if err != nil {
		logger.Panicf("[channel: %s] Cannot set up producer = %s", chain.channel.topic(), err)
	}
	logger.Infof("[channel: %s] Producer set up successfully", chain.ChannelID())

	// Have the producer post the CONNECT message
	if err = sendConnectMessage(chain.consenter.retryOptions(), chain.haltChan, chain.producer, chain.channel); err != nil {
		logger.Panicf("[channel: %s] Cannot post CONNECT message = %s", chain.channel.topic(), err)
	}
	logger.Infof("[channel: %s] CONNECT message posted successfully", chain.channel.topic())

	// Set up the parent consumer
	chain.parentConsumer, err = setupParentConsumerForChannel(chain.consenter.retryOptions(), chain.haltChan, chain.SharedConfig().KafkaBrokers(), chain.consenter.brokerConfig(), chain.channel)
	if err != nil {
		logger.Panicf("[channel: %s] Cannot set up parent consumer = %s", chain.channel.topic(), err)
	}
	logger.Infof("[channel: %s] Parent consumer set up successfully", chain.channel.topic())

	// Set up the channel consumer
	chain.channelConsumer, err = setupChannelConsumerForChannel(chain.consenter.retryOptions(), chain.haltChan, chain.parentConsumer, chain.channel, chain.lastOffsetPersisted+1)
	if err != nil {
		logger.Panicf("[channel: %s] Cannot set up channel consumer = %s", chain.channel.topic(), err)
	}
	logger.Infof("[channel: %s] Channel consumer set up successfully", chain.channel.topic())

	chain.replicaIDs, err = getHealthyClusterReplicaInfo(chain.consenter.retryOptions(), chain.haltChan, chain.SharedConfig().KafkaBrokers(), chain.consenter.brokerConfig(), chain.channel)
	if err != nil {
		logger.Panicf("[channel: %s] failed to get replica IDs = %s", chain.channel.topic(), err)
	}

	chain.doneProcessingMessagesToBlocks = make(chan struct{})

	chain.errorChan = make(chan struct{}) // Deliver requests will also go through
	close(chain.startChan)                // Broadcast requests will now go through

	logger.Infof("[channel: %s] Start phase completed successfully", chain.channel.topic())

	chain.processMessagesToBlocks() // Keep up to date with the channel
}
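// nextConsumedOffset is an illustrative sketch (not part of the original
// file) of the replay rule startThread applies above when it sets up the
// channel consumer: consumption resumes at the offset immediately after the
// last one whose envelope was persisted into the ledger, so no message is
// processed twice and none is skipped across restarts.
func nextConsumedOffset(lastOffsetPersisted int64) int64 {
	// On a fresh channel, getOffsets returns sarama.OffsetOldest-1, so this
	// yields sarama.OffsetOldest, i.e. consume from the beginning.
	return lastOffsetPersisted + 1
}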
// processMessagesToBlocks drains the Kafka consumer for the given channel, and
// takes care of converting the stream of ordered messages into blocks for the
// channel's ledger.
func (chain *chainImpl) processMessagesToBlocks() ([]uint64, error) {
	counts := make([]uint64, 11) // For metrics and tests
	msg := new(ab.KafkaMessage)

	defer func() {
		// notify that we are no longer processing messages to blocks
		close(chain.doneProcessingMessagesToBlocks)
	}()

	defer func() { // When Halt() is called
		select {
		case <-chain.errorChan: // If already closed, don't do anything
		default:
			close(chain.errorChan)
		}
	}()

	subscription := fmt.Sprintf("added subscription to %s/%d", chain.channel.topic(), chain.channel.partition())
	var topicPartitionSubscriptionResumed <-chan string
	var deliverSessionTimer *time.Timer
	var deliverSessionTimedOut <-chan time.Time

	for {
		select {
		case <-chain.haltChan:
			logger.Warningf("[channel: %s] Consenter for channel exiting", chain.ChannelID())
			counts[indexExitChanPass]++
			return counts, nil
		case kafkaErr := <-chain.channelConsumer.Errors():
			logger.Errorf("[channel: %s] Error during consumption: %s", chain.ChannelID(), kafkaErr)
			counts[indexRecvError]++
			select {
			case <-chain.errorChan: // If already closed, don't do anything
			default:
				switch kafkaErr.Err {
				case sarama.ErrOffsetOutOfRange:
					// the Kafka consumer will auto-retry for all errors except for ErrOffsetOutOfRange
					logger.Errorf("[channel: %s] Unrecoverable error during consumption: %s", chain.ChannelID(), kafkaErr)
					close(chain.errorChan)
				default:
					if topicPartitionSubscriptionResumed == nil {
						// register listener
						topicPartitionSubscriptionResumed = saramaLogger.NewListener(subscription)
						// start session timeout timer
						deliverSessionTimer = time.NewTimer(chain.consenter.retryOptions().NetworkTimeouts.ReadTimeout)
						deliverSessionTimedOut = deliverSessionTimer.C
					}
				}
			}
			select {
			case <-chain.errorChan: // we are not ignoring the error
				logger.Warningf("[channel: %s] Closed the errorChan", chain.ChannelID())
				// This covers the edge case where (1) a consumption error has
				// closed the errorChan and thus rendered the chain unavailable to
				// deliver clients, (2) we're already at the newest offset, and (3)
				// there are no new Broadcast requests coming in. In this case,
				// there is no trigger that can recreate the errorChan again and
				// mark the chain as available, so we have to force that trigger via
				// the emission of a CONNECT message. TODO Consider rate limiting
				go sendConnectMessage(chain.consenter.retryOptions(), chain.haltChan, chain.producer, chain.channel)
			default: // we are ignoring the error
				logger.Warningf("[channel: %s] Deliver sessions will be dropped if consumption errors continue.", chain.ChannelID())
			}
		case <-topicPartitionSubscriptionResumed:
			// stop listening for subscription message
			saramaLogger.RemoveListener(subscription, topicPartitionSubscriptionResumed)
			// disable subscription event chan
			topicPartitionSubscriptionResumed = nil

			// stop timeout timer
			if !deliverSessionTimer.Stop() {
				<-deliverSessionTimer.C
			}
			logger.Warningf("[channel: %s] Consumption will resume.", chain.ChannelID())

		case <-deliverSessionTimedOut:
			// stop listening for subscription message
			saramaLogger.RemoveListener(subscription, topicPartitionSubscriptionResumed)
			// disable subscription event chan
			topicPartitionSubscriptionResumed = nil

			close(chain.errorChan)
			logger.Warningf("[channel: %s] Closed the errorChan", chain.ChannelID())

			// make chain available again via CONNECT message trigger
			go sendConnectMessage(chain.consenter.retryOptions(), chain.haltChan, chain.producer, chain.channel)

		case in, ok := <-chain.channelConsumer.Messages():
			if !ok {
				logger.Criticalf("[channel: %s] Kafka consumer closed.", chain.ChannelID())
				return counts, nil
			}

			// catch the possibility that we missed a topic subscription event before
			// we registered the event listener
			if topicPartitionSubscriptionResumed != nil {
				// stop listening for subscription message
				saramaLogger.RemoveListener(subscription, topicPartitionSubscriptionResumed)
				// disable subscription event chan
				topicPartitionSubscriptionResumed = nil
				// stop timeout timer
				if !deliverSessionTimer.Stop() {
					<-deliverSessionTimer.C
				}
			}

			select {
			case <-chain.errorChan: // If this channel was closed...
				chain.errorChan = make(chan struct{}) // ...make a new one.
				logger.Infof("[channel: %s] Marked consenter as available again", chain.ChannelID())
			default:
			}
			if err := proto.Unmarshal(in.Value, msg); err != nil {
				// This shouldn't happen, it should be filtered at ingress
				logger.Criticalf("[channel: %s] Unable to unmarshal consumed message = %s", chain.ChannelID(), err)
				counts[indexUnmarshalError]++
				continue
			} else {
				logger.Debugf("[channel: %s] Successfully unmarshalled consumed message, offset is %d. Inspecting type...", chain.ChannelID(), in.Offset)
				counts[indexRecvPass]++
			}
			switch msg.Type.(type) {
			case *ab.KafkaMessage_Connect:
				_ = chain.processConnect(chain.ChannelID())
				counts[indexProcessConnectPass]++
			case *ab.KafkaMessage_TimeToCut:
				if err := chain.processTimeToCut(msg.GetTimeToCut(), in.Offset); err != nil {
					logger.Warningf("[channel: %s] %s", chain.ChannelID(), err)
					logger.Criticalf("[channel: %s] Consenter for channel exiting", chain.ChannelID())
					counts[indexProcessTimeToCutError]++
					return counts, err // TODO Revisit whether we should indeed stop processing the chain at this point
				}
				counts[indexProcessTimeToCutPass]++
			case *ab.KafkaMessage_Regular:
				if err := chain.processRegular(msg.GetRegular(), in.Offset); err != nil {
					logger.Warningf("[channel: %s] Error when processing incoming message of type REGULAR = %s", chain.ChannelID(), err)
					counts[indexProcessRegularError]++
				} else {
					counts[indexProcessRegularPass]++
				}
			}
		case <-chain.timer:
			if err := sendTimeToCut(chain.producer, chain.channel, chain.lastCutBlockNumber+1, &chain.timer); err != nil {
				logger.Errorf("[channel: %s] cannot post time-to-cut message = %s", chain.ChannelID(), err)
				// Do not return though
				counts[indexSendTimeToCutError]++
			} else {
				counts[indexSendTimeToCutPass]++
			}
		}
	}
}

func (chain *chainImpl) closeKafkaObjects() []error {
	var errs []error

	err := chain.channelConsumer.Close()
	if err != nil {
		logger.Errorf("[channel: %s] could not close channelConsumer cleanly = %s", chain.ChannelID(), err)
		errs = append(errs, err)
	} else {
		logger.Debugf("[channel: %s] Closed the channel consumer", chain.ChannelID())
	}

	err = chain.parentConsumer.Close()
	if err != nil {
		logger.Errorf("[channel: %s] could not close parentConsumer cleanly = %s", chain.ChannelID(), err)
		errs = append(errs, err)
	} else {
		logger.Debugf("[channel: %s] Closed the parent consumer", chain.ChannelID())
	}

	err = chain.producer.Close()
	if err != nil {
		logger.Errorf("[channel: %s] could not close producer cleanly = %s", chain.ChannelID(), err)
		errs = append(errs, err)
	} else {
		logger.Debugf("[channel: %s] Closed the producer", chain.ChannelID())
	}

	return errs
}

// Helper functions

func getLastCutBlockNumber(blockchainHeight uint64) uint64 {
	return blockchainHeight - 1
}

func getOffsets(metadataValue []byte, chainID string) (persisted int64, processed int64, resubmitted int64) {
	if metadataValue != nil {
		// Extract orderer-related metadata from the tip of the ledger first
		kafkaMetadata := &ab.KafkaMetadata{}
		if err := proto.Unmarshal(metadataValue, kafkaMetadata); err != nil {
			logger.Panicf("[channel: %s] Ledger may be corrupted: "+
				"cannot unmarshal orderer metadata in most recent block", chainID)
		}
		return kafkaMetadata.LastOffsetPersisted,
			kafkaMetadata.LastOriginalOffsetProcessed,
			kafkaMetadata.LastResubmittedConfigOffset
	}
	return sarama.OffsetOldest - 1, int64(0), int64(0) // default
}

func newConnectMessage() *ab.KafkaMessage {
	return &ab.KafkaMessage{
		Type: &ab.KafkaMessage_Connect{
			Connect: &ab.KafkaMessageConnect{
				Payload: nil,
			},
		},
	}
}
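// encodeKafkaMetadata is an illustrative sketch (not part of the original
// file): the inverse of getOffsets above. Marshalling a KafkaMetadata with
// these three offsets and feeding the bytes back through getOffsets yields
// the same triple, which is exactly the round trip that WriteBlock and
// newChain perform across an orderer restart.
func encodeKafkaMetadata(persisted, processed, resubmitted int64) []byte {
	return protoutil.MarshalOrPanic(&ab.KafkaMetadata{
		LastOffsetPersisted:         persisted,
		LastOriginalOffsetProcessed: processed,
		LastResubmittedConfigOffset: resubmitted,
	})
}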
func newNormalMessage(payload []byte, configSeq uint64, originalOffset int64) *ab.KafkaMessage {
	return &ab.KafkaMessage{
		Type: &ab.KafkaMessage_Regular{
			Regular: &ab.KafkaMessageRegular{
				Payload:        payload,
				ConfigSeq:      configSeq,
				Class:          ab.KafkaMessageRegular_NORMAL,
				OriginalOffset: originalOffset,
			},
		},
	}
}

func newConfigMessage(config []byte, configSeq uint64, originalOffset int64) *ab.KafkaMessage {
	return &ab.KafkaMessage{
		Type: &ab.KafkaMessage_Regular{
			Regular: &ab.KafkaMessageRegular{
				Payload:        config,
				ConfigSeq:      configSeq,
				Class:          ab.KafkaMessageRegular_CONFIG,
				OriginalOffset: originalOffset,
			},
		},
	}
}

func newTimeToCutMessage(blockNumber uint64) *ab.KafkaMessage {
	return &ab.KafkaMessage{
		Type: &ab.KafkaMessage_TimeToCut{
			TimeToCut: &ab.KafkaMessageTimeToCut{
				BlockNumber: blockNumber,
			},
		},
	}
}

func newProducerMessage(channel channel, pld []byte) *sarama.ProducerMessage {
	return &sarama.ProducerMessage{
		Topic: channel.topic(),
		Key:   sarama.StringEncoder(strconv.Itoa(int(channel.partition()))), // TODO Consider writing an IntEncoder?
		Value: sarama.ByteEncoder(pld),
	}
}

func (chain *chainImpl) processConnect(channelName string) error {
	logger.Debugf("[channel: %s] It's a connect message - ignoring", channelName)
	return nil
}
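// int32Encoder is an illustrative sketch (not part of the original file) of
// the IntEncoder that the TODO in newProducerMessage alludes to: sarama's
// Encoder interface only requires Encode and Length, so a fixed-width
// big-endian key would suffice. Note the wire format differs from the current
// decimal-string key, so switching encoders on a live topic would change the
// key bytes of subsequently produced messages.
type int32Encoder int32

var _ sarama.Encoder = int32Encoder(0) // compile-time interface check

func (e int32Encoder) Encode() ([]byte, error) {
	return []byte{byte(e >> 24), byte(e >> 16), byte(e >> 8), byte(e)}, nil
}

func (e int32Encoder) Length() int { return 4 }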
func (chain *chainImpl) processRegular(regularMessage *ab.KafkaMessageRegular, receivedOffset int64) error {
	// When committing a normal message, we also update `lastOriginalOffsetProcessed` with `newOffset`.
	// It is the caller's responsibility to deduce the correct value of `newOffset` based on the following rules:
	// - if Resubmission is switched off, it should always be zero
	// - if the message is committed on the first pass, meaning it's not re-validated and re-ordered, this value
	//   should be the same as the current `lastOriginalOffsetProcessed`
	// - if the message is re-validated and re-ordered, this value should be the `OriginalOffset` of that
	//   Kafka message, so that `lastOriginalOffsetProcessed` is advanced
	commitNormalMsg := func(message *cb.Envelope, newOffset int64) {
		batches, pending := chain.BlockCutter().Ordered(message)
		logger.Debugf("[channel: %s] Ordering results: items in batch = %d, pending = %v", chain.ChannelID(), len(batches), pending)

		switch {
		case chain.timer != nil && !pending:
			// Timer is already running but there are no messages pending, stop the timer
			chain.timer = nil
		case chain.timer == nil && pending:
			// Timer is not already running and there are messages pending, so start it
			chain.timer = time.After(chain.SharedConfig().BatchTimeout())
			logger.Debugf("[channel: %s] Just began %s batch timer", chain.ChannelID(), chain.SharedConfig().BatchTimeout().String())
		default:
			// Do nothing when:
			// 1. Timer is already running and there are messages pending
			// 2. Timer is not set and there are no messages pending
		}

		if len(batches) == 0 {
			// If no block is cut, we update `lastOriginalOffsetProcessed`, start the timer if necessary and return
			chain.lastOriginalOffsetProcessed = newOffset
			return
		}

		offset := receivedOffset
		if pending || len(batches) == 2 {
			// If the newest envelope is not encapsulated into the first batch,
			// the `LastOffsetPersisted` should be `receivedOffset` - 1.
			offset--
		} else {
			// We are just cutting exactly one block, so it is safe to update
			// `lastOriginalOffsetProcessed` with `newOffset` here, and then
			// encapsulate it into this block. Otherwise, if we are cutting two
			// blocks, the first one should use the current `lastOriginalOffsetProcessed`
			// and the second one should use `newOffset`, which is also used to
			// update `lastOriginalOffsetProcessed`
			chain.lastOriginalOffsetProcessed = newOffset
		}

		// Commit the first block
		block := chain.CreateNextBlock(batches[0])
		metadata := &ab.KafkaMetadata{
			LastOffsetPersisted:         offset,
			LastOriginalOffsetProcessed: chain.lastOriginalOffsetProcessed,
			LastResubmittedConfigOffset: chain.lastResubmittedConfigOffset,
		}
		chain.WriteBlock(block, metadata)
		chain.lastCutBlockNumber++
		logger.Debugf("[channel: %s] Batch filled, just cut block [%d] - last persisted offset is now %d", chain.ChannelID(), chain.lastCutBlockNumber, offset)

		// Commit the second block if it exists
		if len(batches) == 2 {
			chain.lastOriginalOffsetProcessed = newOffset
			offset++

			block := chain.CreateNextBlock(batches[1])
			metadata := &ab.KafkaMetadata{
				LastOffsetPersisted:         offset,
				LastOriginalOffsetProcessed: newOffset,
				LastResubmittedConfigOffset: chain.lastResubmittedConfigOffset,
			}
			chain.WriteBlock(block, metadata)
			chain.lastCutBlockNumber++
			logger.Debugf("[channel: %s] Batch filled, just cut block [%d] - last persisted offset is now %d", chain.ChannelID(), chain.lastCutBlockNumber, offset)
		}
	}

	// When committing a config message, we also update `lastOriginalOffsetProcessed` with `newOffset`.
	// It is the caller's responsibility to deduce the correct value of `newOffset` based on the following rules:
	// - if Resubmission is switched off, it should always be zero
	// - if the message is committed on the first pass, meaning it's not re-validated and re-ordered, this value
	//   should be the same as the current `lastOriginalOffsetProcessed`
	// - if the message is re-validated and re-ordered, this value should be the `OriginalOffset` of that
	//   Kafka message, so that `lastOriginalOffsetProcessed` is advanced
	commitConfigMsg := func(message *cb.Envelope, newOffset int64) {
		logger.Debugf("[channel: %s] Received config message", chain.ChannelID())
		batch := chain.BlockCutter().Cut()

		if batch != nil {
			logger.Debugf("[channel: %s] Cut pending messages into block", chain.ChannelID())
			block := chain.CreateNextBlock(batch)
			metadata := &ab.KafkaMetadata{
				LastOffsetPersisted:         receivedOffset - 1,
				LastOriginalOffsetProcessed: chain.lastOriginalOffsetProcessed,
				LastResubmittedConfigOffset: chain.lastResubmittedConfigOffset,
			}
			chain.WriteBlock(block, metadata)
			chain.lastCutBlockNumber++
		}

		logger.Debugf("[channel: %s] Creating isolated block for config message", chain.ChannelID())
		chain.lastOriginalOffsetProcessed = newOffset
		block := chain.CreateNextBlock([]*cb.Envelope{message})
		metadata := &ab.KafkaMetadata{
			LastOffsetPersisted:         receivedOffset,
			LastOriginalOffsetProcessed: chain.lastOriginalOffsetProcessed,
			LastResubmittedConfigOffset: chain.lastResubmittedConfigOffset,
		}
		chain.WriteConfigBlock(block, metadata)
		chain.lastCutBlockNumber++
		chain.timer = nil
	}

	seq := chain.Sequence()

	env := &cb.Envelope{}
	if err := proto.Unmarshal(regularMessage.Payload, env); err != nil {
		// This shouldn't happen, it should be filtered at ingress
		return fmt.Errorf("failed to unmarshal payload of regular message because = %s", err)
	}

	logger.Debugf("[channel: %s] Processing regular Kafka message of type %s", chain.ChannelID(), regularMessage.Class.String())

	// If we receive a message from a pre-v1.1 orderer, or resubmission is explicitly disabled, every orderer
	// should operate as the pre-v1.1 ones: validate again and not attempt to reorder. That is because the
	// pre-v1.1 orderers cannot identify re-ordered messages and resubmissions could lead to committing
	// the same message twice.
	//
	// The implicit assumption here is that the resubmission capability flag is set only when there are no more
	// pre-v1.1 orderers on the network. Otherwise it is unset, and this is what we call a compatibility mode.
	if regularMessage.Class == ab.KafkaMessageRegular_UNKNOWN || !chain.SharedConfig().Capabilities().Resubmission() {
		// Received a regular message of type UNKNOWN or resubmission is off, indicating an OSN network with v1.0.x orderers
		logger.Warningf("[channel: %s] This orderer is running in compatibility mode", chain.ChannelID())

		chdr, err := protoutil.ChannelHeader(env)
		if err != nil {
			return fmt.Errorf("discarding bad config message because of channel header unmarshalling error = %s", err)
		}

		class := chain.ClassifyMsg(chdr)
		switch class {
		case msgprocessor.ConfigMsg:
			if _, _, err := chain.ProcessConfigMsg(env); err != nil {
				return fmt.Errorf("discarding bad config message because = %s", err)
			}

			commitConfigMsg(env, chain.lastOriginalOffsetProcessed)

		case msgprocessor.NormalMsg:
			if _, err := chain.ProcessNormalMsg(env); err != nil {
				return fmt.Errorf("discarding bad normal message because = %s", err)
			}

			commitNormalMsg(env, chain.lastOriginalOffsetProcessed)

		case msgprocessor.ConfigUpdateMsg:
			return fmt.Errorf("not expecting message of type ConfigUpdate")

		default:
			logger.Panicf("[channel: %s] Unsupported message classification: %v", chain.ChannelID(), class)
		}

		return nil
	}

	switch regularMessage.Class {
	case ab.KafkaMessageRegular_UNKNOWN:
		logger.Panicf("[channel: %s] Kafka message of type UNKNOWN should have been processed already", chain.ChannelID())

	case ab.KafkaMessageRegular_NORMAL:
		// This is a message that is re-validated and re-ordered
		if regularMessage.OriginalOffset != 0 {
			logger.Debugf("[channel: %s] Received re-submitted normal message with original offset %d", chain.ChannelID(), regularMessage.OriginalOffset)

			// But we've reprocessed it already
			if regularMessage.OriginalOffset <= chain.lastOriginalOffsetProcessed {
				logger.Debugf(
					"[channel: %s] OriginalOffset(%d) <= LastOriginalOffsetProcessed(%d), message has been consumed already, discard",
					chain.ChannelID(), regularMessage.OriginalOffset, chain.lastOriginalOffsetProcessed)
				return nil
			}

			logger.Debugf(
				"[channel: %s] OriginalOffset(%d) > LastOriginalOffsetProcessed(%d), "+
					"this is the first time we receive this re-submitted normal message",
				chain.ChannelID(), regularMessage.OriginalOffset, chain.lastOriginalOffsetProcessed)

			// In case we haven't reprocessed the message, there's no need to differentiate it from those
			// messages that will be processed for the first time.
		}

		// The config sequence has advanced
		if regularMessage.ConfigSeq < seq {
			logger.Debugf("[channel: %s] Config sequence has advanced since this normal message got validated, re-validating", chain.ChannelID())
			configSeq, err := chain.ProcessNormalMsg(env)
			if err != nil {
				return fmt.Errorf("discarding bad normal message because = %s", err)
			}

			logger.Debugf("[channel: %s] Normal message is still valid, re-submit", chain.ChannelID())

			// For messages ordered for the first time as well as re-ordered ones, we set the original offset
			// to the current received offset and re-order the message.
			if err := chain.order(env, configSeq, receivedOffset); err != nil {
				return fmt.Errorf("error re-submitting normal message because = %s", err)
			}

			return nil
		}

		// Any messages coming in here may or may not have been re-validated
		// and re-ordered, BUT they are definitely valid here

		// advance lastOriginalOffsetProcessed if the message was re-validated and re-ordered
		offset := regularMessage.OriginalOffset
		if offset == 0 {
			offset = chain.lastOriginalOffsetProcessed
		}

		commitNormalMsg(env, offset)

	case ab.KafkaMessageRegular_CONFIG:
		// This is a message that is re-validated and re-ordered
		if regularMessage.OriginalOffset != 0 {
			logger.Debugf("[channel: %s] Received re-submitted config message with original offset %d", chain.ChannelID(), regularMessage.OriginalOffset)

			// But we've reprocessed it already
			if regularMessage.OriginalOffset <= chain.lastOriginalOffsetProcessed {
				logger.Debugf(
					"[channel: %s] OriginalOffset(%d) <= LastOriginalOffsetProcessed(%d), message has been consumed already, discard",
					chain.ChannelID(), regularMessage.OriginalOffset, chain.lastOriginalOffsetProcessed)
				return nil
			}

			logger.Debugf(
				"[channel: %s] OriginalOffset(%d) > LastOriginalOffsetProcessed(%d), "+
					"this is the first time we receive this re-submitted config message",
				chain.ChannelID(), regularMessage.OriginalOffset, chain.lastOriginalOffsetProcessed)

			if regularMessage.OriginalOffset == chain.lastResubmittedConfigOffset && // This is the very last resubmitted config message
				regularMessage.ConfigSeq == seq { // AND we don't need to resubmit it again
				logger.Debugf("[channel: %s] Config message with original offset %d is the last in-flight resubmitted message "+
					"and it does not require revalidation, unblock ingress messages now", chain.ChannelID(), regularMessage.OriginalOffset)
				chain.reprocessConfigComplete() // Therefore, we can finally unblock broadcast
			}

			// Somebody resubmitted a message at offset X, whereas we didn't. This is due to non-determinism where
			// that message was considered invalid by us during re-validation, however somebody else deemed it to
			// be valid, and resubmitted it. We need to advance lastResubmittedConfigOffset in this case in order
			// to enforce consistency across the network.
			if chain.lastResubmittedConfigOffset < regularMessage.OriginalOffset {
				chain.lastResubmittedConfigOffset = regularMessage.OriginalOffset
			}
		}

		// The config sequence has advanced
		if regularMessage.ConfigSeq < seq {
			logger.Debugf("[channel: %s] Config sequence has advanced since this config message got validated, re-validating", chain.ChannelID())
			configEnv, configSeq, err := chain.ProcessConfigMsg(env)
			if err != nil {
				return fmt.Errorf("rejecting config message because = %s", err)
			}

			// For messages ordered for the first time as well as re-ordered ones, we set the original offset
			// to the current received offset and re-order the message.
			if err := chain.configure(configEnv, configSeq, receivedOffset); err != nil {
				return fmt.Errorf("error re-submitting config message because = %s", err)
			}

			logger.Debugf("[channel: %s] Resubmitted config message with offset %d, block ingress messages", chain.ChannelID(), receivedOffset)
			chain.lastResubmittedConfigOffset = receivedOffset // Keep track of the last resubmitted message offset
			chain.reprocessConfigPending()                     // Begin blocking ingress messages

			return nil
		}

		// Any messages coming in here may or may not have been re-validated
		// and re-ordered, BUT they are definitely valid here

		// advance lastOriginalOffsetProcessed if the message was re-validated and re-ordered
		offset := regularMessage.OriginalOffset
		if offset == 0 {
			offset = chain.lastOriginalOffsetProcessed
		}

		commitConfigMsg(env, offset)

	default:
		return errors.Errorf("unsupported regular kafka message type: %v", regularMessage.Class.String())
	}

	return nil
}

func (chain *chainImpl) processTimeToCut(ttcMessage *ab.KafkaMessageTimeToCut, receivedOffset int64) error {
	ttcNumber := ttcMessage.GetBlockNumber()
	logger.Debugf("[channel: %s] It's a time-to-cut message for block [%d]", chain.ChannelID(), ttcNumber)
	if ttcNumber == chain.lastCutBlockNumber+1 {
		chain.timer = nil
		logger.Debugf("[channel: %s] Nil'd the timer", chain.ChannelID())
		batch := chain.BlockCutter().Cut()
		if len(batch) == 0 {
			return fmt.Errorf("got right time-to-cut message (for block [%d]),"+
				" no pending requests though; this might indicate a bug", chain.lastCutBlockNumber+1)
		}
		block := chain.CreateNextBlock(batch)
		metadata := &ab.KafkaMetadata{
			LastOffsetPersisted:         receivedOffset,
			LastOriginalOffsetProcessed: chain.lastOriginalOffsetProcessed,
		}
		chain.WriteBlock(block, metadata)
		chain.lastCutBlockNumber++
		logger.Debugf("[channel: %s] Proper time-to-cut received, just cut block [%d]", chain.ChannelID(), chain.lastCutBlockNumber)
		return nil
	} else if ttcNumber > chain.lastCutBlockNumber+1 {
		return fmt.Errorf("got larger time-to-cut message (%d) than allowed/expected (%d)"+
			" - this might indicate a bug", ttcNumber, chain.lastCutBlockNumber+1)
	}
	logger.Debugf("[channel: %s] Ignoring stale time-to-cut message for block [%d]", chain.ChannelID(), ttcNumber)
	return nil
}
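// classifyTimeToCut is an illustrative sketch (not part of the original file)
// of the three-way decision processTimeToCut makes above: exactly the next
// block number cuts a block, anything larger indicates a bug, and anything
// smaller is a stale duplicate (several orderers race to post the same
// time-to-cut message, and only the first one has an effect).
func classifyTimeToCut(ttcNumber, lastCutBlockNumber uint64) string {
	switch {
	case ttcNumber == lastCutBlockNumber+1:
		return "cut" // the expected time-to-cut: cut a block now
	case ttcNumber > lastCutBlockNumber+1:
		return "error" // larger than allowed/expected
	default:
		return "stale" // already cut; ignore
	}
}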
// WriteBlock acts as a wrapper around the consenter support WriteBlock, encoding the metadata,
// and updating the metrics.
func (chain *chainImpl) WriteBlock(block *cb.Block, metadata *ab.KafkaMetadata) {
	chain.ConsenterSupport.WriteBlock(block, protoutil.MarshalOrPanic(metadata))
	chain.consenter.Metrics().LastOffsetPersisted.With("channel", chain.ChannelID()).Set(float64(metadata.LastOffsetPersisted))
}

// WriteConfigBlock acts as a wrapper around the consenter support WriteConfigBlock, encoding the metadata,
// and updating the metrics.
func (chain *chainImpl) WriteConfigBlock(block *cb.Block, metadata *ab.KafkaMetadata) {
	chain.ConsenterSupport.WriteConfigBlock(block, protoutil.MarshalOrPanic(metadata))
	chain.consenter.Metrics().LastOffsetPersisted.With("channel", chain.ChannelID()).Set(float64(metadata.LastOffsetPersisted))
}

// Post a CONNECT message to the channel using the given retry options. This
// prevents the panicking that would occur if we were to set up a consumer and
// seek on a partition that hadn't been written to yet.
func sendConnectMessage(retryOptions localconfig.Retry, exitChan chan struct{}, producer sarama.SyncProducer, channel channel) error {
	logger.Infof("[channel: %s] About to post the CONNECT message...", channel.topic())

	payload := protoutil.MarshalOrPanic(newConnectMessage())
	message := newProducerMessage(channel, payload)

	retryMsg := "Attempting to post the CONNECT message..."
	postConnect := newRetryProcess(retryOptions, exitChan, channel, retryMsg, func() error {
		select {
		case <-exitChan:
			logger.Debugf("[channel: %s] Consenter for channel exiting, aborting retry", channel)
			return nil
		default:
			_, _, err := producer.SendMessage(message)
			return err
		}
	})

	return postConnect.retry()
}

func sendTimeToCut(producer sarama.SyncProducer, channel channel, timeToCutBlockNumber uint64, timer *<-chan time.Time) error {
	logger.Debugf("[channel: %s] Time-to-cut block [%d] timer expired", channel.topic(), timeToCutBlockNumber)
	*timer = nil
	payload := protoutil.MarshalOrPanic(newTimeToCutMessage(timeToCutBlockNumber))
	message := newProducerMessage(channel, payload)
	_, _, err := producer.SendMessage(message)
	return err
}

// Sets up the partition consumer for a channel using the given retry options.
func setupChannelConsumerForChannel(retryOptions localconfig.Retry, haltChan chan struct{}, parentConsumer sarama.Consumer, channel channel, startFrom int64) (sarama.PartitionConsumer, error) {
	var err error
	var channelConsumer sarama.PartitionConsumer

	logger.Infof("[channel: %s] Setting up the channel consumer for this channel (start offset: %d)...", channel.topic(), startFrom)

	retryMsg := "Connecting to the Kafka cluster"
	setupChannelConsumer := newRetryProcess(retryOptions, haltChan, channel, retryMsg, func() error {
		channelConsumer, err = parentConsumer.ConsumePartition(channel.topic(), channel.partition(), startFrom)
		return err
	})

	return channelConsumer, setupChannelConsumer.retry()
}
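// Illustrative sketch (not part of the original file) of the shape shared by
// the setup helpers above and below: capture the result in a closure
// variable, let newRetryProcess (defined elsewhere in this package) drive the
// closure until it succeeds, the retry budget is exhausted, or haltChan
// closes, and then return the captured value alongside the retry outcome:
//
//	var conn sarama.Consumer
//	attempt := func() error {
//		var err error
//		conn, err = sarama.NewConsumer(brokers, brokerConfig)
//		return err
//	}
//	return conn, newRetryProcess(retryOptions, haltChan, channel, retryMsg, attempt).retry()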
// Sets up the parent consumer for a channel using the given retry options.
func setupParentConsumerForChannel(retryOptions localconfig.Retry, haltChan chan struct{}, brokers []string, brokerConfig *sarama.Config, channel channel) (sarama.Consumer, error) {
	var err error
	var parentConsumer sarama.Consumer

	logger.Infof("[channel: %s] Setting up the parent consumer for this channel...", channel.topic())

	retryMsg := "Connecting to the Kafka cluster"
	setupParentConsumer := newRetryProcess(retryOptions, haltChan, channel, retryMsg, func() error {
		parentConsumer, err = sarama.NewConsumer(brokers, brokerConfig)
		return err
	})

	return parentConsumer, setupParentConsumer.retry()
}

// Sets up the writer/producer for a channel using the given retry options.
func setupProducerForChannel(retryOptions localconfig.Retry, haltChan chan struct{}, brokers []string, brokerConfig *sarama.Config, channel channel) (sarama.SyncProducer, error) {
	var err error
	var producer sarama.SyncProducer

	logger.Infof("[channel: %s] Setting up the producer for this channel...", channel.topic())

	retryMsg := "Connecting to the Kafka cluster"
	setupProducer := newRetryProcess(retryOptions, haltChan, channel, retryMsg, func() error {
		producer, err = sarama.NewSyncProducer(brokers, brokerConfig)
		return err
	})

	return producer, setupProducer.retry()
}

// Creates the Kafka topic for the channel if it does not already exist
func setupTopicForChannel(retryOptions localconfig.Retry, haltChan chan struct{}, brokers []string, brokerConfig *sarama.Config, topicDetail *sarama.TopicDetail, channel channel) error {
	// requires Kafka v0.10.1.0 or higher
	if !brokerConfig.Version.IsAtLeast(sarama.V0_10_1_0) {
		return nil
	}

	logger.Infof("[channel: %s] Setting up the topic for this channel...",
		channel.topic())

	retryMsg := fmt.Sprintf("Creating Kafka topic [%s] for channel [%s]",
		channel.topic(), channel.String())

	setupTopic := newRetryProcess(
		retryOptions,
		haltChan,
		channel,
		retryMsg,
		func() error {
			var err error
			clusterMembers := map[int32]*sarama.Broker{}
			var controllerId int32

			// loop through brokers to access metadata
			for _, address := range brokers {
				broker := sarama.NewBroker(address)
				err = broker.Open(brokerConfig)

				if err != nil {
					continue
				}

				var ok bool
				ok, err = broker.Connected()
				if !ok {
					continue
				}
				defer broker.Close()

				// metadata request which includes the topic
				var apiVersion int16
				if brokerConfig.Version.IsAtLeast(sarama.V0_11_0_0) {
					// use API version 4 to disable auto topic creation for
					// metadata requests
					apiVersion = 4
				} else {
					apiVersion = 1
				}
				metadata, err := broker.GetMetadata(&sarama.MetadataRequest{
					Version:                apiVersion,
					Topics:                 []string{channel.topic()},
					AllowAutoTopicCreation: false})

				if err != nil {
					continue
				}

				controllerId = metadata.ControllerID
				for _, broker := range metadata.Brokers {
					clusterMembers[broker.ID()] = broker
				}

				for _, topic := range metadata.Topics {
					if topic.Name == channel.topic() {
						if topic.Err != sarama.ErrUnknownTopicOrPartition {
							// auto create topics must be enabled so return
							return nil
						}
					}
				}
				break
			}

			// check to see if we got any metadata from any of the brokers in the list
			if len(clusterMembers) == 0 {
				return fmt.Errorf(
					"error creating topic [%s]; failed to retrieve metadata for the cluster",
					channel.topic())
			}

			// get the controller
			controller := clusterMembers[controllerId]
			err = controller.Open(brokerConfig)

			if err != nil {
				return err
			}

			var ok bool
			ok, err = controller.Connected()
			if !ok {
				return err
			}
			defer controller.Close()

			// create the topic
			req := &sarama.CreateTopicsRequest{
				Version: 0,
				TopicDetails: map[string]*sarama.TopicDetail{
					channel.topic(): topicDetail},
				Timeout: 3 * time.Second}
			resp := &sarama.CreateTopicsResponse{}
			resp, err = controller.CreateTopics(req)
			if err != nil {
				return err
			}

			// check the response
			if topicErr, ok := resp.TopicErrors[channel.topic()]; ok {
				// treat no error and topic exists error as success
				if topicErr.Err == sarama.ErrNoError ||
					topicErr.Err == sarama.ErrTopicAlreadyExists {
					return nil
				}
				if topicErr.Err == sarama.ErrInvalidTopic {
					// topic is invalid so abort
					logger.Warningf("[channel: %s] Failed to set up topic = %s",
						channel.topic(), topicErr.Err.Error())
					go func() {
						haltChan <- struct{}{}
					}()
				}
				return fmt.Errorf("error creating topic: [%s]",
					topicErr.Err.Error())
			}

			return nil
		})

	return setupTopic.retry()
}

// Replica ID information can accurately be retrieved only when the cluster
// is healthy. Otherwise, the replica request does not return the full set
// of initial replicas. This information is needed to provide context when
// a health check returns an error.
func getHealthyClusterReplicaInfo(retryOptions localconfig.Retry, haltChan chan struct{}, brokers []string, brokerConfig *sarama.Config, channel channel) ([]int32, error) {
	var replicaIDs []int32

	retryMsg := "Getting list of Kafka brokers replicating the channel"
	getReplicaInfo := newRetryProcess(retryOptions, haltChan, channel, retryMsg, func() error {
		client, err := sarama.NewClient(brokers, brokerConfig)
		if err != nil {
			return err
		}
		defer client.Close()

		replicaIDs, err = client.Replicas(channel.topic(), channel.partition())
		if err != nil {
			return err
		}
		return nil
	})

	return replicaIDs, getReplicaInfo.retry()
}
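// exampleChainLifecycle is an illustrative sketch (not part of the original
// file) of how the consensus framework drives a chain built by newChain:
// Start boots the Kafka plumbing asynchronously, WaitReady gates Broadcast
// until startup and any message reprocessing finish, Order enqueues
// envelopes, and Halt tears everything down exactly once.
func exampleChainLifecycle(chain *chainImpl, env *cb.Envelope, configSeq uint64) error {
	chain.Start()
	defer chain.Halt()
	// Note that WaitReady does not block while the Start phase is still
	// running; it returns a retryable error instead, so real callers retry.
	if err := chain.WaitReady(); err != nil {
		return err // e.g. still booting ("try again later"), or already halted
	}
	return chain.Order(env, configSeq)
}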