package dkg

import (
	"context"
	"fmt"
	"sync"
	"time"

	"github.com/onflow/crypto"
	"github.com/rs/zerolog"
	"github.com/sethvargo/go-retry"

	"github.com/onflow/flow-go/engine"
	"github.com/onflow/flow-go/model/fingerprint"
	"github.com/onflow/flow-go/model/flow"
	"github.com/onflow/flow-go/model/messages"
	"github.com/onflow/flow-go/module"
	"github.com/onflow/flow-go/module/retrymiddleware"
)

// BrokerOpt is a functional option which modifies the DKG Broker config.
type BrokerOpt func(*BrokerConfig)

// BrokerConfig is configuration for the DKG Broker component.
// All retryable requests (broadcast, submit result, poll) share the same
// backoff parameters (RetryInitialWait, RetryJitterPct), while the maximum
// attempt counts differ between publishing and reading.
type BrokerConfig struct {
	// PublishMaxRetries is the maximum number of times the broker will attempt
	// to broadcast a message or publish a result.
	PublishMaxRetries uint64
	// ReadMaxRetries is the max number of times the broker will attempt to
	// read messages before giving up.
	ReadMaxRetries uint64
	// RetryMaxConsecutiveFailures is the number of consecutive failures allowed
	// before we switch to a different Access client for subsequent attempts.
	RetryMaxConsecutiveFailures int
	// RetryInitialWait is the initial duration to wait between retries for all
	// retryable requests - increases exponentially for subsequent retries.
	RetryInitialWait time.Duration
	// RetryJitterPct is the percentage jitter to introduce to each retry interval
	// for all retryable requests.
	RetryJitterPct uint64
}

// DefaultBrokerConfig returns the default config for the DKG Broker component.
func DefaultBrokerConfig() BrokerConfig {
	return BrokerConfig{
		PublishMaxRetries:           10,
		ReadMaxRetries:              3,
		RetryMaxConsecutiveFailures: 2,
		RetryInitialWait:            time.Second,
		RetryJitterPct:              25,
	}
}

// Broker is an implementation of the DKGBroker interface which is intended to
// be used in conjunction with the DKG MessagingEngine for private messages, and
// with the DKG smart-contract for broadcast messages.
type Broker struct {
	config                    BrokerConfig
	log                       zerolog.Logger
	unit                      *engine.Unit
	dkgInstanceID             string                    // unique identifier of the current dkg run (prevent replay attacks)
	committee                 flow.IdentitySkeletonList // identities of DKG members
	me                        module.Local              // used for signing broadcast messages
	myIndex                   int                       // index of this instance in the committee
	dkgContractClients        []module.DKGContractClient // array of clients to communicate with the DKG smart contract in priority order for fallbacks during retries
	lastSuccessfulClientIndex int                        // index of the contract client that was last successful during retries
	tunnel                    *BrokerTunnel                     // channels through which the broker communicates with the network engine
	privateMsgCh              chan messages.PrivDKGMessageIn    // channel to forward incoming private messages to consumers
	broadcastMsgCh            chan messages.BroadcastDKGMessage // channel to forward incoming broadcast messages to consumers
	messageOffset             uint                              // offset for next broadcast messages to fetch
	shutdownCh                chan struct{}                     // channel to stop the broker from listening

	broadcasts uint // broadcasts counts the number of attempted broadcasts

	clientLock    sync.Mutex // lock around updates to current client
	broadcastLock sync.Mutex // lock around outbound broadcasts
	pollLock      sync.Mutex // lock around polls to read inbound broadcasts
}

var _ module.DKGBroker = (*Broker)(nil)

// NewBroker instantiates a new epoch-specific broker capable of communicating
// with other nodes via a network engine and dkg smart-contract.
//
// NOTE: the constructor starts a background goroutine (listen) which forwards
// private messages from the tunnel; it runs until Shutdown is called.
func NewBroker(
	log zerolog.Logger,
	dkgInstanceID string,
	committee flow.IdentitySkeletonList,
	me module.Local,
	myIndex int,
	dkgContractClients []module.DKGContractClient,
	tunnel *BrokerTunnel,
	opts ...BrokerOpt,
) *Broker {

	// start from the default config and apply any functional options on top
	config := DefaultBrokerConfig()
	for _, apply := range opts {
		apply(&config)
	}

	b := &Broker{
		config:             config,
		log:                log.With().Str("component", "dkg_broker").Str("dkg_instance_id", dkgInstanceID).Logger(),
		unit:               engine.NewUnit(),
		dkgInstanceID:      dkgInstanceID,
		committee:          committee,
		me:                 me,
		myIndex:            myIndex,
		dkgContractClients: dkgContractClients,
		tunnel:             tunnel,
		// both message channels are unbuffered: delivery blocks until the
		// consumer reads (see Poll and onPrivateMessage)
		privateMsgCh:   make(chan messages.PrivDKGMessageIn),
		broadcastMsgCh: make(chan messages.BroadcastDKGMessage),
		shutdownCh:     make(chan struct{}),
	}

	go b.listen()

	return b
}

/*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Implement DKGBroker
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/

// GetIndex returns the index of this node in the committee list.
func (b *Broker) GetIndex() int {
	return b.myIndex
}

// PrivateSend sends a DKGMessage to a destination over a private channel. It
// appends the current DKG instance ID to the message.
//
// An out-of-range destination index is logged and dropped rather than returned
// as an error (the DKG protocol layer has no error path for sends).
func (b *Broker) PrivateSend(dest int, data []byte) {
	if dest >= len(b.committee) || dest < 0 {
		b.log.Error().Msgf("destination id out of range: %d", dest)
		return
	}
	dkgMessageOut := messages.PrivDKGMessageOut{
		DKGMessage: messages.NewDKGMessage(data, b.dkgInstanceID),
		DestID:     b.committee[dest].NodeID,
	}
	b.tunnel.SendOut(dkgMessageOut)
}

// Broadcast signs and broadcasts a message to all participants.
func (b *Broker) Broadcast(data []byte) {
	// The broadcast runs asynchronously inside the engine unit; failures are
	// logged rather than returned to the caller.
	b.unit.Launch(func() {

		// NOTE: We're counting the number of times the underlying DKG requested
		// a broadcast so that we can detect an unhappy path (any time there is
		// more than 1 broadcast message per DKG) Thus incrementing broadcasts
		// before we perform the broadcasts is okay.
		b.broadcastLock.Lock()
		// NOTE(review): data[0] panics if data is empty — presumably the DKG
		// layer never broadcasts an empty payload; confirm with callers.
		if b.broadcasts > 0 {
			// The warn-level log is used by the integration tests to check if this
			// func is called more than once within one epoch (unhappy path).
			b.log.Warn().Msgf("preparing to send DKG broadcast number %d with header %d", b.broadcasts+1, data[0])
		} else {
			b.log.Info().Msgf("preparing to send DKG message broadcast with header %d", data[0])
		}
		b.broadcasts++
		log := b.log.With().Uint("broadcast_number", b.broadcasts).Logger()
		b.broadcastLock.Unlock()

		bcastMsg, err := b.prepareBroadcastMessage(data)
		if err != nil {
			// NOTE(review): Fatal terminates the whole process from inside a
			// launched goroutine; signing should only fail on misconfiguration.
			log.Fatal().Err(err).Msg("failed to create broadcast message")
		}

		// exponential backoff with jitter, capped at PublishMaxRetries attempts
		backoff := retry.NewExponential(b.config.RetryInitialWait)
		backoff = retry.WithMaxRetries(b.config.PublishMaxRetries, backoff)
		backoff = retry.WithJitterPercent(b.config.RetryJitterPct, backoff)

		// after RetryMaxConsecutiveFailures in a row, fall back to the next
		// contract client; the closure mutates clientIndex/dkgContractClient
		// which the retry body below reads on its next attempt
		clientIndex, dkgContractClient := b.getInitialContractClient()
		onMaxConsecutiveRetries := func(totalAttempts int) {
			clientIndex, dkgContractClient = b.updateContractClient(clientIndex)
			log.Warn().Msgf("broadcast: retrying on attempt (%d) with fallback access node at index (%d)", totalAttempts, clientIndex)
		}
		backoff = retrymiddleware.AfterConsecutiveFailures(b.config.RetryMaxConsecutiveFailures, backoff, onMaxConsecutiveRetries)

		// hold broadcastLock for the whole retry loop so only one outbound
		// broadcast is in flight at a time
		b.broadcastLock.Lock()
		attempts := 1
		err = retry.Do(b.unit.Ctx(), backoff, func(ctx context.Context) error {
			err := dkgContractClient.Broadcast(bcastMsg)
			if err != nil {
				log.Error().Err(err).Msgf("error broadcasting, retrying (attempt %d)", attempts)
				attempts++
				return retry.RetryableError(err)
			}

			// update our last successful client index for future calls
			b.updateLastSuccessfulClient(clientIndex)
			return nil
		})
		b.broadcastLock.Unlock()

		// Various network conditions can result in errors while broadcasting DKG messages.
		// Because the overall DKG is resilient to individual message failures,
		// it is acceptable to log the error and move on.
		if err != nil {
			log.Error().Err(err).Msgf("failed to broadcast message after %d attempts", attempts)
			return
		}
		log.Info().Msgf("dkg broadcast successfully on attempt %d", attempts)
	})
}

// SubmitResult publishes the result of the DKG protocol to the smart contract.
//
// Unlike Broadcast, this call is synchronous and returns an error to the
// caller if all retry attempts fail.
func (b *Broker) SubmitResult(groupKey crypto.PublicKey, pubKeys []crypto.PublicKey) error {

	// If the DKG failed locally, we will get a nil key vector here. We need to convert
	// the nil slice to a slice of nil keys before submission.
	//
	// In general, if pubKeys does not have one key per participant, we cannot submit
	// a valid result - therefore we submit a nil vector (indicating that we have
	// completed the process, but we know that we don't have a valid result).
	if len(pubKeys) != len(b.committee) {
		b.log.Warn().Msgf("submitting dkg result with incomplete key vector (len=%d, expected=%d)", len(pubKeys), len(b.committee))
		// create a key vector with one nil entry for each committee member
		pubKeys = make([]crypto.PublicKey, len(b.committee))
	}

	// same retry strategy as Broadcast: exponential backoff with jitter,
	// capped at PublishMaxRetries attempts
	backoff := retry.NewExponential(b.config.RetryInitialWait)
	backoff = retry.WithMaxRetries(b.config.PublishMaxRetries, backoff)
	backoff = retry.WithJitterPercent(b.config.RetryJitterPct, backoff)

	// fall back to the next contract client after repeated consecutive failures
	clientIndex, dkgContractClient := b.getInitialContractClient()
	onMaxConsecutiveRetries := func(totalAttempts int) {
		clientIndex, dkgContractClient = b.updateContractClient(clientIndex)
		b.log.Warn().Msgf("submit result: retrying on attempt (%d) with fallback access node at index (%d)", totalAttempts, clientIndex)
	}
	backoff = retrymiddleware.AfterConsecutiveFailures(b.config.RetryMaxConsecutiveFailures, backoff, onMaxConsecutiveRetries)

	attempts := 1
	err := retry.Do(b.unit.Ctx(), backoff, func(ctx context.Context) error {
		err := dkgContractClient.SubmitResult(groupKey, pubKeys)
		if err != nil {
			b.log.Error().Err(err).Msgf("error submitting DKG result, retrying (attempt %d)", attempts)
			attempts++
			return retry.RetryableError(err)
		}

		// update our last successful client index for future calls
		b.updateLastSuccessfulClient(clientIndex)
		return nil
	})
	if err != nil {
		return fmt.Errorf("failed to submit dkg result after %d attempts: %w", attempts, err)
	}

	b.log.Info().Msgf("dkg result submitted successfully on attempt %d", attempts)
	return nil
}

// Disqualify flags that a node is misbehaving and got disqualified
func (b *Broker) Disqualify(node int, log string) {
	// lookup the node ID when the index is in range; otherwise the zero
	// identifier is logged
	var nodeID flow.Identifier
	if node < len(b.committee) {
		nodeID = b.committee[node].NodeID
	}

	// The warn-level log is used by the integration tests to check if this
	// method is called.
	b.log.Warn().Msgf("participant %d (this node) is disqualifying participant (index=%d, node_id=%s) because: %s",
		b.myIndex, node, nodeID, log)
}

// FlagMisbehavior warns that a node is misbehaving.
func (b *Broker) FlagMisbehavior(node int, log string) {
	// lookup the node ID when the index is in range; otherwise the zero
	// identifier is logged
	var nodeID flow.Identifier
	if node < len(b.committee) {
		nodeID = b.committee[node].NodeID
	}

	// The warn-level log is used by the integration tests to check if this method is called.
	b.log.Warn().Msgf("participant %d (this node) is flagging participant (index=%d, node_id=%s) because: %s",
		b.myIndex, node, nodeID, log)
}

// GetPrivateMsgCh returns the channel through which consumers can receive
// incoming private DKG messages.
func (b *Broker) GetPrivateMsgCh() <-chan messages.PrivDKGMessageIn {
	return b.privateMsgCh
}

// GetBroadcastMsgCh returns the channel through which consumers can receive
// incoming broadcast DKG messages.
func (b *Broker) GetBroadcastMsgCh() <-chan messages.BroadcastDKGMessage {
	return b.broadcastMsgCh
}

// Poll calls the DKG smart contract to get missing DKG messages for the current
// epoch, and forwards them to the msgCh. It should be called with the ID of a
// block whose seal is finalized. The function doesn't return until the received
// messages are processed by the consumer because b.msgCh is not buffered.
func (b *Broker) Poll(referenceBlock flow.Identifier) error {
	// We only issue one poll at a time to avoid delivering duplicate broadcast messages.
	// The messageOffset determines which messages are retrieved by a Poll,
	// and is not updated until the end of this function.
	b.pollLock.Lock()
	defer b.pollLock.Unlock()

	// reads use ReadMaxRetries (lower than the publish cap) since missed
	// messages are picked up by the next poll anyway
	backoff := retry.NewExponential(b.config.RetryInitialWait)
	backoff = retry.WithMaxRetries(b.config.ReadMaxRetries, backoff)
	backoff = retry.WithJitterPercent(b.config.RetryJitterPct, backoff)

	// fall back to the next contract client after repeated consecutive failures
	clientIndex, dkgContractClient := b.getInitialContractClient()
	onMaxConsecutiveRetries := func(totalAttempts int) {
		clientIndex, dkgContractClient = b.updateContractClient(clientIndex)
		b.log.Warn().Msgf("poll: retrying on attempt (%d) with fallback access node at index (%d)", totalAttempts, clientIndex)
	}
	backoff = retrymiddleware.AfterConsecutiveFailures(b.config.RetryMaxConsecutiveFailures, backoff, onMaxConsecutiveRetries)

	var msgs []messages.BroadcastDKGMessage
	var err error
	attempt := 1
	err = retry.Do(b.unit.Ctx(), backoff, func(ctx context.Context) error {
		msgs, err = dkgContractClient.ReadBroadcast(b.messageOffset, referenceBlock)
		if err != nil {
			err = fmt.Errorf("could not read broadcast messages (attempt: %d, offset: %d, ref: %v): %w", attempt, b.messageOffset, referenceBlock, err)
			attempt++
			return retry.RetryableError(err)
		}

		// update our last successful client index for future calls
		b.updateLastSuccessfulClient(clientIndex)
		return nil
	})
	// Various network conditions can result in errors while reading DKG messages
	// We will read any missed messages during the next poll because messageOffset is not increased
	if err != nil {
		b.log.Error().Err(err).Msgf("failed to read messages after %d attempts", attempt)
		return nil
	}

	for _, msg := range msgs {
		// set the CommitteeMemberIndex field for the message
		memberIndex, ok := b.committee.GetIndex(msg.NodeID)
		if !ok {
			b.log.Error().Msgf("broadcast message from node with id (%v) does not match the ID of any committee member", msg.NodeID)
			continue
		}
		msg.CommitteeMemberIndex = uint64(memberIndex)

		ok, err := b.verifyBroadcastMessage(msg)
		if err != nil {
			b.log.Error().Err(err).Msg("unable to verify broadcast message")
			continue
		}
		if !ok {
			// NOTE(review): err is nil on this branch (verification returned
			// false without error), so .Err(err) attaches nothing to the log.
			b.log.Error().Err(err).Msg("invalid signature on broadcast dkg message")
			continue
		}
		b.log.Debug().Msgf("forwarding broadcast message to controller")
		// blocking send: the channel is unbuffered, so this waits for the consumer
		b.broadcastMsgCh <- msg
	}

	// update message offset to use for future polls, this avoids forwarding the
	// same message more than once
	b.messageOffset += uint(len(msgs))
	return nil
}

// Shutdown stops the goroutine that listens to incoming private messages.
func (b *Broker) Shutdown() {
	close(b.shutdownCh)
}

/*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/

// updateContractClient will return the last successful client index by default for all initial operations or else
// it will return the appropriate client index with respect to last successful and number of client.
//
// If the given index still points at the last successful client, advance to
// the next client (wrapping around); otherwise another caller already moved
// the last-successful index, so reuse that one.
func (b *Broker) updateContractClient(clientIndex int) (int, module.DKGContractClient) {
	b.clientLock.Lock()
	defer b.clientLock.Unlock()
	if clientIndex == b.lastSuccessfulClientIndex {
		if clientIndex == len(b.dkgContractClients)-1 {
			clientIndex = 0
		} else {
			clientIndex++
		}
	} else {
		clientIndex = b.lastSuccessfulClientIndex
	}

	return clientIndex, b.dkgContractClients[clientIndex]
}

// getInitialContractClient will return the last successful contract client or the initial
func (b *Broker) getInitialContractClient() (int, module.DKGContractClient) {
	b.clientLock.Lock()
	defer b.clientLock.Unlock()
	return b.lastSuccessfulClientIndex, b.dkgContractClients[b.lastSuccessfulClientIndex]
}

// updateLastSuccessfulClient set lastSuccessfulClientIndex in concurrency safe way
func (b *Broker) updateLastSuccessfulClient(clientIndex int) {
	b.clientLock.Lock()
	defer b.clientLock.Unlock()

	b.lastSuccessfulClientIndex = clientIndex
}

// listen is a blocking call that processes incoming messages from the network
// engine. It runs until shutdownCh is closed (see Shutdown).
func (b *Broker) listen() {
	for {
		select {
		case msg := <-b.tunnel.MsgChIn:
			b.onPrivateMessage(msg.OriginID, msg.DKGMessage)
		case <-b.shutdownCh:
			return
		}
	}
}

// onPrivateMessage verifies the integrity of an incoming message, sets the CommitteeMemberIndex and forwards
// it to consumers via the msgCh. Messages from unknown origins or with a wrong
// DKG instance ID are logged and dropped.
func (b *Broker) onPrivateMessage(originID flow.Identifier, msg messages.DKGMessage) {
	memberIndex, ok := b.committee.GetIndex(originID)
	if !ok {
		b.log.Error().Msgf("bad message: OriginID (%v) does not match the NodeID of any committee member", originID)
		return
	}

	err := b.hasValidDKGInstanceID(msg)
	if err != nil {
		b.log.Err(err).Msg("bad message")
		return
	}

	// blocking send: the channel is unbuffered, so this waits for the consumer
	b.privateMsgCh <- messages.PrivDKGMessageIn{DKGMessage: msg, OriginID: originID, CommitteeMemberIndex: uint64(memberIndex)}
}

// hasValidDKGInstanceID returns an error if msg.DKGInstanceID does not match Broker.DKGInstanceID
func (b *Broker) hasValidDKGInstanceID(msg messages.DKGMessage) error {
	// check that the message corresponds to the current epoch
	if b.dkgInstanceID != msg.DKGInstanceID {
		return fmt.Errorf("wrong DKG instance. Got %s, want %s", msg.DKGInstanceID, b.dkgInstanceID)
	}

	return nil
}

// prepareBroadcastMessage creates BroadcastDKGMessage with a signature from the
// node's staking key.
func (b *Broker) prepareBroadcastMessage(data []byte) (messages.BroadcastDKGMessage, error) {
	// embed the current DKG instance ID so receivers can reject replays
	dkgMessage := messages.NewDKGMessage(
		data,
		b.dkgInstanceID,
	)
	// sign the canonical fingerprint of the message with this node's key
	// (via module.Local), using the DKG-specific hasher
	sigData := fingerprint.Fingerprint(dkgMessage)
	signature, err := b.me.Sign(sigData[:], NewDKGMessageHasher())
	if err != nil {
		return messages.BroadcastDKGMessage{}, err
	}
	bcastMsg := messages.BroadcastDKGMessage{
		DKGMessage: dkgMessage,
		Signature:  signature,
	}
	return bcastMsg, nil
}

// verifyBroadcastMessage checks the DKG instance of a broadcast
// message, as well as the signature against the staking key of the sender.
// Returns:
//   - true, nil if the message contents are valid and have a valid signature
//   - false, nil if the message contents are valid but have an invalid signature
//   - false, err if the message contents are invalid, or could not be checked,
//     or the signature could not be checked
//
// TODO differentiate errors
//
// NOTE(review): CommitteeMemberIndex is used to index b.committee without a
// bounds check here — presumably the caller (Poll) always sets it from
// committee.GetIndex before calling; confirm no other callers exist.
func (b *Broker) verifyBroadcastMessage(bcastMsg messages.BroadcastDKGMessage) (bool, error) {
	err := b.hasValidDKGInstanceID(bcastMsg.DKGMessage)
	if err != nil {
		return false, fmt.Errorf("invalid dkg instance: %w", err)
	}
	origin := b.committee[bcastMsg.CommitteeMemberIndex]
	signData := fingerprint.Fingerprint(bcastMsg.DKGMessage)
	return origin.StakingPubKey.Verify(
		bcastMsg.Signature,
		signData[:],
		NewDKGMessageHasher(),
	)
}