github.com/koko1123/flow-go-1@v0.29.6/module/dkg/controller.go (about) 1 package dkg 2 3 import ( 4 "fmt" 5 "math" 6 "math/rand" 7 "sync" 8 "time" 9 10 "github.com/rs/zerolog" 11 12 "github.com/onflow/flow-go/crypto" 13 "github.com/koko1123/flow-go-1/model/flow" 14 "github.com/koko1123/flow-go-1/module" 15 ) 16 17 const ( 18 19 // DefaultBaseStartDelay is the default base delay to use when introducing 20 // random delay to the DKG start process. See preStartDelay for details. 21 DefaultBaseStartDelay = 500 * time.Microsecond 22 23 // DefaultBaseHandleFirstBroadcastDelay is the default base to use when 24 // introducing random delay to processing the first DKG broadcast message. 25 // See preHandleFirstBroadcastDelay for details. 26 // 27 // For a 150-node DKG, we observe a cost of ~2.5s per message to process 28 // broadcast messages during phase 1, for a total of ~6m of total CPU time. 29 // We would like to target spreading this cost over a 30 minute period. 30 // With the default value for DefaultHandleSubsequentBroadcastDelay, this 31 // results in processing all phase 1 messages in 6m+6m=12m, so for a maximum 32 // total processing time of 30m, we sample the initial delay from [0,18m]. 33 // We use 50ms as the default because 50ms*150^2 = 18.75m 34 // 35 DefaultBaseHandleFirstBroadcastDelay = 50 * time.Millisecond 36 37 // DefaultHandleSubsequentBroadcastDelay is the default delay to use before 38 // processing all DKG broadcasts after the first. 39 DefaultHandleSubsequentBroadcastDelay = 2500 * time.Millisecond 40 ) 41 42 // ControllerConfig defines configuration for the DKG Controller. These define 43 // how the DKG controller introduces delays to expensive DKG computations. 44 // 45 // We introduce delays for two reasons: 46 // 1. Avoid running long-running expensive DKG computations consecutively. 47 // 2. Avoid synchronizing expensive DKG computations across the DKG committee. 48 // 49 // Delays introduced prior to DKG start and prior to processing the FIRST broadcast 50 // message are sampled uniformly from [0,m), where m=b*n^2 51 // 52 // b = base delay (from config) 53 // n = size of DKG committee 54 // 55 // Delays introduced prior to processing subsequent broadcast messages are constant. 56 type ControllerConfig struct { 57 // BaseStartDelay determines the maximum delay before starting the DKG. 58 BaseStartDelay time.Duration 59 // BaseHandleFirstBroadcastDelay determines the maximum delay before handling 60 // the first broadcast message. 61 BaseHandleFirstBroadcastDelay time.Duration 62 // HandleSubsequentBroadcastDelay determines the constant delay before handling 63 // all broadcast messages following the first. 64 HandleSubsequentBroadcastDelay time.Duration 65 } 66 67 // Controller implements the DKGController interface. It controls the execution 68 // of a Joint Feldman DKG instance. A new Controller must be instantiated for 69 // every epoch. 70 type Controller struct { 71 // The embedded state Manager is used to manage the controller's underlying 72 // state. 73 Manager 74 75 log zerolog.Logger 76 77 // DKGState is the object that actually executes the protocol steps. 78 dkg crypto.DKGState 79 80 // dkgLock protects access to dkg 81 dkgLock sync.Mutex 82 83 // seed is required by DKGState 84 seed []byte 85 86 // broker enables the controller to communicate with other nodes 87 broker module.DKGBroker 88 89 // Channels used internally to trigger state transitions 90 h1Ch chan struct{} 91 h2Ch chan struct{} 92 endCh chan struct{} 93 shutdownCh chan struct{} 94 95 // private fields that hold the DKG artifacts when the protocol runs to 96 // completion 97 privateShare crypto.PrivateKey 98 publicKeys []crypto.PublicKey 99 groupPublicKey crypto.PublicKey 100 101 // artifactsLock protects access to artifacts 102 artifactsLock sync.Mutex 103 104 config ControllerConfig 105 once *sync.Once 106 } 107 108 // NewController instantiates a new Joint Feldman DKG controller. 109 func NewController( 110 log zerolog.Logger, 111 dkgInstanceID string, 112 dkg crypto.DKGState, 113 seed []byte, 114 broker module.DKGBroker, 115 config ControllerConfig, 116 ) *Controller { 117 118 logger := log.With(). 119 Str("component", "dkg_controller"). 120 Str("dkg_instance_id", dkgInstanceID). 121 Logger() 122 123 return &Controller{ 124 log: logger, 125 dkg: dkg, 126 seed: seed, 127 broker: broker, 128 h1Ch: make(chan struct{}), 129 h2Ch: make(chan struct{}), 130 endCh: make(chan struct{}), 131 shutdownCh: make(chan struct{}), 132 once: new(sync.Once), 133 config: config, 134 } 135 } 136 137 /******************************************************************************* 138 Implement DKGController 139 *******************************************************************************/ 140 141 // Run starts the DKG controller and executes the DKG state-machine. It blocks 142 // until the controller is shutdown or until an error is encountered in one of 143 // the protocol phases. 144 func (c *Controller) Run() error { 145 146 // Start DKG and transition to phase 1 147 err := c.start() 148 if err != nil { 149 return err 150 } 151 152 // Start a background routine to listen for incoming private and broadcast 153 // messages from other nodes 154 go c.doBackgroundWork() 155 156 // Execute DKG State Machine 157 for { 158 state := c.GetState() 159 c.log.Debug().Msgf("DKG: %s", c.state) 160 161 switch state { 162 case Phase1: 163 err := c.phase1() 164 if err != nil { 165 return err 166 } 167 case Phase2: 168 err := c.phase2() 169 if err != nil { 170 return err 171 } 172 case Phase3: 173 err := c.phase3() 174 if err != nil { 175 return err 176 } 177 case End: 178 c.Shutdown() 179 case Shutdown: 180 return nil 181 } 182 } 183 } 184 185 // EndPhase1 notifies the controller to end phase 1, and start phase 2 186 func (c *Controller) EndPhase1() error { 187 state := c.GetState() 188 if state != Phase1 { 189 return NewInvalidStateTransitionError(state, Phase2) 190 } 191 192 c.SetState(Phase2) 193 close(c.h1Ch) 194 195 return nil 196 } 197 198 // EndPhase2 notifies the controller to end phase 2, and start phase 3 199 func (c *Controller) EndPhase2() error { 200 state := c.GetState() 201 if state != Phase2 { 202 return NewInvalidStateTransitionError(state, Phase3) 203 } 204 205 c.SetState(Phase3) 206 close(c.h2Ch) 207 208 return nil 209 } 210 211 // End terminates the DKG state machine and records the artifacts. 212 func (c *Controller) End() error { 213 state := c.GetState() 214 if state != Phase3 { 215 return NewInvalidStateTransitionError(state, End) 216 } 217 218 c.log.Debug().Msg("DKG engine end") 219 220 // end and retrieve products of the DKG protocol 221 c.dkgLock.Lock() 222 223 privateShare, groupPublicKey, publicKeys, err := c.dkg.End() 224 c.dkgLock.Unlock() 225 if err != nil { 226 return err 227 } 228 229 c.artifactsLock.Lock() 230 c.privateShare = privateShare 231 c.groupPublicKey = groupPublicKey 232 c.publicKeys = publicKeys 233 c.artifactsLock.Unlock() 234 235 c.SetState(End) 236 close(c.endCh) 237 238 return nil 239 } 240 241 // Shutdown stops the controller regardless of the current state. 242 func (c *Controller) Shutdown() { 243 c.broker.Shutdown() 244 c.SetState(Shutdown) 245 close(c.shutdownCh) 246 } 247 248 // Poll instructs the broker to read new broadcast messages, which will be 249 // relayed through the message channel. The function does not return until the 250 // received messages are processed. 251 func (c *Controller) Poll(blockReference flow.Identifier) error { 252 return c.broker.Poll(blockReference) 253 } 254 255 // GetArtifacts returns our node's private key share, the group public key, 256 // and the list of all nodes' public keys (including ours), as computed by 257 // the DKG. 258 func (c *Controller) GetArtifacts() (crypto.PrivateKey, crypto.PublicKey, []crypto.PublicKey) { 259 c.artifactsLock.Lock() 260 defer c.artifactsLock.Unlock() 261 return c.privateShare, c.groupPublicKey, c.publicKeys 262 } 263 264 // GetIndex returns the index of this node in the DKG committee list. 265 func (c *Controller) GetIndex() int { 266 return c.broker.GetIndex() 267 } 268 269 // SubmitResult instructs the broker to submit DKG results. It is up to the 270 // caller to ensure that this method is called after a succesfull run of the 271 // protocol. 272 func (c *Controller) SubmitResult() error { 273 _, pubKey, groupKeys := c.GetArtifacts() 274 return c.broker.SubmitResult(pubKey, groupKeys) 275 } 276 277 /******************************************************************************* 278 WORKERS 279 *******************************************************************************/ 280 281 func (c *Controller) doBackgroundWork() { 282 privateMsgCh := c.broker.GetPrivateMsgCh() 283 broadcastMsgCh := c.broker.GetBroadcastMsgCh() 284 for { 285 select { 286 case msg := <-privateMsgCh: 287 c.dkgLock.Lock() 288 err := c.dkg.HandlePrivateMsg(int(msg.CommitteeMemberIndex), msg.Data) 289 c.dkgLock.Unlock() 290 if err != nil { 291 c.log.Err(err).Msg("error processing DKG private message") 292 } 293 294 case msg := <-broadcastMsgCh: 295 296 // before processing a broadcast message during phase 1, sleep for a 297 // random delay to avoid synchronizing this expensive operation across 298 // all consensus nodes 299 state := c.GetState() 300 if state == Phase1 { 301 302 // introduce a large, uniformly sampled delay prior to processing 303 // the first message 304 isFirstMessage := false 305 c.once.Do(func() { 306 isFirstMessage = true 307 delay := c.preHandleFirstBroadcastDelay() 308 c.log.Info().Msgf("sleeping for %s before processing first phase 1 broadcast message", delay) 309 time.Sleep(delay) 310 }) 311 312 if !isFirstMessage { 313 // introduce a constant delay for all subsequent messages 314 c.log.Debug().Msgf("sleeping for %s before processing subsequent phase 1 broadcast message", c.config.HandleSubsequentBroadcastDelay) 315 time.Sleep(c.config.HandleSubsequentBroadcastDelay) 316 } 317 } 318 319 c.dkgLock.Lock() 320 err := c.dkg.HandleBroadcastMsg(int(msg.CommitteeMemberIndex), msg.Data) 321 c.dkgLock.Unlock() 322 if err != nil { 323 c.log.Err(err).Msg("error processing DKG broadcast message") 324 } 325 326 case <-c.shutdownCh: 327 return 328 } 329 } 330 } 331 332 func (c *Controller) start() error { 333 state := c.GetState() 334 if state != Init { 335 return fmt.Errorf("cannot execute start routine in state %s", state) 336 } 337 338 // before starting the DKG, sleep for a random delay to avoid synchronizing 339 // this expensive operation across all consensus nodes 340 delay := c.preStartDelay() 341 c.log.Debug().Msgf("sleeping for %s before starting DKG", delay) 342 time.Sleep(delay) 343 344 c.dkgLock.Lock() 345 err := c.dkg.Start(c.seed) 346 c.dkgLock.Unlock() 347 if err != nil { 348 return fmt.Errorf("Error starting DKG: %w", err) 349 } 350 351 c.log.Debug().Msg("DKG engine started") 352 c.SetState(Phase1) 353 return nil 354 } 355 356 func (c *Controller) phase1() error { 357 state := c.GetState() 358 if state != Phase1 { 359 return fmt.Errorf("Cannot execute phase1 routine in state %s", state) 360 } 361 362 c.log.Debug().Msg("Waiting for end of phase 1") 363 for { 364 select { 365 case <-c.h1Ch: 366 return nil 367 case <-c.shutdownCh: 368 return nil 369 } 370 } 371 } 372 373 func (c *Controller) phase2() error { 374 state := c.GetState() 375 if state != Phase2 { 376 return fmt.Errorf("Cannot execute phase2 routine in state %s", state) 377 } 378 379 c.dkgLock.Lock() 380 err := c.dkg.NextTimeout() 381 c.dkgLock.Unlock() 382 if err != nil { 383 return fmt.Errorf("Error calling NextTimeout: %w", err) 384 } 385 386 c.log.Debug().Msg("Waiting for end of phase 2") 387 for { 388 select { 389 case <-c.h2Ch: 390 return nil 391 case <-c.shutdownCh: 392 return nil 393 } 394 } 395 } 396 397 func (c *Controller) phase3() error { 398 state := c.GetState() 399 if state != Phase3 { 400 return fmt.Errorf("Cannot execute phase3 routine in state %s", state) 401 } 402 403 c.dkgLock.Lock() 404 err := c.dkg.NextTimeout() 405 c.dkgLock.Unlock() 406 if err != nil { 407 return fmt.Errorf("Error calling NextTimeout: %w", err) 408 } 409 410 c.log.Debug().Msg("Waiting for end of phase 3") 411 for { 412 select { 413 case <-c.endCh: 414 return nil 415 case <-c.shutdownCh: 416 return nil 417 } 418 } 419 } 420 421 // preStartDelay returns a duration to delay prior to starting the DKG process. 422 // This prevents synchronization of the DKG starting (an expensive operation) 423 // across the network, which can impact finalization. 424 func (c *Controller) preStartDelay() time.Duration { 425 delay := computePreprocessingDelay(c.config.BaseStartDelay, c.dkg.Size()) 426 return delay 427 } 428 429 // preHandleFirstBroadcastDelay returns a duration to delay prior to handling 430 // the first broadcast message. This delay is used only during phase 1 of the DKG. 431 // This prevents synchronization of processing verification vectors (an 432 // expensive operation) across the network, which can impact finalization. 433 func (c *Controller) preHandleFirstBroadcastDelay() time.Duration { 434 delay := computePreprocessingDelay(c.config.BaseHandleFirstBroadcastDelay, c.dkg.Size()) 435 return delay 436 } 437 438 // computePreprocessingDelay computes a random delay to introduce before an 439 // expensive operation. 440 // 441 // The maximum delay is m=b*n^2 where: 442 // * b is a configurable base delay 443 // * n is the size of the DKG committee 444 func computePreprocessingDelay(baseDelay time.Duration, dkgSize int) time.Duration { 445 446 maxDelay := computePreprocessingDelayMax(baseDelay, dkgSize) 447 if maxDelay <= 0 { 448 return 0 449 } 450 // select delay from [0,m) 451 delay := time.Duration(rand.Int63n(maxDelay.Nanoseconds())) 452 return delay 453 } 454 455 // computePreprocessingDelayMax computes the maximum dely for computePreprocessingDelay. 456 func computePreprocessingDelayMax(baseDelay time.Duration, dkgSize int) time.Duration { 457 // sanity checks 458 if baseDelay < 0 { 459 baseDelay = 0 460 } 461 if dkgSize < 0 { 462 dkgSize = 0 463 } 464 465 // m=b*n^2 466 maxDelay := time.Duration(math.Pow(float64(dkgSize), 2)) * baseDelay 467 if maxDelay <= 0 { 468 return 0 469 } 470 return maxDelay 471 }