github.com/onflow/flow-go@v0.35.7-crescendo-preview.23-atree-inlining/consensus/hotstuff/integration/liveness_test.go

package integration

import (
	"errors"
	"sync"
	"testing"
	"time"

	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"

	"github.com/onflow/flow-go/consensus/hotstuff/model"
	"github.com/onflow/flow-go/consensus/hotstuff/pacemaker/timeout"
	"github.com/onflow/flow-go/model/flow"
	"github.com/onflow/flow-go/utils/unittest"
)

// pmTimeout is the PaceMaker timeout used throughout these tests.
// If your machine is fast enough, 10 ms is enough.
const pmTimeout = 100 * time.Millisecond

// maxTimeoutRebroadcast specifies how often the PaceMaker rebroadcasts
// its timeout object in case there is no progress. We keep the value
// small to reduce latency.
const maxTimeoutRebroadcast = 1 * time.Second

// If 2 nodes are down in a 7-node cluster, the remaining 5 nodes can
// still make progress and reach consensus.
func Test2TimeoutOutof7Instances(t *testing.T) {

	healthyReplicas := 5
	notVotingReplicas := 2
	finalView := uint64(30)

	// generate the seven hotstuff participants
	participants := unittest.IdentityListFixture(healthyReplicas + notVotingReplicas)
	instances := make([]*Instance, 0, healthyReplicas+notVotingReplicas)
	root := DefaultRoot()
	timeouts, err := timeout.NewConfig(pmTimeout, pmTimeout, 1.5, happyPathMaxRoundFailures, maxTimeoutRebroadcast)
	require.NoError(t, err)

	// set up five instances that work fully
	for n := 0; n < healthyReplicas; n++ {
		in := NewInstance(t,
			WithRoot(root),
			WithParticipants(participants),
			WithLocalID(participants[n].NodeID),
			WithTimeouts(timeouts),
			WithStopCondition(ViewFinalized(finalView)),
		)
		instances = append(instances, in)
	}

	// set up two instances which can neither vote nor propose
	for n := healthyReplicas; n < healthyReplicas+notVotingReplicas; n++ {
		in := NewInstance(t,
			WithRoot(root),
			WithParticipants(participants),
			WithLocalID(participants[n].NodeID),
			WithTimeouts(timeouts),
			WithStopCondition(ViewFinalized(finalView)),
			WithOutgoingVotes(BlockAllVotes),
			WithOutgoingProposals(BlockAllProposals),
		)
		instances = append(instances, in)
	}

	// connect the communicators of the instances together
	Connect(t, instances)

	// start all seven instances and wait for them to wrap up
	var wg sync.WaitGroup
	for _, in := range instances {
		wg.Add(1)
		go func(in *Instance) {
			err := in.Run()
			require.True(t, errors.Is(err, errStopCondition))
			wg.Done()
		}(in)
	}
	unittest.AssertReturnsBefore(t, wg.Wait, 10*time.Second, "expect to finish before timeout")

	// check that all instances have the same finalized block
	ref := instances[0]
	assert.Equal(t, finalView, ref.forks.FinalizedBlock().View, "expected instance 0 to have made enough progress, but it didn't")
	finalizedViews := FinalizedViews(ref)
	for i := 1; i < healthyReplicas; i++ {
		assert.Equal(t, ref.forks.FinalizedBlock(), instances[i].forks.FinalizedBlock(), "instance %d should have the same finalized block as the first instance", i)
		assert.Equal(t, finalizedViews, FinalizedViews(instances[i]), "instance %d should have the same finalized views as the first instance", i)
	}
}
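// The fault tolerance exercised above follows the usual BFT supermajority arithmetic:
// with 7 equally-weighted participants, building a QC (or TC) requires strictly more
// than 2/3 of the total weight, i.e. at least 5 signers, so 2 silent replicas can be
// tolerated. The helper below is a minimal, illustrative sketch of that arithmetic
// only; it is hypothetical and not the production weight-threshold logic of flow-go.
func exampleSupermajorityThreshold(totalWeight uint64) uint64 {
	// smallest weight strictly greater than 2/3 of the total weight
	return totalWeight*2/3 + 1
}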
// 2 nodes in a 4-node cluster are configured to only send timeout messages (no voting or proposing).
// The other 2 unconstrained nodes should be able to make progress through the recovery path by creating TCs
// for every round, but no block will be finalized, because finalization requires a direct 1-chain and a QC.
func Test2TimeoutOutof4Instances(t *testing.T) {

	healthyReplicas := 2
	replicasDroppingHappyPathMsgs := 2
	finalView := uint64(30)

	// generate the 4 hotstuff participants
	participants := unittest.IdentityListFixture(healthyReplicas + replicasDroppingHappyPathMsgs)
	instances := make([]*Instance, 0, healthyReplicas+replicasDroppingHappyPathMsgs)
	root := DefaultRoot()
	timeouts, err := timeout.NewConfig(10*time.Millisecond, 50*time.Millisecond, 1.5, happyPathMaxRoundFailures, maxTimeoutRebroadcast)
	require.NoError(t, err)

	// set up two instances that work fully
	for n := 0; n < healthyReplicas; n++ {
		in := NewInstance(t,
			WithRoot(root),
			WithParticipants(participants),
			WithLocalID(participants[n].NodeID),
			WithTimeouts(timeouts),
			WithStopCondition(ViewReached(finalView)),
		)
		instances = append(instances, in)
	}

	// set up instances which can neither vote nor propose
	for n := healthyReplicas; n < healthyReplicas+replicasDroppingHappyPathMsgs; n++ {
		in := NewInstance(t,
			WithRoot(root),
			WithParticipants(participants),
			WithLocalID(participants[n].NodeID),
			WithTimeouts(timeouts),
			WithStopCondition(ViewReached(finalView)),
			WithOutgoingVotes(BlockAllVotes),
			WithIncomingVotes(BlockAllVotes),
			WithOutgoingProposals(BlockAllProposals),
		)
		instances = append(instances, in)
	}

	// connect the communicators of the instances together
	Connect(t, instances)

	// start the instances and wait for them to finish
	var wg sync.WaitGroup
	for _, in := range instances {
		wg.Add(1)
		go func(in *Instance) {
			err := in.Run()
			require.True(t, errors.Is(err, errStopCondition), "should run until stop condition")
			wg.Done()
		}(in)
	}
	unittest.AssertReturnsBefore(t, wg.Wait, 10*time.Second, "expect to finish before timeout")

	// check that all instances have the same finalized block
	ref := instances[0]
	finalizedViews := FinalizedViews(ref)
	assert.Equal(t, []uint64{0}, finalizedViews, "no view should be finalized, because finalization requires a direct 1-chain plus a QC, which never happens in this case")
	assert.Equal(t, finalView, ref.pacemaker.CurView(), "expected instance 0 to have made enough progress, but it didn't")
	for i := 1; i < healthyReplicas; i++ {
		assert.Equal(t, ref.forks.FinalizedBlock(), instances[i].forks.FinalizedBlock(), "instance %d should have the same finalized block as the first instance", i)
		assert.Equal(t, finalizedViews, FinalizedViews(instances[i]), "instance %d should have the same finalized views as the first instance", i)
		assert.Equal(t, finalView, instances[i].pacemaker.CurView(), "instance %d should have the same active view as the first instance", i)
	}
}
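// The "nothing finalizes" outcome asserted above is a direct consequence of the
// finalization rule referenced in the test's doc comment: a block only becomes
// finalized once it has a certified child built in the immediately following view
// (a direct 1-chain), plus a QC on top. Rounds that end in a TC leave a view gap and
// therefore never complete that pattern. The predicate below is a hypothetical,
// illustrative sketch of the "direct 1-chain" part only, not the production Forks logic.
func exampleIsDirectOneChain(parentView, childView uint64) bool {
	// the child must have been proposed in the view immediately after its parent
	return childView == parentView+1
}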
// If 1 node is down in a 5-node cluster, the remaining 4 nodes can
// still make progress and reach consensus.
func Test1TimeoutOutof5Instances(t *testing.T) {

	healthyReplicas := 4
	blockedReplicas := 1
	finalView := uint64(30)

	// generate the five hotstuff participants
	participants := unittest.IdentityListFixture(healthyReplicas + blockedReplicas)
	instances := make([]*Instance, 0, healthyReplicas+blockedReplicas)
	root := DefaultRoot()
	timeouts, err := timeout.NewConfig(pmTimeout, pmTimeout, 1.5, happyPathMaxRoundFailures, maxTimeoutRebroadcast)
	require.NoError(t, err)

	// set up instances that work fully
	for n := 0; n < healthyReplicas; n++ {
		in := NewInstance(t,
			WithRoot(root),
			WithParticipants(participants),
			WithLocalID(participants[n].NodeID),
			WithTimeouts(timeouts),
			WithStopCondition(ViewFinalized(finalView)),
		)
		instances = append(instances, in)
	}

	// set up one instance which can neither vote nor propose
	for n := healthyReplicas; n < healthyReplicas+blockedReplicas; n++ {
		in := NewInstance(t,
			WithRoot(root),
			WithParticipants(participants),
			WithLocalID(participants[n].NodeID),
			WithTimeouts(timeouts),
			WithStopCondition(ViewReached(finalView)),
			WithOutgoingVotes(BlockAllVotes),
			WithOutgoingProposals(BlockAllProposals),
		)
		instances = append(instances, in)
	}

	// connect the communicators of the instances together
	Connect(t, instances)

	// start all five instances and wait for them to wrap up
	var wg sync.WaitGroup
	for _, in := range instances {
		wg.Add(1)
		go func(in *Instance) {
			err := in.Run()
			require.True(t, errors.Is(err, errStopCondition))
			wg.Done()
		}(in)
	}
	success := unittest.AssertReturnsBefore(t, wg.Wait, 10*time.Second, "expect to finish before timeout")
	if !success {
		t.Logf("dumping state of system:")
		for i, inst := range instances {
			t.Logf(
				"instance %d: curView=%d newestQCView=%d finalizedView=%d",
				i,
				inst.pacemaker.CurView(),
				inst.pacemaker.NewestQC().View,
				inst.forks.FinalizedBlock().View,
			)
		}
	}

	// check that all instances have the same finalized block
	ref := instances[0]
	finalizedViews := FinalizedViews(ref)
	assert.Equal(t, finalView, ref.forks.FinalizedBlock().View, "expected instance 0 to have made enough progress, but it didn't")
	for i := 1; i < healthyReplicas; i++ {
		assert.Equal(t, ref.forks.FinalizedBlock(), instances[i].forks.FinalizedBlock(), "instance %d should have the same finalized block as the first instance", i)
		assert.Equal(t, finalizedViews, FinalizedViews(instances[i]), "instance %d should have the same finalized views as the first instance", i)
	}
}
// TestBlockDelayIsHigherThanTimeout tests a protocol edge case, where
// - The block arrives in time for replicas to vote.
// - The next primary does not respond in time with a follow-up proposal,
//   so nodes start sending TimeoutObjects.
// - However, eventually, the next primary successfully constructs a QC and a new
//   block before a TC leads to the round timing out.
//
// This test verifies that nodes still make progress on the happy path (QC constructed),
// despite already having initiated the timeout.
// Example scenarios in which this timing edge case could manifest:
// - block delay is very close to (or larger than) the round duration
// - delayed message transmission (specifically votes) within the network
// - overwhelmed / slowed-down primary
// - byzantine primary
//
// Implementation:
// - We have 4 nodes in total where the TimeoutObjects from two of them are always
//   discarded. Therefore, no TC can be constructed.
// - To force nodes to initiate the timeout (i.e. send TimeoutObjects), we set
//   the `blockRateDelay` to _twice_ the PaceMaker Timeout. Furthermore, we configure
//   the PaceMaker to only increase the timeout duration after 6 successive round failures.
func TestBlockDelayIsHigherThanTimeout(t *testing.T) {
	healthyReplicas := 2
	replicasNotGeneratingTimeouts := 2
	finalView := uint64(20)

	// generate the 4 hotstuff participants
	participants := unittest.IdentityListFixture(healthyReplicas + replicasNotGeneratingTimeouts)
	instances := make([]*Instance, 0, healthyReplicas+replicasNotGeneratingTimeouts)
	root := DefaultRoot()
	// set block rate delay to be bigger than minimal timeout
	timeouts, err := timeout.NewConfig(pmTimeout, pmTimeout, 1.5, happyPathMaxRoundFailures, maxTimeoutRebroadcast)
	require.NoError(t, err)

	// set up 2 instances that work fully (incl. sending TimeoutObjects)
	for n := 0; n < healthyReplicas; n++ {
		in := NewInstance(t,
			WithRoot(root),
			WithParticipants(participants),
			WithLocalID(participants[n].NodeID),
			WithTimeouts(timeouts),
			WithStopCondition(ViewFinalized(finalView)),
		)
		instances = append(instances, in)
	}

	// set up two instances which neither generate nor receive timeout objects
	for n := healthyReplicas; n < healthyReplicas+replicasNotGeneratingTimeouts; n++ {
		in := NewInstance(t,
			WithRoot(root),
			WithParticipants(participants),
			WithLocalID(participants[n].NodeID),
			WithTimeouts(timeouts),
			WithStopCondition(ViewFinalized(finalView)),
			WithIncomingTimeoutObjects(BlockAllTimeoutObjects),
			WithOutgoingTimeoutObjects(BlockAllTimeoutObjects),
		)
		instances = append(instances, in)
	}

	// connect the communicators of the instances together
	Connect(t, instances)

	// start all 4 instances and wait for them to wrap up
	var wg sync.WaitGroup
	for _, in := range instances {
		wg.Add(1)
		go func(in *Instance) {
			err := in.Run()
			require.True(t, errors.Is(err, errStopCondition))
			wg.Done()
		}(in)
	}
	unittest.AssertReturnsBefore(t, wg.Wait, 10*time.Second, "expect to finish before timeout")

	// check that all instances have the same finalized block
	ref := instances[0]
	assert.Equal(t, finalView, ref.forks.FinalizedBlock().View, "expected instance 0 to have made enough progress, but it didn't")
	finalizedViews := FinalizedViews(ref)
	// In this test we rely on a QC being produced in every view.
	// Make sure the finalized views form a contiguous sequence with no gaps.
	for i := 1; i < len(finalizedViews); i++ {
		// finalized views are sorted in descending order
		if finalizedViews[i-1] != finalizedViews[i]+1 {
			t.Fatalf("finalized views series has a gap, which is not expected: %v", finalizedViews)
			return
		}
	}
	for i := 1; i < healthyReplicas; i++ {
		assert.Equal(t, ref.forks.FinalizedBlock(), instances[i].forks.FinalizedBlock(), "instance %d should have the same finalized block as the first instance", i)
		assert.Equal(t, finalizedViews, FinalizedViews(instances[i]), "instance %d should have the same finalized views as the first instance", i)
	}
}
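// The timing assumptions in the test above rely on the PaceMaker keeping its round
// timeout flat for the first happyPathMaxRoundFailures unsuccessful rounds and only
// growing it multiplicatively afterwards (using the 1.5 adjustment factor passed to
// timeout.NewConfig). The sketch below illustrates that progression under these
// assumptions; it is hypothetical, ignores the configured maximum timeout cap, and is
// not the production timeout controller implementation.
func exampleRoundTimeout(minTimeout time.Duration, adjustmentFactor float64, maxHappyPathRoundFailures uint64, consecutiveFailedRounds uint64) time.Duration {
	if consecutiveFailedRounds <= maxHappyPathRoundFailures {
		// within the tolerated happy-path failures, the timeout stays at its minimum
		return minTimeout
	}
	grown := float64(minTimeout)
	for i := maxHappyPathRoundFailures; i < consecutiveFailedRounds; i++ {
		grown *= adjustmentFactor
	}
	return time.Duration(grown)
}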
// TestAsyncClusterStartup tests a realistic scenario where nodes are started asynchronously:
// - Replicas are started in sequential order.
// - Each replica skips voting for the first block (emulating message omission).
// - Each replica skips its first Timeout Object [TO] (emulating message omission).
// - At this point, the protocol loses liveness unless a timeout rebroadcast happens from a super-majority of replicas.
//
// This test verifies that nodes still make progress, despite the first TO messages being lost.
// Implementation:
// - We have 4 replicas in total, each of them skips voting for the first view to force a timeout.
// - Block TOs for the whole committee until each replica has generated its first TO.
// - After each replica has generated a timeout, allow subsequent timeout rebroadcasts to make progress.
func TestAsyncClusterStartup(t *testing.T) {
	replicas := 4
	finalView := uint64(20)

	// generate the four hotstuff participants
	participants := unittest.IdentityListFixture(replicas)
	instances := make([]*Instance, 0, replicas)
	root := DefaultRoot()
	// set block rate delay to be bigger than minimal timeout
	timeouts, err := timeout.NewConfig(pmTimeout, pmTimeout, 1.5, 6, maxTimeoutRebroadcast)
	require.NoError(t, err)

	// set up instances that work fully
	var lock sync.Mutex
	timeoutObjectGenerated := make(map[flow.Identifier]struct{})
	for n := 0; n < replicas; n++ {
		in := NewInstance(t,
			WithRoot(root),
			WithParticipants(participants),
			WithLocalID(participants[n].NodeID),
			WithTimeouts(timeouts),
			WithStopCondition(ViewFinalized(finalView)),
			WithOutgoingVotes(func(vote *model.Vote) bool {
				return vote.View == 1
			}),
			WithOutgoingTimeoutObjects(func(object *model.TimeoutObject) bool {
				lock.Lock()
				defer lock.Unlock()
				timeoutObjectGenerated[object.SignerID] = struct{}{}
				// start allowing timeouts once every node has generated one;
				// when the nodes rebroadcast, the messages will go through
				return len(timeoutObjectGenerated) != replicas
			}),
		)
		instances = append(instances, in)
	}

	// connect the communicators of the instances together
	Connect(t, instances)

	// start each node only after the previous one has started
	var wg sync.WaitGroup
	for _, in := range instances {
		wg.Add(1)
		go func(in *Instance) {
			err := in.Run()
			require.True(t, errors.Is(err, errStopCondition))
			wg.Done()
		}(in)
	}
	unittest.AssertReturnsBefore(t, wg.Wait, 10*time.Second, "expect to finish before timeout")

	// check that all instances have the same finalized block
	ref := instances[0]
	assert.Equal(t, finalView, ref.forks.FinalizedBlock().View, "expected instance 0 to have made enough progress, but it didn't")
	finalizedViews := FinalizedViews(ref)
	for i := 1; i < replicas; i++ {
		assert.Equal(t, ref.forks.FinalizedBlock(), instances[i].forks.FinalizedBlock(), "instance %d should have the same finalized block as the first instance", i)
		assert.Equal(t, finalizedViews, FinalizedViews(instances[i]), "instance %d should have the same finalized views as the first instance", i)
	}
}
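// The filter options used throughout this file (WithOutgoingVotes, WithIncomingVotes,
// WithOutgoingTimeoutObjects, ...) take predicates over the message type in question.
// Judging by BlockAllVotes/BlockAllProposals and the inline filters above, returning
// true means "drop this message". The filter below is a hypothetical example, shown
// only to document that convention; it is not used by any test in this file.
func exampleDropVotesForView(blockedView uint64) func(*model.Vote) bool {
	return func(vote *model.Vote) bool {
		// true -> the test harness discards the vote instead of delivering it
		return vote.View == blockedView
	}
}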