github.com/onflow/flow-go@v0.35.7-crescendo-preview.23-atree-inlining/consensus/hotstuff/integration/liveness_test.go

package integration

import (
	"errors"
	"sync"
	"testing"
	"time"

	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"

	"github.com/onflow/flow-go/consensus/hotstuff/model"
	"github.com/onflow/flow-go/consensus/hotstuff/pacemaker/timeout"
	"github.com/onflow/flow-go/model/flow"
	"github.com/onflow/flow-go/utils/unittest"
)

// pmTimeout is the PaceMaker timeout used by these tests.
// If your machine is fast enough, 10 ms would be sufficient.
const pmTimeout = 100 * time.Millisecond

// maxTimeoutRebroadcast specifies how often the PaceMaker rebroadcasts
// its timeout object in case there is no progress. We keep the value
// small to keep test latency low.
const maxTimeoutRebroadcast = 1 * time.Second
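
// Note on the timeout.NewConfig calls below: the arguments are, in order, the minimum
// replica timeout, the maximum replica timeout, the multiplicative timeout adjustment
// factor, the number of consecutive round failures tolerated on the happy path before
// timeouts start growing, and the maximum interval between timeout-object rebroadcasts.
// This reading is inferred from the argument names and values used in this file; see
// the pacemaker/timeout package for the authoritative parameter documentation.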

// If 2 nodes are down in a 7-node cluster, the remaining 5 nodes can
// still make progress and reach consensus.
func Test2TimeoutOutof7Instances(t *testing.T) {

	healthyReplicas := 5
	notVotingReplicas := 2
	finalView := uint64(30)

	// generate the seven hotstuff participants
	participants := unittest.IdentityListFixture(healthyReplicas + notVotingReplicas)
	instances := make([]*Instance, 0, healthyReplicas+notVotingReplicas)
	root := DefaultRoot()
	timeouts, err := timeout.NewConfig(pmTimeout, pmTimeout, 1.5, happyPathMaxRoundFailures, maxTimeoutRebroadcast)
	require.NoError(t, err)

	// set up five instances that work fully
	for n := 0; n < healthyReplicas; n++ {
		in := NewInstance(t,
			WithRoot(root),
			WithParticipants(participants),
			WithLocalID(participants[n].NodeID),
			WithTimeouts(timeouts),
			WithStopCondition(ViewFinalized(finalView)),
		)
		instances = append(instances, in)
	}

	// set up two instances which can neither vote nor propose
	for n := healthyReplicas; n < healthyReplicas+notVotingReplicas; n++ {
		in := NewInstance(t,
			WithRoot(root),
			WithParticipants(participants),
			WithLocalID(participants[n].NodeID),
			WithTimeouts(timeouts),
			WithStopCondition(ViewFinalized(finalView)),
			WithOutgoingVotes(BlockAllVotes),
			WithOutgoingProposals(BlockAllProposals),
		)
		instances = append(instances, in)
	}
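
	// Note: judging by the custom filter used in TestAsyncClusterStartup below (where
	// returning true suppresses the message), BlockAllVotes and BlockAllProposals are
	// presumably filters that return true for every message, e.g. roughly:
	//
	//	func BlockAllVotes(*model.Vote) bool { return true }
	//
	// so these two instances never get their votes or proposals onto the network.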

	// connect the communicators of the instances together
	Connect(t, instances)

	// start all seven instances and wait for them to wrap up
	var wg sync.WaitGroup
	for _, in := range instances {
		wg.Add(1)
		go func(in *Instance) {
			err := in.Run()
			require.True(t, errors.Is(err, errStopCondition))
			wg.Done()
		}(in)
	}
	unittest.AssertReturnsBefore(t, wg.Wait, 10*time.Second, "expect to finish before timeout")

	// check that all instances have the same finalized block
	ref := instances[0]
	assert.Equal(t, finalView, ref.forks.FinalizedBlock().View, "expected instance 0 to have made enough progress, but it didn't")
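	// FinalizedViews presumably returns the views of all blocks the instance has
	// finalized; comparing these lists below checks that the replicas agree not just
	// on the latest finalized block but on the whole finalized sequence.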
	finalizedViews := FinalizedViews(ref)
	for i := 1; i < healthyReplicas; i++ {
		assert.Equal(t, ref.forks.FinalizedBlock(), instances[i].forks.FinalizedBlock(), "instance %d should have the same finalized block as the first instance", i)
		assert.Equal(t, finalizedViews, FinalizedViews(instances[i]), "instance %d should have the same finalized views as the first instance", i)
	}
}

// 2 nodes in a 4-node cluster are configured so that they can only send timeout messages (no voting or proposing).
// The other 2 unconstrained nodes should be able to make progress through the recovery path by creating TCs
// for every round, but no block will be finalized, because finalization requires a direct 1-chain plus a QC.
func Test2TimeoutOutof4Instances(t *testing.T) {

	healthyReplicas := 2
	replicasDroppingHappyPathMsgs := 2
	finalView := uint64(30)

	// generate the 4 hotstuff participants
	participants := unittest.IdentityListFixture(healthyReplicas + replicasDroppingHappyPathMsgs)
	instances := make([]*Instance, 0, healthyReplicas+replicasDroppingHappyPathMsgs)
	root := DefaultRoot()
	timeouts, err := timeout.NewConfig(10*time.Millisecond, 50*time.Millisecond, 1.5, happyPathMaxRoundFailures, maxTimeoutRebroadcast)
	require.NoError(t, err)
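	// Unlike the other tests, this one uses a short 10 ms minimum timeout with a 50 ms
	// cap, presumably so that the purely timeout-driven rounds in this scenario turn
	// over quickly without the timeout duration growing unboundedly.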

	// set up two instances that work fully
	for n := 0; n < healthyReplicas; n++ {
		in := NewInstance(t,
			WithRoot(root),
			WithParticipants(participants),
			WithLocalID(participants[n].NodeID),
			WithTimeouts(timeouts),
			WithStopCondition(ViewReached(finalView)),
		)
		instances = append(instances, in)
	}
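
	// Note that the stop condition here is ViewReached(finalView) rather than
	// ViewFinalized(finalView), as used in the other tests: since no block is expected
	// to be finalized in this scenario, the instances presumably stop once their
	// PaceMaker's current view reaches finalView instead of waiting for finalization.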

	// set up instances which can neither vote nor propose, and which also drop all incoming votes
	for n := healthyReplicas; n < healthyReplicas+replicasDroppingHappyPathMsgs; n++ {
		in := NewInstance(t,
			WithRoot(root),
			WithParticipants(participants),
			WithLocalID(participants[n].NodeID),
			WithTimeouts(timeouts),
			WithStopCondition(ViewReached(finalView)),
			WithOutgoingVotes(BlockAllVotes),
			WithIncomingVotes(BlockAllVotes),
			WithOutgoingProposals(BlockAllProposals),
		)
		instances = append(instances, in)
	}

	// connect the communicators of the instances together
	Connect(t, instances)

	// start the instances and wait for them to finish
	var wg sync.WaitGroup
	for _, in := range instances {
		wg.Add(1)
		go func(in *Instance) {
			err := in.Run()
			require.True(t, errors.Is(err, errStopCondition), "should run until stop condition")
			wg.Done()
		}(in)
	}
	unittest.AssertReturnsBefore(t, wg.Wait, 10*time.Second, "expect to finish before timeout")

	// check that all instances have the same finalized state
	ref := instances[0]
	finalizedViews := FinalizedViews(ref)
	assert.Equal(t, []uint64{0}, finalizedViews, "no view should be finalized, because finalization requires a direct 1-chain plus a QC, which never happens in this scenario")
	assert.Equal(t, finalView, ref.pacemaker.CurView(), "expected instance 0 to have made enough progress, but it didn't")
	for i := 1; i < healthyReplicas; i++ {
		assert.Equal(t, ref.forks.FinalizedBlock(), instances[i].forks.FinalizedBlock(), "instance %d should have the same finalized block as the first instance", i)
		assert.Equal(t, finalizedViews, FinalizedViews(instances[i]), "instance %d should have the same finalized views as the first instance", i)
		assert.Equal(t, finalView, instances[i].pacemaker.CurView(), "instance %d should have the same active view as the first instance", i)
	}
}

// If 1 node is down in a 5-node cluster, the remaining 4 nodes can
// still make progress and reach consensus.
func Test1TimeoutOutof5Instances(t *testing.T) {

	healthyReplicas := 4
	blockedReplicas := 1
	finalView := uint64(30)

	// generate the five hotstuff participants
	participants := unittest.IdentityListFixture(healthyReplicas + blockedReplicas)
	instances := make([]*Instance, 0, healthyReplicas+blockedReplicas)
	root := DefaultRoot()
	timeouts, err := timeout.NewConfig(pmTimeout, pmTimeout, 1.5, happyPathMaxRoundFailures, maxTimeoutRebroadcast)
	require.NoError(t, err)

	// set up instances that work fully
	for n := 0; n < healthyReplicas; n++ {
		in := NewInstance(t,
			WithRoot(root),
			WithParticipants(participants),
			WithLocalID(participants[n].NodeID),
			WithTimeouts(timeouts),
			WithStopCondition(ViewFinalized(finalView)),
		)
		instances = append(instances, in)
	}

	// set up one instance which can neither vote nor propose
	for n := healthyReplicas; n < healthyReplicas+blockedReplicas; n++ {
		in := NewInstance(t,
			WithRoot(root),
			WithParticipants(participants),
			WithLocalID(participants[n].NodeID),
			WithTimeouts(timeouts),
			WithStopCondition(ViewReached(finalView)),
			WithOutgoingVotes(BlockAllVotes),
			WithOutgoingProposals(BlockAllProposals),
		)
		instances = append(instances, in)
	}

	// connect the communicators of the instances together
	Connect(t, instances)

	// start all five instances and wait for them to wrap up
	var wg sync.WaitGroup
	for _, in := range instances {
		wg.Add(1)
		go func(in *Instance) {
			err := in.Run()
			require.True(t, errors.Is(err, errStopCondition))
			wg.Done()
		}(in)
	}
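	// AssertReturnsBefore reports whether the goroutines finished in time; if they did
	// not, dump each instance's current view, newest QC view and latest finalized view
	// to help diagnose where progress stalled.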
	success := unittest.AssertReturnsBefore(t, wg.Wait, 10*time.Second, "expect to finish before timeout")
	if !success {
		t.Logf("dumping state of system:")
		for i, inst := range instances {
			t.Logf(
				"instance %d: %d %d %d",
				i,
				inst.pacemaker.CurView(),
				inst.pacemaker.NewestQC().View,
				inst.forks.FinalizedBlock().View,
			)
		}
	}

	// check that all instances have the same finalized block
	ref := instances[0]
	finalizedViews := FinalizedViews(ref)
	assert.Equal(t, finalView, ref.forks.FinalizedBlock().View, "expected instance 0 to have made enough progress, but it didn't")
	for i := 1; i < healthyReplicas; i++ {
		assert.Equal(t, ref.forks.FinalizedBlock(), instances[i].forks.FinalizedBlock(), "instance %d should have the same finalized block as the first instance", i)
		assert.Equal(t, finalizedViews, FinalizedViews(instances[i]), "instance %d should have the same finalized views as the first instance", i)
	}
}

// TestBlockDelayIsHigherThanTimeout tests a protocol edge case, where
//   - The block arrives in time for replicas to vote.
//   - The next primary does not respond in time with a follow-up proposal,
//     so nodes start sending TimeoutObjects.
//   - However, eventually, the next primary successfully constructs a QC and a new
//     block before a TC leads to the round timing out.
//
// This test verifies that nodes still make progress on the happy path (QC constructed),
// despite already having initiated the timeout.
// Example scenarios in which this timing edge case could manifest:
//   - the block delay is very close to (or larger than) the round duration
//   - delayed message transmission (specifically of votes) within the network
//   - an overwhelmed / slowed-down primary
//   - a byzantine primary
//
// Implementation:
//   - We have 4 nodes in total, where the TimeoutObjects from two of them are always
//     discarded. Therefore, no TC can be constructed.
//   - To force nodes to initiate the timeout (i.e. send TimeoutObjects), we set
//     the `blockRateDelay` to _twice_ the PaceMaker Timeout. Furthermore, we configure
//     the PaceMaker to only increase the timeout duration after 6 successive round failures.
func TestBlockDelayIsHigherThanTimeout(t *testing.T) {
	healthyReplicas := 2
	replicasNotGeneratingTimeouts := 2
	finalView := uint64(20)

	// generate the 4 hotstuff participants
	participants := unittest.IdentityListFixture(healthyReplicas + replicasNotGeneratingTimeouts)
	instances := make([]*Instance, 0, healthyReplicas+replicasNotGeneratingTimeouts)
	root := DefaultRoot()
	// set block rate delay to be bigger than minimal timeout
	timeouts, err := timeout.NewConfig(pmTimeout, pmTimeout, 1.5, happyPathMaxRoundFailures, maxTimeoutRebroadcast)
	require.NoError(t, err)
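	// The tolerance for consecutive timeouts before the round duration starts growing
	// corresponds to the happyPathMaxRoundFailures argument above; this is presumably
	// what the doc comment refers to when it says the PaceMaker only increases the
	// timeout after several successive round failures.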

	// set up 2 instances that fully work (incl. sending TimeoutObjects)
	for n := 0; n < healthyReplicas; n++ {
		in := NewInstance(t,
			WithRoot(root),
			WithParticipants(participants),
			WithLocalID(participants[n].NodeID),
			WithTimeouts(timeouts),
			WithStopCondition(ViewFinalized(finalView)),
		)
		instances = append(instances, in)
	}

	// set up two instances which neither send nor receive timeout objects
	for n := healthyReplicas; n < healthyReplicas+replicasNotGeneratingTimeouts; n++ {
		in := NewInstance(t,
			WithRoot(root),
			WithParticipants(participants),
			WithLocalID(participants[n].NodeID),
			WithTimeouts(timeouts),
			WithStopCondition(ViewFinalized(finalView)),
			WithIncomingTimeoutObjects(BlockAllTimeoutObjects),
			WithOutgoingTimeoutObjects(BlockAllTimeoutObjects),
		)
		instances = append(instances, in)
	}

	// connect the communicators of the instances together
	Connect(t, instances)

	// start all 4 instances and wait for them to wrap up
	var wg sync.WaitGroup
	for _, in := range instances {
		wg.Add(1)
		go func(in *Instance) {
			err := in.Run()
			require.True(t, errors.Is(err, errStopCondition))
			wg.Done()
		}(in)
	}
	unittest.AssertReturnsBefore(t, wg.Wait, 10*time.Second, "expect to finish before timeout")

	// check that all instances have the same finalized block
	ref := instances[0]
	assert.Equal(t, finalView, ref.forks.FinalizedBlock().View, "expected instance 0 to have made enough progress, but it didn't")
	finalizedViews := FinalizedViews(ref)
	// in this test we rely on a QC being produced in every view,
	// so the finalized views must be consecutive, without gaps
	for i := 1; i < len(finalizedViews); i++ {
		// finalized views are sorted in descending order, so each entry
		// must be exactly one greater than the entry that follows it
		if finalizedViews[i-1] != finalizedViews[i]+1 {
			t.Fatalf("finalized view series has a gap, which is not expected: %v", finalizedViews)
			return
		}
	}
	for i := 1; i < healthyReplicas; i++ {
		assert.Equal(t, ref.forks.FinalizedBlock(), instances[i].forks.FinalizedBlock(), "instance %d should have the same finalized block as the first instance", i)
		assert.Equal(t, finalizedViews, FinalizedViews(instances[i]), "instance %d should have the same finalized views as the first instance", i)
	}
}

// TestAsyncClusterStartup tests a realistic scenario where nodes are started asynchronously:
//   - Replicas are started in sequential order.
//   - Each replica skips voting for the first block (emulating message omission).
//   - Each replica skips its first Timeout Object [TO] (emulating message omission).
//   - At this point the protocol loses liveness unless a super-majority of replicas rebroadcast their timeouts.
//
// This test verifies that nodes still make progress, despite the first TO messages being lost.
// Implementation:
//   - We have 4 replicas in total; each of them skips voting in the first view to force a timeout.
//   - Block TOs for the whole committee until each replica has generated its first TO.
//   - After each replica has generated a timeout, allow subsequent timeout rebroadcasts through to make progress.
func TestAsyncClusterStartup(t *testing.T) {
	replicas := 4
	finalView := uint64(20)

	// generate the four hotstuff participants
	participants := unittest.IdentityListFixture(replicas)
	instances := make([]*Instance, 0, replicas)
	root := DefaultRoot()
	// tolerate up to 6 consecutive round failures before the PaceMaker increases its timeout
	timeouts, err := timeout.NewConfig(pmTimeout, pmTimeout, 1.5, 6, maxTimeoutRebroadcast)
	require.NoError(t, err)

	// set up instances that work fully
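	// timeoutObjectGenerated records, per signer ID, which replicas have produced at
	// least one TimeoutObject; lock guards it, because the outgoing-timeout filters
	// below are invoked concurrently from the different instances.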
	var lock sync.Mutex
	timeoutObjectGenerated := make(map[flow.Identifier]struct{})
	for n := 0; n < replicas; n++ {
		in := NewInstance(t,
			WithRoot(root),
			WithParticipants(participants),
			WithLocalID(participants[n].NodeID),
			WithTimeouts(timeouts),
			WithStopCondition(ViewFinalized(finalView)),
			WithOutgoingVotes(func(vote *model.Vote) bool {
				return vote.View == 1
			}),
			WithOutgoingTimeoutObjects(func(object *model.TimeoutObject) bool {
				lock.Lock()
				defer lock.Unlock()
				timeoutObjectGenerated[object.SignerID] = struct{}{}
				// only start letting timeouts through once every node has generated one;
				// when the nodes rebroadcast, those messages will then go through
				return len(timeoutObjectGenerated) != replicas
			}),
		)
		instances = append(instances, in)
	}
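
	// Net effect of the two filters above: every replica withholds its vote for view 1,
	// which forces a timeout, and every TimeoutObject is suppressed until all replicas
	// have produced one; after that, the periodic timeout rebroadcast (at most
	// maxTimeoutRebroadcast apart) gets through and the cluster recovers liveness.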

	// connect the communicators of the instances together
	Connect(t, instances)

	// launch the replicas one after another and wait for them to wrap up
	var wg sync.WaitGroup
	for _, in := range instances {
		wg.Add(1)
		go func(in *Instance) {
			err := in.Run()
			require.True(t, errors.Is(err, errStopCondition))
			wg.Done()
		}(in)
	}
	unittest.AssertReturnsBefore(t, wg.Wait, 10*time.Second, "expect to finish before timeout")

	// check that all instances have the same finalized block
	ref := instances[0]
	assert.Equal(t, finalView, ref.forks.FinalizedBlock().View, "expected instance 0 to have made enough progress, but it didn't")
	finalizedViews := FinalizedViews(ref)
	for i := 1; i < replicas; i++ {
		assert.Equal(t, ref.forks.FinalizedBlock(), instances[i].forks.FinalizedBlock(), "instance %d should have the same finalized block as the first instance", i)
		assert.Equal(t, finalizedViews, FinalizedViews(instances[i]), "instance %d should have the same finalized views as the first instance", i)
	}
}