github.com/hechain20/hechain@v0.0.0-20220316014945-b544036ba106/orderer/common/cluster/deliver_test.go

     1  /*
     2  Copyright hechain. 2018 All Rights Reserved.
     3  
     4  SPDX-License-Identifier: Apache-2.0
     5  */
     6  
     7  package cluster_test
     8  
     9  import (
    10  	"context"
    11  	"crypto/rand"
    12  	"encoding/hex"
    13  	"fmt"
    14  	"net"
    15  	"strconv"
    16  	"strings"
    17  	"sync"
    18  	"sync/atomic"
    19  	"testing"
    20  	"time"
    21  
    22  	"github.com/golang/protobuf/proto"
    23  	"github.com/hechain20/hechain/bccsp/factory"
    24  	"github.com/hechain20/hechain/common/flogging"
    25  	"github.com/hechain20/hechain/internal/pkg/comm"
    26  	"github.com/hechain20/hechain/internal/pkg/identity"
    27  	"github.com/hechain20/hechain/orderer/common/cluster"
    28  	"github.com/hechain20/hechain/orderer/common/cluster/mocks"
    29  	"github.com/hechain20/hechain/protoutil"
    30  	"github.com/hyperledger/fabric-protos-go/common"
    31  	"github.com/hyperledger/fabric-protos-go/orderer"
    32  	"github.com/onsi/gomega"
    33  	"github.com/pkg/errors"
    34  	"github.com/stretchr/testify/require"
    35  	"go.uber.org/zap"
    36  	"go.uber.org/zap/zapcore"
    37  	"google.golang.org/grpc"
    38  	"google.golang.org/grpc/balancer"
    39  	"google.golang.org/grpc/balancer/roundrobin"
    40  )
    41  
    42  // protects gRPC balancer registration
    43  var gRPCBalancerLock = sync.Mutex{}
    44  
    45  func init() {
    46  	factory.InitFactories(nil)
    47  }
    48  
    49  //go:generate counterfeiter -o mocks/signer_serializer.go --fake-name SignerSerializer . signerSerializer
    50  
    51  type signerSerializer interface {
    52  	identity.SignerSerializer
    53  }
    54  
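         // wrappedBalancer wraps a gRPC balancer and decrements its countingDialer's
         // connection count when the balancer is closed.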
    55  type wrappedBalancer struct {
    56  	balancer.Balancer
    57  	cd *countingDialer
    58  }
    59  
    60  func (wb *wrappedBalancer) Close() {
    61  	defer atomic.AddUint32(&wb.cd.connectionCount, ^uint32(0))
    62  	wb.Balancer.Close()
    63  }
    64  
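         // countingDialer is both a gRPC balancer builder (registered under a random
         // name) and a cluster dialer; it tracks how many balancers it has built
         // that are still open.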
    65  type countingDialer struct {
    66  	name            string
    67  	baseBuilder     balancer.Builder
    68  	connectionCount uint32
    69  }
    70  
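         // newCountingDialer wraps the round-robin balancer builder and registers it
         // under a random name.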
    71  func newCountingDialer() *countingDialer {
    72  	gRPCBalancerLock.Lock()
    73  	builder := balancer.Get(roundrobin.Name)
    74  	gRPCBalancerLock.Unlock()
    75  
    76  	buff := make([]byte, 16)
    77  	rand.Read(buff)
    78  	cb := &countingDialer{
    79  		name:        hex.EncodeToString(buff),
    80  		baseBuilder: builder,
    81  	}
    82  
    83  	gRPCBalancerLock.Lock()
    84  	balancer.Register(cb)
    85  	gRPCBalancerLock.Unlock()
    86  
    87  	return cb
    88  }
    89  
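         // Build increments the connection count and returns a wrappedBalancer that
         // decrements it again when closed.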
    90  func (d *countingDialer) Build(cc balancer.ClientConn, opts balancer.BuildOptions) balancer.Balancer {
    91  	defer atomic.AddUint32(&d.connectionCount, 1)
    92  	lb := d.baseBuilder.Build(cc, opts)
    93  	return &wrappedBalancer{
    94  		Balancer: lb,
    95  		cd:       d,
    96  	}
    97  }
    98  
    99  func (d *countingDialer) Name() string {
   100  	return d.name
   101  }
   102  
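         // assertAllConnectionsClosed polls until the connection count drops to zero
         // or the timeout expires, and then asserts that it is zero.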
   103  func (d *countingDialer) assertAllConnectionsClosed(t *testing.T) {
   104  	timeLimit := time.Now().Add(timeout)
   105  	for atomic.LoadUint32(&d.connectionCount) != uint32(0) && time.Now().Before(timeLimit) {
   106  		time.Sleep(time.Millisecond)
   107  	}
   108  	require.Equal(t, uint32(0), atomic.LoadUint32(&d.connectionCount))
   109  }
   110  
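         // Dial connects to the given endpoint through the counting balancer, so the
         // resulting connection is tracked by this dialer.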
   111  func (d *countingDialer) Dial(address cluster.EndpointCriteria) (*grpc.ClientConn, error) {
   112  	ctx, cancel := context.WithTimeout(context.Background(), time.Millisecond*100)
   113  	defer cancel()
   114  
   115  	gRPCBalancerLock.Lock()
   116  	balancer := grpc.WithBalancerName(d.name)
   117  	gRPCBalancerLock.Unlock()
   118  	return grpc.DialContext(ctx, address.Endpoint, grpc.WithBlock(), grpc.WithInsecure(), balancer)
   119  }
   120  
   121  func noopBlockVerifierf(_ []*common.Block, _ string) error {
   122  	return nil
   123  }
   124  
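         // readSeekEnvelope reads a single envelope from the deliver stream and
         // extracts the SeekInfo and channel ID from it.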
   125  func readSeekEnvelope(stream orderer.AtomicBroadcast_DeliverServer) (*orderer.SeekInfo, string, error) {
   126  	env, err := stream.Recv()
   127  	if err != nil {
   128  		return nil, "", err
   129  	}
   130  	payload, err := protoutil.UnmarshalPayload(env.Payload)
   131  	if err != nil {
   132  		return nil, "", err
   133  	}
   134  	seekInfo := &orderer.SeekInfo{}
   135  	if err = proto.Unmarshal(payload.Data, seekInfo); err != nil {
   136  		return nil, "", err
   137  	}
   138  	chdr := &common.ChannelHeader{}
   139  	if err = proto.Unmarshal(payload.Header.ChannelHeader, chdr); err != nil {
   140  		return nil, "", err
   141  	}
   142  	return seekInfo, chdr.ChannelId, nil
   143  }
   144  
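         // deliverServer is a mock ordering node: the test drives it by queueing
         // seek assertions and block responses into its channels.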
   145  type deliverServer struct {
   146  	logger *flogging.FabricLogger
   147  
   148  	t *testing.T
   149  	sync.Mutex
   150  	err            error
   151  	srv            *comm.GRPCServer
   152  	seekAssertions chan func(*orderer.SeekInfo, string)
   153  	blockResponses chan *orderer.DeliverResponse
   154  	done           chan struct{}
   155  }
   156  
   157  func (ds *deliverServer) endpointCriteria() cluster.EndpointCriteria {
   158  	return cluster.EndpointCriteria{Endpoint: ds.srv.Address()}
   159  }
   160  
   161  func (ds *deliverServer) isFaulty() bool {
   162  	ds.Lock()
   163  	defer ds.Unlock()
   164  	return ds.err != nil
   165  }
   166  
   167  func (*deliverServer) Broadcast(orderer.AtomicBroadcast_BroadcastServer) error {
   168  	panic("implement me")
   169  }
   170  
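         // Deliver validates the incoming seek request against the next queued seek
         // assertion, then answers it: a single response for a "newest" probe, or a
         // stream of responses for a "specified" seek.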
   171  func (ds *deliverServer) Deliver(stream orderer.AtomicBroadcast_DeliverServer) error {
   172  	ds.Lock()
   173  	err := ds.err
   174  	ds.Unlock()
   175  
   176  	if err != nil {
   177  		return err
   178  	}
   179  	seekInfo, channel, err := readSeekEnvelope(stream)
   180  	require.NoError(ds.t, err)
   181  
   182  	// FAB-16233 This is meant to mitigate timeouts when
   183  	// seekAssertions does not receive a value
   184  	timer := time.NewTimer(1 * time.Minute)
   185  	defer timer.Stop()
   186  
   187  	select {
   188  	case <-timer.C:
   189  		ds.t.Fatalf("timed out waiting for seek assertions to receive a value\n")
   190  	// Get the next seek assertion and ensure the next seek is of the expected type
   191  	case seekAssert := <-ds.seekAssertions:
   192  		ds.logger.Debugf("Received seekInfo: %+v", seekInfo)
   193  		seekAssert(seekInfo, channel)
   194  	case <-ds.done:
   195  		return nil
   196  	}
   197  
   198  	if seekInfo.GetStart().GetSpecified() != nil {
   199  		return ds.deliverBlocks(stream)
   200  	}
   201  	if seekInfo.GetStart().GetNewest() != nil {
   202  		select {
   203  		case resp := <-ds.blocks():
   204  			if resp == nil {
   205  				return nil
   206  			}
   207  			return stream.Send(resp)
   208  		case <-ds.done:
   209  		}
   210  	}
   211  	ds.t.Fatalf("expected either specified or newest seek but got %v\n", seekInfo.GetStart())
   212  	return nil // unreachable
   213  }
   214  
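         // deliverBlocks streams queued responses to the client until the test
         // enqueues a nil response or the server is stopped.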
   215  func (ds *deliverServer) deliverBlocks(stream orderer.AtomicBroadcast_DeliverServer) error {
   216  	for {
   217  		blockChan := ds.blocks()
   218  		var response *orderer.DeliverResponse
   219  		select {
   220  		case response = <-blockChan:
   221  		case <-ds.done:
   222  			return nil
   223  		}
   224  
   225  		// A nil response is a signal from the test to close the stream.
   226  		// This is needed to avoid reading from the block buffer, hence
    227  		// accidentally consuming a block that is slated to be pulled
   228  		// later in the test.
   229  		if response == nil {
   230  			return nil
   231  		}
   232  		if err := stream.Send(response); err != nil {
   233  			return err
   234  		}
   235  	}
   236  }
   237  
   238  func (ds *deliverServer) blocks() chan *orderer.DeliverResponse {
   239  	ds.Lock()
   240  	defer ds.Unlock()
   241  	blockChan := ds.blockResponses
   242  	return blockChan
   243  }
   244  
   245  func (ds *deliverServer) setBlocks(blocks chan *orderer.DeliverResponse) {
   246  	ds.Lock()
   247  	defer ds.Unlock()
   248  	ds.blockResponses = blocks
   249  }
   250  
   251  func (ds *deliverServer) port() int {
   252  	_, portStr, err := net.SplitHostPort(ds.srv.Address())
   253  	require.NoError(ds.t, err)
   254  
   255  	port, err := strconv.ParseInt(portStr, 10, 32)
   256  	require.NoError(ds.t, err)
   257  	return int(port)
   258  }
   259  
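         // resurrect restarts the gRPC server on the same port, carrying over any
         // responses that are still queued.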
   260  func (ds *deliverServer) resurrect() {
   261  	var err error
    262  	// Drain the responses channel into a fresh one
   263  	respChan := make(chan *orderer.DeliverResponse, 100)
   264  	for resp := range ds.blocks() {
   265  		respChan <- resp
   266  	}
   267  	ds.blockResponses = respChan
   268  	ds.done = make(chan struct{})
   269  	ds.srv.Stop()
   270  	// And re-create the gRPC server on that port
   271  	ds.srv, err = comm.NewGRPCServer(fmt.Sprintf("127.0.0.1:%d", ds.port()), comm.ServerConfig{})
   272  	require.NoError(ds.t, err)
   273  	orderer.RegisterAtomicBroadcastServer(ds.srv.Server(), ds)
   274  	go ds.srv.Start()
   275  }
   276  
   277  func (ds *deliverServer) stop() {
   278  	ds.srv.Stop()
   279  	close(ds.blocks())
   280  	close(ds.done)
   281  }
   282  
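         // enqueueResponse queues a deliver response carrying a block with the given
         // sequence number.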
   283  func (ds *deliverServer) enqueueResponse(seq uint64) {
   284  	select {
   285  	case ds.blocks() <- &orderer.DeliverResponse{
   286  		Type: &orderer.DeliverResponse_Block{Block: protoutil.NewBlock(seq, nil)},
   287  	}:
   288  	case <-ds.done:
   289  	}
   290  }
   291  
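         // addExpectProbeAssert queues an assertion that the next seek request probes
         // for the newest block.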
   292  func (ds *deliverServer) addExpectProbeAssert() {
   293  	select {
   294  	case ds.seekAssertions <- func(info *orderer.SeekInfo, _ string) {
   295  		require.NotNil(ds.t, info.GetStart().GetNewest())
   296  		require.Equal(ds.t, info.ErrorResponse, orderer.SeekInfo_BEST_EFFORT)
   297  	}:
   298  	case <-ds.done:
   299  	}
   300  }
   301  
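         // addExpectPullAssert queues an assertion that the next seek request asks
         // for the given block sequence.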
   302  func (ds *deliverServer) addExpectPullAssert(seq uint64) {
   303  	select {
   304  	case ds.seekAssertions <- func(info *orderer.SeekInfo, _ string) {
   305  		seekPosition := info.GetStart()
   306  		require.NotNil(ds.t, seekPosition)
   307  		seekSpecified := seekPosition.GetSpecified()
   308  		require.NotNil(ds.t, seekSpecified)
   309  		require.Equal(ds.t, seq, seekSpecified.Number)
   310  		require.Equal(ds.t, info.ErrorResponse, orderer.SeekInfo_BEST_EFFORT)
   311  	}:
   312  	case <-ds.done:
   313  	}
   314  }
   315  
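         // newClusterNode starts a deliverServer on an ephemeral local port.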
   316  func newClusterNode(t *testing.T) *deliverServer {
   317  	srv, err := comm.NewGRPCServer("127.0.0.1:0", comm.ServerConfig{})
   318  	require.NoError(t, err)
   319  	ds := &deliverServer{
   320  		logger:         flogging.MustGetLogger("test.debug"),
   321  		t:              t,
   322  		seekAssertions: make(chan func(*orderer.SeekInfo, string), 100),
   323  		blockResponses: make(chan *orderer.DeliverResponse, 100),
   324  		done:           make(chan struct{}),
   325  		srv:            srv,
   326  	}
   327  	orderer.RegisterAtomicBroadcastServer(srv.Server(), ds)
   328  	go srv.Start()
   329  	return ds
   330  }
   331  
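         // newBlockPuller builds a BlockPuller for "mychannel" against the given
         // orderer endpoints, with test-friendly defaults.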
   332  func newBlockPuller(dialer *countingDialer, orderers ...string) *cluster.BlockPuller {
   333  	return &cluster.BlockPuller{
   334  		Dialer:              dialer,
   335  		Channel:             "mychannel",
   336  		Signer:              &mocks.SignerSerializer{},
   337  		Endpoints:           endpointCriteriaFromEndpoints(orderers...),
   338  		FetchTimeout:        time.Second * 10,
   339  		MaxTotalBufferBytes: 1024 * 1024, // 1MB
   340  		RetryTimeout:        time.Millisecond * 10,
   341  		VerifyBlockSequence: noopBlockVerifierf,
   342  		Logger:              flogging.MustGetLogger("test"),
   343  		StopChannel:         make(chan struct{}),
   344  	}
   345  }
   346  
   347  func endpointCriteriaFromEndpoints(orderers ...string) []cluster.EndpointCriteria {
   348  	var res []cluster.EndpointCriteria
   349  	for _, orderer := range orderers {
   350  		res = append(res, cluster.EndpointCriteria{Endpoint: orderer})
   351  	}
   352  	return res
   353  }
   354  
   355  func TestBlockPullerBasicHappyPath(t *testing.T) {
   356  	// Scenario: Single ordering node,
   357  	// and the block puller pulls blocks 5 to 10.
   358  	osn := newClusterNode(t)
   359  	defer osn.stop()
   360  
   361  	dialer := newCountingDialer()
   362  	bp := newBlockPuller(dialer, osn.srv.Address())
   363  
   364  	// The first seek request asks for the latest block
   365  	osn.addExpectProbeAssert()
    366  	// The first response says that the last block sequence is 10
   367  	osn.enqueueResponse(10)
   368  	// The next seek request is for block 5
   369  	osn.addExpectPullAssert(5)
    370  	// The later responses are the blocks themselves
   371  	for i := 5; i <= 10; i++ {
   372  		osn.enqueueResponse(uint64(i))
   373  	}
   374  
   375  	for i := 5; i <= 10; i++ {
   376  		require.Equal(t, uint64(i), bp.PullBlock(uint64(i)).Header.Number)
   377  	}
   378  	require.Len(t, osn.blockResponses, 0)
   379  
   380  	bp.Close()
   381  	dialer.assertAllConnectionsClosed(t)
   382  }
   383  
   384  func TestBlockPullerDuplicate(t *testing.T) {
   385  	// Scenario: The address of the ordering node
   386  	// is found twice in the configuration, but this
   387  	// doesn't create a problem.
   388  	osn := newClusterNode(t)
   389  	defer osn.stop()
   390  
   391  	dialer := newCountingDialer()
   392  	// Add the address twice
   393  	bp := newBlockPuller(dialer, osn.srv.Address(), osn.srv.Address())
   394  
   395  	// The first seek request asks for the latest block (twice)
   396  	osn.addExpectProbeAssert()
   397  	osn.addExpectProbeAssert()
    398  	// The first response says that the last block sequence is 3
   399  	osn.enqueueResponse(3)
   400  	osn.enqueueResponse(3)
    401  	// The next seek request is for block 1, from only one of the (duplicate) connections
   402  	osn.addExpectPullAssert(1)
    403  	// The later responses are the blocks themselves
   404  	for i := 1; i <= 3; i++ {
   405  		osn.enqueueResponse(uint64(i))
   406  	}
   407  
   408  	for i := 1; i <= 3; i++ {
   409  		require.Equal(t, uint64(i), bp.PullBlock(uint64(i)).Header.Number)
   410  	}
   411  	require.Len(t, osn.blockResponses, 0)
   412  
   413  	bp.Close()
   414  	dialer.assertAllConnectionsClosed(t)
   415  }
   416  
   417  func TestBlockPullerHeavyBlocks(t *testing.T) {
   418  	// Scenario: Single ordering node,
   419  	// and the block puller pulls 50 blocks, each
   420  	// weighing 1K, but the buffer can only contain
   421  	// 10K, so it should pull the 50 blocks in chunks of 10,
   422  	// and verify 5 sequences at a time.
   423  
   424  	osn := newClusterNode(t)
   425  	defer osn.stop()
   426  	osn.addExpectProbeAssert()
   427  	osn.addExpectPullAssert(1)
   428  	osn.enqueueResponse(100) // The last sequence is 100
   429  
   430  	enqueueBlockBatch := func(start, end uint64) {
   431  		for seq := start; seq <= end; seq++ {
   432  			resp := &orderer.DeliverResponse{
   433  				Type: &orderer.DeliverResponse_Block{
   434  					Block: protoutil.NewBlock(seq, nil),
   435  				},
   436  			}
   437  			data := resp.GetBlock().Data.Data
   438  			resp.GetBlock().Data.Data = append(data, make([]byte, 1024))
   439  			osn.blockResponses <- resp
   440  		}
   441  	}
   442  
   443  	dialer := newCountingDialer()
   444  	bp := newBlockPuller(dialer, osn.srv.Address())
   445  	var gotBlockMessageCount int
   446  	bp.Logger = bp.Logger.WithOptions(zap.Hooks(func(entry zapcore.Entry) error {
   447  		if strings.Contains(entry.Message, "Got block") {
   448  			gotBlockMessageCount++
   449  		}
   450  		return nil
   451  	}))
   452  	bp.MaxTotalBufferBytes = 1024 * 10 // 10K
   453  
   454  	// Enqueue only the next batch in the orderer node.
   455  	// This ensures that only 10 blocks are fetched into the buffer
   456  	// and not more.
   457  	for i := uint64(0); i < 5; i++ {
   458  		enqueueBlockBatch(i*10+uint64(1), i*10+uint64(10))
   459  		for seq := i*10 + uint64(1); seq <= i*10+uint64(10); seq++ {
   460  			require.Equal(t, seq, bp.PullBlock(seq).Header.Number)
   461  		}
   462  	}
   463  
   464  	require.Equal(t, 50, gotBlockMessageCount)
   465  	bp.Close()
   466  	dialer.assertAllConnectionsClosed(t)
   467  }
   468  
   469  func TestBlockPullerClone(t *testing.T) {
   470  	// Scenario: We have a block puller that is connected
   471  	// to an ordering node, and we clone it.
   472  	// We expect that the new block puller is a clean slate
   473  	// and doesn't share any internal state with its origin.
   474  	osn1 := newClusterNode(t)
   475  	defer osn1.stop()
   476  
   477  	osn1.addExpectProbeAssert()
   478  	osn1.addExpectPullAssert(1)
   479  	// last block sequence is 100
   480  	osn1.enqueueResponse(100)
   481  	osn1.enqueueResponse(1)
   482  	// The block puller is expected to disconnect after pulling
   483  	// a single block. So signal the server-side to avoid
   484  	// grabbing the next block after block 1 is pulled.
   485  	osn1.blockResponses <- nil
   486  
   487  	dialer := newCountingDialer()
   488  	bp := newBlockPuller(dialer, osn1.srv.Address())
   489  	bp.FetchTimeout = time.Millisecond * 100
   490  	// Pull a block at a time and don't buffer them
   491  	bp.MaxTotalBufferBytes = 1
   492  	// Clone the block puller
   493  	bpClone := bp.Clone()
   494  	// and override its channel
   495  	bpClone.Channel = "foo"
    496  	// Ensure the channel change isn't reflected in the original puller
   497  	require.Equal(t, "mychannel", bp.Channel)
   498  
   499  	block := bp.PullBlock(1)
   500  	require.Equal(t, uint64(1), block.Header.Number)
   501  
   502  	// After the original block puller is closed, the
   503  	// clone should not be affected
   504  	bp.Close()
   505  	dialer.assertAllConnectionsClosed(t)
   506  
   507  	// The clone block puller should not have cached the internal state
   508  	// from its origin block puller, thus it should probe again for the last block
   509  	// sequence as if it is a new block puller.
   510  	osn1.addExpectProbeAssert()
   511  	osn1.addExpectPullAssert(2)
   512  	osn1.enqueueResponse(100)
   513  	osn1.enqueueResponse(2)
   514  
   515  	block = bpClone.PullBlock(2)
   516  	require.Equal(t, uint64(2), block.Header.Number)
   517  
   518  	bpClone.Close()
   519  	dialer.assertAllConnectionsClosed(t)
   520  }
   521  
   522  func TestBlockPullerHeightsByEndpoints(t *testing.T) {
   523  	// Scenario: We ask for the latest block from all the known ordering nodes.
   524  	// One ordering node is unavailable (offline).
   525  	// One ordering node doesn't have blocks for that channel.
   526  	// The remaining node returns the latest block.
   527  	osn1 := newClusterNode(t)
   528  
   529  	osn2 := newClusterNode(t)
   530  	defer osn2.stop()
   531  
   532  	osn3 := newClusterNode(t)
   533  	defer osn3.stop()
   534  
   535  	dialer := newCountingDialer()
   536  	bp := newBlockPuller(dialer, osn1.srv.Address(), osn2.srv.Address(), osn3.srv.Address())
   537  
   538  	// The first seek request asks for the latest block from some ordering node
   539  	osn1.addExpectProbeAssert()
   540  	osn2.addExpectProbeAssert()
   541  	osn3.addExpectProbeAssert()
   542  
   543  	// The first ordering node is offline
   544  	osn1.stop()
   545  	// The second returns a bad answer
   546  	osn2.blockResponses <- &orderer.DeliverResponse{
   547  		Type: &orderer.DeliverResponse_Status{Status: common.Status_FORBIDDEN},
   548  	}
   549  	// The third returns the latest block
   550  	osn3.enqueueResponse(5)
   551  
   552  	res, err := bp.HeightsByEndpoints()
   553  	require.NoError(t, err)
   554  	expected := map[string]uint64{
   555  		osn3.srv.Address(): 6,
   556  	}
   557  	require.Equal(t, expected, res)
   558  
   559  	bp.Close()
   560  	dialer.assertAllConnectionsClosed(t)
   561  }
   562  
   563  func TestBlockPullerMultipleOrderers(t *testing.T) {
   564  	// Scenario: 3 ordering nodes,
   565  	// and the block puller pulls blocks 3 to 5 from some
   566  	// orderer node.
   567  	// We ensure that exactly a single orderer node sent blocks.
   568  
   569  	osn1 := newClusterNode(t)
   570  	defer osn1.stop()
   571  
   572  	osn2 := newClusterNode(t)
   573  	defer osn2.stop()
   574  
   575  	osn3 := newClusterNode(t)
   576  	defer osn3.stop()
   577  
   578  	dialer := newCountingDialer()
   579  	bp := newBlockPuller(dialer, osn1.srv.Address(), osn2.srv.Address(), osn3.srv.Address())
   580  
   581  	// The first seek request asks for the latest block from some ordering node
   582  	osn1.addExpectProbeAssert()
   583  	osn2.addExpectProbeAssert()
   584  	osn3.addExpectProbeAssert()
    585  	// The first response says that the last block sequence is 5
   586  	osn1.enqueueResponse(5)
   587  	osn2.enqueueResponse(5)
   588  	osn3.enqueueResponse(5)
   589  
   590  	// The next seek request is for block 3
   591  	osn1.addExpectPullAssert(3)
   592  	osn2.addExpectPullAssert(3)
   593  	osn3.addExpectPullAssert(3)
   594  
    595  	// The later responses are the blocks themselves
   596  	for i := 3; i <= 5; i++ {
   597  		osn1.enqueueResponse(uint64(i))
   598  		osn2.enqueueResponse(uint64(i))
   599  		osn3.enqueueResponse(uint64(i))
   600  	}
   601  
   602  	initialTotalBlockAmount := len(osn1.blockResponses) + len(osn2.blockResponses) + len(osn3.blockResponses)
   603  
   604  	for i := 3; i <= 5; i++ {
   605  		require.Equal(t, uint64(i), bp.PullBlock(uint64(i)).Header.Number)
   606  	}
   607  
    608  	// Assert that the cumulative number of blocks in the OSNs went down by 6:
    609  	// blocks 3, 4, 5 were pulled - that's 3 blocks,
    610  	// and the probe phase consumed the block with sequence 5 from each of the 3 OSNs.
   611  	finalTotalBlockAmount := len(osn1.blockResponses) + len(osn2.blockResponses) + len(osn3.blockResponses)
   612  	require.Equal(t, initialTotalBlockAmount-6, finalTotalBlockAmount)
   613  
   614  	bp.Close()
   615  	dialer.assertAllConnectionsClosed(t)
   616  }
   617  
   618  func TestBlockPullerFailover(t *testing.T) {
   619  	// Scenario:
   620  	// The block puller is expected to pull blocks 1 to 3.
    621  	// There are two ordering nodes, but at first only node 1 is available.
    622  	// The block puller first connects to it, but as it pulls the first block,
    623  	// node 1 crashes.
    624  	// Node 2 is then resurrected and the block puller is expected to
    625  	// connect to it and pull the rest of the blocks.
   626  
   627  	osn1 := newClusterNode(t)
   628  	osn1.addExpectProbeAssert()
   629  	osn1.addExpectPullAssert(1)
   630  	osn1.enqueueResponse(3)
   631  	osn1.enqueueResponse(1)
   632  
   633  	osn2 := newClusterNode(t)
   634  	defer osn2.stop()
   635  
   636  	osn2.addExpectProbeAssert()
   637  	osn2.addExpectPullAssert(1)
   638  	// First response is for the probe
   639  	osn2.enqueueResponse(3)
   640  	// Next three responses are for the pulling.
   641  	osn2.enqueueResponse(1)
   642  	osn2.enqueueResponse(2)
   643  	osn2.enqueueResponse(3)
   644  
   645  	// First we stop node 2 to make sure the block puller can't connect to it when it is created
   646  	osn2.stop()
   647  
   648  	dialer := newCountingDialer()
   649  	bp := newBlockPuller(dialer, osn1.srv.Address(), osn2.srv.Address())
   650  
   651  	// We don't want to rely on fetch timeout, but on pure failover logic,
   652  	// so make the fetch timeout huge
   653  	bp.FetchTimeout = time.Hour
   654  
    655  	// Configure the block puller logger to close the pulledBlock1Done channel once the
    656  	// block puller receives the first block.
   657  	pulledBlock1Done := make(chan struct{})
   658  	var once sync.Once
   659  	bp.Logger = bp.Logger.WithOptions(zap.Hooks(func(entry zapcore.Entry) error {
   660  		if strings.Contains(entry.Message, "Got block [1] of size") {
   661  			once.Do(func() { close(pulledBlock1Done) })
   662  		}
   663  		return nil
   664  	}))
   665  
   666  	go func() {
   667  		// Wait for the block puller to pull the first block
   668  		select {
   669  		case <-pulledBlock1Done:
   670  		case <-time.After(10 * time.Second):
   671  			t.Errorf("message was not logged before timeout")
   672  			return
   673  		}
   674  		// and now crash node1 and resurrect node 2
   675  		osn1.stop()
   676  		osn2.resurrect()
   677  	}()
   678  
   679  	// Assert reception of blocks 1 to 3
   680  	require.Equal(t, uint64(1), bp.PullBlock(uint64(1)).Header.Number)
   681  	require.Equal(t, uint64(2), bp.PullBlock(uint64(2)).Header.Number)
   682  	require.Equal(t, uint64(3), bp.PullBlock(uint64(3)).Header.Number)
   683  
   684  	bp.Close()
   685  	dialer.assertAllConnectionsClosed(t)
   686  }
   687  
   688  func TestBlockPullerNoneResponsiveOrderer(t *testing.T) {
   689  	// Scenario: There are two ordering nodes, and the block puller
   690  	// connects to one of them.
    691  	// It fetches the first block, but then the second block isn't sent
    692  	// for too long, so the block puller should abort and try the other
    693  	// node.
   694  
   695  	osn1 := newClusterNode(t)
   696  	defer osn1.stop()
   697  
   698  	osn2 := newClusterNode(t)
   699  	defer osn2.stop()
   700  
   701  	osn1.addExpectProbeAssert()
   702  	osn2.addExpectProbeAssert()
   703  	osn1.enqueueResponse(3)
   704  	osn2.enqueueResponse(3)
   705  
   706  	dialer := newCountingDialer()
   707  	bp := newBlockPuller(dialer, osn1.srv.Address(), osn2.srv.Address())
   708  	bp.FetchTimeout = time.Millisecond * 100
   709  
   710  	notInUseOrdererNode := osn2
    711  	// Configure the logger to tell us which orderer node the block puller
    712  	// isn't connected to. This is done by intercepting the appropriate log message
   713  	waitForConnectionDone := make(chan struct{})
   714  	var once sync.Once
   715  	bp.Logger = bp.Logger.WithOptions(zap.Hooks(func(entry zapcore.Entry) error {
   716  		if !strings.Contains(entry.Message, "Sending request for block [1]") {
   717  			return nil
   718  		}
   719  		defer once.Do(func() { close(waitForConnectionDone) })
   720  		s := entry.Message[len("Sending request for block [1] to 127.0.0.1:"):]
   721  		port, err := strconv.ParseInt(s, 10, 32)
   722  		require.NoError(t, err)
   723  		// If osn2 is the current orderer we're connected to,
    724  		// then the orderer we're not connected to is osn1
   725  		if osn2.port() == int(port) {
   726  			notInUseOrdererNode = osn1
   727  			// Enqueue block 1 to the current orderer the block puller
   728  			// is connected to
   729  			osn2.enqueueResponse(1)
   730  			osn2.addExpectPullAssert(1)
   731  		} else {
   732  			// We're connected to osn1, so enqueue block 1
   733  			osn1.enqueueResponse(1)
   734  			osn1.addExpectPullAssert(1)
   735  		}
   736  		return nil
   737  	}))
   738  
   739  	go func() {
   740  		select {
   741  		case <-waitForConnectionDone:
   742  		case <-time.After(10 * time.Second):
   743  			t.Errorf("message was not logged before timeout")
   744  			return
   745  		}
   746  
    747  		// Enqueue the height in the orderer node we're not connected to
   748  		notInUseOrdererNode.enqueueResponse(3)
   749  		notInUseOrdererNode.addExpectProbeAssert()
   750  		// Enqueue blocks 1, 2, 3 to the orderer node we're not connected to.
   751  		notInUseOrdererNode.addExpectPullAssert(1)
   752  		notInUseOrdererNode.enqueueResponse(1)
   753  		notInUseOrdererNode.enqueueResponse(2)
   754  		notInUseOrdererNode.enqueueResponse(3)
   755  	}()
   756  
   757  	// Assert reception of blocks 1 to 3
   758  	require.Equal(t, uint64(1), bp.PullBlock(uint64(1)).Header.Number)
   759  	require.Equal(t, uint64(2), bp.PullBlock(uint64(2)).Header.Number)
   760  	require.Equal(t, uint64(3), bp.PullBlock(uint64(3)).Header.Number)
   761  
   762  	bp.Close()
   763  	dialer.assertAllConnectionsClosed(t)
   764  }
   765  
   766  func TestBlockPullerNoOrdererAliveAtStartup(t *testing.T) {
   767  	// Scenario: Single ordering node, and when the block puller
   768  	// starts up - the orderer is nowhere to be found.
   769  	osn := newClusterNode(t)
   770  	osn.stop()
   771  	defer osn.stop()
   772  
   773  	dialer := newCountingDialer()
   774  	bp := newBlockPuller(dialer, osn.srv.Address())
   775  
    776  	// Configure the logger to wait until a failed connection attempt was made
   777  	connectionAttemptDone := make(chan struct{})
   778  	var closeOnce sync.Once
   779  	bp.Logger = bp.Logger.WithOptions(zap.Hooks(func(entry zapcore.Entry) error {
   780  		if strings.Contains(entry.Message, "Failed connecting to") {
   781  			closeOnce.Do(func() { close(connectionAttemptDone) })
   782  		}
   783  		return nil
   784  	}))
   785  
   786  	go func() {
   787  		select {
   788  		case <-connectionAttemptDone:
   789  		case <-time.After(10 * time.Second):
   790  			t.Errorf("message was not logged before timeout")
   791  			return
   792  		}
   793  
   794  		osn.resurrect()
   795  		// The first seek request asks for the latest block
   796  		osn.addExpectProbeAssert()
   797  		// The first response says that the last sequence is 2
   798  		osn.enqueueResponse(2)
   799  		// The next seek request is for block 1
   800  		osn.addExpectPullAssert(1)
    801  		// And the orderer returns blocks 1 and 2
   802  		osn.enqueueResponse(1)
   803  		osn.enqueueResponse(2)
   804  	}()
   805  
   806  	require.Equal(t, uint64(1), bp.PullBlock(1).Header.Number)
   807  	require.Equal(t, uint64(2), bp.PullBlock(2).Header.Number)
   808  
   809  	bp.Close()
   810  	dialer.assertAllConnectionsClosed(t)
   811  }
   812  
   813  func TestBlockPullerFailures(t *testing.T) {
   814  	// This test case is flaky, so let's add some logs for the next time it fails.
   815  	flogging.ActivateSpec("debug")
   816  	defer flogging.ActivateSpec("info")
   817  	testLogger := flogging.MustGetLogger("test.debug")
   818  
   819  	// Scenario: Single ordering node is faulty, but later
   820  	// on it recovers.
   821  	failureError := errors.New("oops, something went wrong")
   822  	failStream := func(osn *deliverServer, _ *cluster.BlockPuller) {
   823  		testLogger.Info("failStream")
   824  		osn.Lock()
   825  		osn.err = failureError
   826  		osn.Unlock()
   827  	}
   828  
   829  	badSigErr := errors.New("bad signature")
   830  	malformBlockSignatureAndRecreateOSNBuffer := func(osn *deliverServer, bp *cluster.BlockPuller) {
   831  		testLogger.Info("FailFunc of: failure at verifying pulled block")
   832  		bp.VerifyBlockSequence = func(_ []*common.Block, _ string) error {
   833  			close(osn.blocks())
   834  			// After failing once, recover and remove the bad signature error.
   835  			defer func() {
   836  				// Skip recovery if we already recovered.
   837  				if badSigErr == nil {
   838  					return
   839  				}
   840  				badSigErr = nil
   841  				osn.setBlocks(make(chan *orderer.DeliverResponse, 100))
   842  				osn.enqueueResponse(3)
   843  				osn.enqueueResponse(1)
   844  				osn.enqueueResponse(2)
   845  				osn.enqueueResponse(3)
   846  			}()
   847  			return badSigErr
   848  		}
   849  	}
   850  
   851  	noFailFunc := func(_ *deliverServer, _ *cluster.BlockPuller) {}
   852  
   853  	recover := func(osn *deliverServer, bp *cluster.BlockPuller) func(entry zapcore.Entry) error {
   854  		return func(entry zapcore.Entry) error {
   855  			if osn.isFaulty() && strings.Contains(entry.Message, failureError.Error()) {
   856  				testLogger.Info("recover osn.err")
   857  				osn.Lock()
   858  				osn.err = nil
   859  				osn.Unlock()
   860  			}
   861  			if strings.Contains(entry.Message, "Failed verifying") {
   862  				testLogger.Info("recover verifier")
   863  				bp.VerifyBlockSequence = noopBlockVerifierf
   864  			}
   865  			return nil
   866  		}
   867  	}
   868  
   869  	failAfterConnection := func(osn *deliverServer, logTrigger string, failFunc func()) func(entry zapcore.Entry) error {
   870  		once := &sync.Once{}
   871  		return func(entry zapcore.Entry) error {
   872  			if !osn.isFaulty() && strings.Contains(entry.Message, logTrigger) {
   873  				once.Do(func() {
   874  					failFunc()
   875  				})
   876  			}
   877  			return nil
   878  		}
   879  	}
   880  
   881  	for _, testCase := range []struct {
   882  		name       string
   883  		logTrigger string
   884  		beforeFunc func(*deliverServer, *cluster.BlockPuller)
   885  		failFunc   func(*deliverServer, *cluster.BlockPuller)
   886  	}{
   887  		{
   888  			name:       "failure at probe",
   889  			logTrigger: "skip this for this test case",
   890  			beforeFunc: func(osn *deliverServer, bp *cluster.BlockPuller) {
   891  				failStream(osn, nil)
   892  				// The first seek request asks for the latest block but it fails
   893  				osn.addExpectProbeAssert()
   894  				// And the last block sequence is returned
   895  				osn.enqueueResponse(3)
   896  				// The next seek request is for block 1
   897  				osn.addExpectPullAssert(1)
   898  			},
   899  			failFunc: noFailFunc,
   900  		},
   901  		{
   902  			name:       "failure at pull",
   903  			logTrigger: "Sending request for block [1]",
   904  			beforeFunc: func(osn *deliverServer, bp *cluster.BlockPuller) {
   905  				// The first seek request asks for the latest block and succeeds
   906  				osn.addExpectProbeAssert()
   907  				// But as the block puller tries to pull, the stream fails, so it should reconnect.
   908  				osn.addExpectProbeAssert()
   909  				// We need to send the latest sequence twice because the stream fails after the first time.
   910  				osn.enqueueResponse(3)
   911  				osn.enqueueResponse(3)
   912  				osn.addExpectPullAssert(1)
   913  			},
   914  			failFunc: failStream,
   915  		},
   916  		{
   917  			name:       "failure at verifying pulled block",
   918  			logTrigger: "Sending request for block [1]",
   919  			beforeFunc: func(osn *deliverServer, bp *cluster.BlockPuller) {
   920  				testLogger.Infof("BeforeFunc of: %s", "failure at verifying pulled block")
   921  				// The first seek request asks for the latest block and succeeds
   922  				osn.addExpectProbeAssert()
   923  				osn.enqueueResponse(3)
   924  				// It then pulls all 3 blocks, but fails verifying them.
   925  				osn.addExpectPullAssert(1)
   926  				osn.enqueueResponse(1)
   927  				osn.enqueueResponse(2)
   928  				osn.enqueueResponse(3)
   929  				// So it probes again, and then requests block 1 again.
   930  				osn.addExpectProbeAssert()
   931  				osn.addExpectPullAssert(1)
   932  			},
   933  			failFunc: malformBlockSignatureAndRecreateOSNBuffer,
   934  		},
   935  	} {
   936  		testCase := testCase
   937  		t.Run(testCase.name, func(t *testing.T) {
   938  			testLogger.Infof("Starting test case: %s", testCase.name)
   939  			osn := newClusterNode(t)
   940  			defer osn.stop()
   941  
   942  			dialer := newCountingDialer()
   943  			bp := newBlockPuller(dialer, osn.srv.Address())
   944  
   945  			testCase.beforeFunc(osn, bp)
   946  
   947  			// Configure the logger to trigger failure at the appropriate time.
   948  			fail := func() {
   949  				testCase.failFunc(osn, bp)
   950  			}
   951  			bp.Logger = bp.Logger.WithOptions(zap.Hooks(recover(osn, bp), failAfterConnection(osn, testCase.logTrigger, fail)))
   952  
   953  			// The orderer sends blocks 1 to 3
   954  			osn.enqueueResponse(1)
   955  			osn.enqueueResponse(2)
   956  			osn.enqueueResponse(3)
   957  
   958  			require.Equal(t, uint64(1), bp.PullBlock(uint64(1)).Header.Number)
   959  			require.Equal(t, uint64(2), bp.PullBlock(uint64(2)).Header.Number)
   960  			require.Equal(t, uint64(3), bp.PullBlock(uint64(3)).Header.Number)
   961  
   962  			bp.Close()
   963  			dialer.assertAllConnectionsClosed(t)
   964  		})
   965  	}
   966  }
   967  
   968  func TestBlockPullerBadBlocks(t *testing.T) {
   969  	// Scenario: ordering node sends malformed blocks.
   970  
   971  	// This test case is flaky, so let's add some logs for the next time it fails.
   972  	flogging.ActivateSpec("debug")
   973  	defer flogging.ActivateSpec("info")
   974  	testLogger := flogging.MustGetLogger("test.debug")
   975  
   976  	removeHeader := func(resp *orderer.DeliverResponse) *orderer.DeliverResponse {
   977  		resp.GetBlock().Header = nil
   978  		return resp
   979  	}
   980  
   981  	removeData := func(resp *orderer.DeliverResponse) *orderer.DeliverResponse {
   982  		resp.GetBlock().Data = nil
   983  		return resp
   984  	}
   985  
   986  	removeMetadata := func(resp *orderer.DeliverResponse) *orderer.DeliverResponse {
   987  		resp.GetBlock().Metadata = nil
   988  		return resp
   989  	}
   990  
   991  	changeType := func(resp *orderer.DeliverResponse) *orderer.DeliverResponse {
   992  		resp.Type = nil
   993  		return resp
   994  	}
   995  
   996  	statusType := func(resp *orderer.DeliverResponse) *orderer.DeliverResponse {
   997  		resp.Type = &orderer.DeliverResponse_Status{
   998  			Status: common.Status_INTERNAL_SERVER_ERROR,
   999  		}
  1000  		return resp
  1001  	}
  1002  
  1003  	changeSequence := func(resp *orderer.DeliverResponse) *orderer.DeliverResponse {
  1004  		resp.GetBlock().Header.Number = 3
  1005  		return resp
  1006  	}
  1007  
  1008  	testcases := []struct {
  1009  		name           string
  1010  		corruptBlock   func(block *orderer.DeliverResponse) *orderer.DeliverResponse
  1011  		expectedErrMsg string
  1012  	}{
  1013  		{
  1014  			name:           "nil header",
  1015  			corruptBlock:   removeHeader,
  1016  			expectedErrMsg: "block header is nil",
  1017  		},
  1018  		{
  1019  			name:           "nil data",
  1020  			corruptBlock:   removeData,
  1021  			expectedErrMsg: "block data is nil",
  1022  		},
  1023  		{
  1024  			name:           "nil metadata",
  1025  			corruptBlock:   removeMetadata,
  1026  			expectedErrMsg: "block metadata is empty",
  1027  		},
  1028  		{
  1029  			name:           "wrong type",
  1030  			corruptBlock:   changeType,
  1031  			expectedErrMsg: "response is of type <nil>, but expected a block",
  1032  		},
  1033  		{
  1034  			name:           "bad type",
  1035  			corruptBlock:   statusType,
  1036  			expectedErrMsg: "faulty node, received: status:INTERNAL_SERVER_ERROR ",
  1037  		},
  1038  		{
  1039  			name:           "wrong number",
  1040  			corruptBlock:   changeSequence,
  1041  			expectedErrMsg: "got unexpected sequence",
  1042  		},
  1043  	}
  1044  
  1045  	for _, testCase := range testcases {
  1046  		testCase := testCase
  1047  		t.Run(testCase.name, func(t *testing.T) {
  1048  			osn := newClusterNode(t)
  1049  			defer osn.stop()
  1050  
  1051  			osn.addExpectProbeAssert()
  1052  			osn.addExpectPullAssert(10)
  1053  
  1054  			dialer := newCountingDialer()
  1055  			bp := newBlockPuller(dialer, osn.srv.Address())
  1056  
  1057  			osn.enqueueResponse(10)
  1058  			osn.enqueueResponse(10)
   1059  			// Fetch the first response from the queue
  1060  			block := <-osn.blockResponses
   1061  			// Corrupt it and re-insert it at the tail of the queue, so the remaining
   1062  			// clean response serves the probe and the corrupted one serves the pull
  1063  			osn.blockResponses <- testCase.corruptBlock(block)
  1064  			// [ 10  10* ]
  1065  			// Expect the block puller to retry and this time give it what it wants
  1066  			osn.addExpectProbeAssert()
  1067  			osn.addExpectPullAssert(10)
  1068  			require.Len(t, osn.seekAssertions, 4)
  1069  
  1070  			// Wait until the block is pulled and the malleability is detected
  1071  			detectedBadBlockDone := make(chan struct{})
  1072  			var closeOnce sync.Once
  1073  			bp.Logger = bp.Logger.WithOptions(zap.Hooks(func(entry zapcore.Entry) error {
  1074  				if strings.Contains(entry.Message, fmt.Sprintf("Failed pulling blocks: %s", testCase.expectedErrMsg)) {
  1075  					closeOnce.Do(func() { close(detectedBadBlockDone) })
  1076  
  1077  					// Close the channel to make the current server-side deliver stream close
  1078  					close(osn.blocks())
  1079  					testLogger.Infof("Expected err log has been printed: %s\n", testCase.expectedErrMsg)
  1080  					// And reset the block buffer to be able to write into it again
  1081  					osn.setBlocks(make(chan *orderer.DeliverResponse, 100))
  1082  					// Put a correct block after it, 1 for the probing and 1 for the fetch
  1083  					osn.enqueueResponse(10)
  1084  					osn.enqueueResponse(10)
  1085  				}
  1086  				return nil
  1087  			}))
  1088  
  1089  			endPullBlock := make(chan struct{})
  1090  			go func() {
  1091  				bp.PullBlock(10)
  1092  				close(endPullBlock)
  1093  			}()
  1094  
  1095  			select {
  1096  			case <-detectedBadBlockDone:
  1097  			case <-time.After(10 * time.Second):
  1098  				t.Fatalf("expected %q to be logged but it was not seen", testCase.expectedErrMsg)
  1099  			}
  1100  
  1101  			select {
  1102  			case <-endPullBlock:
  1103  			case <-time.After(10 * time.Second):
   1104  				// Signal PullBlock to give up on the retries & mark the test as failed
  1105  				close(bp.StopChannel)
  1106  				t.Fatalf("PullBlock did not complete within time")
  1107  			}
  1108  
  1109  			bp.Close()
  1110  			dialer.assertAllConnectionsClosed(t)
  1111  		})
  1112  	}
  1113  }
  1114  
  1115  func TestImpatientStreamFailure(t *testing.T) {
  1116  	osn := newClusterNode(t)
  1117  	dialer := newCountingDialer()
  1118  	defer dialer.assertAllConnectionsClosed(t)
  1119  	// Wait for OSN to start
  1120  	// by trying to connect to it
  1121  	var conn *grpc.ClientConn
  1122  	var err error
  1123  
  1124  	gt := gomega.NewGomegaWithT(t)
  1125  	gt.Eventually(func() (bool, error) {
  1126  		conn, err = dialer.Dial(osn.endpointCriteria())
  1127  		return true, err
  1128  	}).Should(gomega.BeTrue())
  1129  	newStream := cluster.NewImpatientStream(conn, time.Millisecond*100)
  1130  	defer conn.Close()
  1131  	// Shutdown the OSN
  1132  	osn.stop()
  1133  	// Ensure the port isn't open anymore
  1134  	gt.Eventually(func() (bool, error) {
  1135  		ctx, cancel := context.WithTimeout(context.Background(), time.Millisecond*10)
  1136  		defer cancel()
  1137  		conn, _ := grpc.DialContext(ctx, osn.srv.Address(), grpc.WithBlock(), grpc.WithInsecure())
  1138  		if conn != nil {
  1139  			conn.Close()
  1140  			return false, nil
  1141  		}
  1142  		return true, nil
  1143  	}).Should(gomega.BeTrue())
  1144  	stream, err := newStream()
  1145  	if err != nil {
  1146  		return
  1147  	}
  1148  	_, err = stream.Recv()
  1149  	require.Error(t, err)
  1150  }
  1151  
  1152  func TestBlockPullerMaxRetriesExhausted(t *testing.T) {
  1153  	// Scenario:
  1154  	// The block puller is expected to pull blocks 1 to 3.
   1155  	// But the orderer only has blocks 1 and 2, and for some reason
  1156  	// it sends back block 2 twice (we do this so that we
  1157  	// don't rely on timeout, because timeouts are flaky in tests).
  1158  	// It should attempt to re-connect and to send requests
  1159  	// until the attempt number is exhausted, after which
  1160  	// it gives up, and nil is returned.
  1161  
  1162  	osn := newClusterNode(t)
  1163  	defer osn.stop()
  1164  
  1165  	// We report having up to block 3.
  1166  	osn.enqueueResponse(3)
  1167  	osn.addExpectProbeAssert()
   1168  	// We send block 1
  1169  	osn.addExpectPullAssert(1)
  1170  	osn.enqueueResponse(1)
  1171  	// And 2, twice.
  1172  	osn.enqueueResponse(2)
  1173  	osn.enqueueResponse(2)
   1174  	// A nil message signals that the deliver stream should close.
  1175  	// This is to signal the server side to prepare for a new deliver
  1176  	// stream that the client should open.
  1177  	osn.blockResponses <- nil
  1178  
  1179  	for i := 0; i < 2; i++ {
  1180  		// Therefore, the block puller should disconnect and reconnect.
  1181  		osn.addExpectProbeAssert()
  1182  		// We report having up to block 3.
  1183  		osn.enqueueResponse(3)
  1184  		// And we expect to be asked for block 3, since blocks 1, 2
  1185  		// have already been passed to the caller.
  1186  		osn.addExpectPullAssert(3)
  1187  		// Once again, we send 2 instead of 3
  1188  		osn.enqueueResponse(2)
  1189  		// The client disconnects again
  1190  		osn.blockResponses <- nil
  1191  	}
  1192  
  1193  	dialer := newCountingDialer()
  1194  	bp := newBlockPuller(dialer, osn.srv.Address())
  1195  
  1196  	var exhaustedRetryAttemptsLogged bool
  1197  
  1198  	bp.Logger = bp.Logger.WithOptions(zap.Hooks(func(entry zapcore.Entry) error {
  1199  		if entry.Message == "Failed pulling block [3]: retry count exhausted(2)" {
  1200  			exhaustedRetryAttemptsLogged = true
  1201  		}
  1202  		return nil
  1203  	}))
  1204  
  1205  	bp.MaxPullBlockRetries = 2
  1206  	// We don't expect to timeout in this test, so make the timeout large
  1207  	// to prevent flakes due to CPU starvation.
  1208  	bp.FetchTimeout = time.Hour
   1209  	// Make the buffer tiny, only a single byte - in order to deliver blocks
  1210  	// to the caller one by one and not store them in the buffer.
  1211  	bp.MaxTotalBufferBytes = 1
  1212  
  1213  	// Assert reception of blocks 1 to 3
  1214  	require.Equal(t, uint64(1), bp.PullBlock(uint64(1)).Header.Number)
  1215  	require.Equal(t, uint64(2), bp.PullBlock(uint64(2)).Header.Number)
  1216  	require.Nil(t, bp.PullBlock(uint64(3)))
  1217  
  1218  	bp.Close()
  1219  	dialer.assertAllConnectionsClosed(t)
  1220  	require.True(t, exhaustedRetryAttemptsLogged)
  1221  }
  1222  
  1223  func TestBlockPullerToBadEndpointWithStop(t *testing.T) {
  1224  	// Scenario:
  1225  	// The block puller is initialized with endpoints that do not exist.
  1226  	// It should attempt to re-connect but will fail, causing the PullBlock()
  1227  	// to block the calling go-routine. Closing the StopChannel should signal
   1228  	// the blocked go-routine to give up, and nil is returned.
  1229  
  1230  	dialer := newCountingDialer()
  1231  	bp := newBlockPuller(dialer, "10.10.10.10:666") // Nobody is there
  1232  
  1233  	var couldNotConnectLogged bool
  1234  	var receivedStopLogged bool
  1235  
  1236  	bp.Logger = bp.Logger.WithOptions(zap.Hooks(func(entry zapcore.Entry) error {
  1237  		if entry.Message == "Could not connect to any endpoint of [{\"CAs\":null,\"Endpoint\":\"10.10.10.10:666\"}]" {
  1238  			couldNotConnectLogged = true
  1239  		}
  1240  		if entry.Message == "Received a stop signal" {
  1241  			receivedStopLogged = true
  1242  		}
  1243  		return nil
  1244  	}))
  1245  
  1246  	// Retry for a long time
  1247  	bp.MaxPullBlockRetries = 100
  1248  	bp.RetryTimeout = time.Hour
  1249  
  1250  	releaseDone := make(chan struct{})
  1251  	var closeOnce sync.Once
  1252  
  1253  	go func() {
  1254  		// But this will get stuck until the StopChannel is closed
  1255  		require.Nil(t, bp.PullBlock(uint64(1)))
  1256  		bp.Close()
  1257  		closeOnce.Do(func() { close(releaseDone) })
  1258  	}()
  1259  
  1260  	close(bp.StopChannel)
  1261  	select {
  1262  	case <-releaseDone:
  1263  	case <-time.After(10 * time.Second):
  1264  		t.Fatalf("timed out waiting for PullBlock to complete")
  1265  	}
  1266  
  1267  	dialer.assertAllConnectionsClosed(t)
  1268  	require.True(t, couldNotConnectLogged)
  1269  	require.True(t, receivedStopLogged)
  1270  }
  1271  
  1272  func TestBlockPullerToBadEndpoint(t *testing.T) {
  1273  	// Scenario: The block puller does not get stuck in an endless loop if it cannot connect to any endpoint.
  1274  	dialer := newCountingDialer()
  1275  	bp := newBlockPuller(dialer, "10.10.10.10:666") // Nobody is there
  1276  
  1277  	var exhaustedRetryAttemptsLogged bool
  1278  	var couldNotConnectLogged bool
  1279  
  1280  	bp.Logger = bp.Logger.WithOptions(zap.Hooks(func(entry zapcore.Entry) error {
  1281  		if entry.Message == "Failed pulling block [1]: retry count exhausted(2)" {
  1282  			exhaustedRetryAttemptsLogged = true
  1283  		}
  1284  		if entry.Message == "Failed to connect to some endpoint, attempts exhausted(2), seq: 1, endpoints: [{\"CAs\":null,\"Endpoint\":\"10.10.10.10:666\"}]" {
  1285  			couldNotConnectLogged = true
  1286  		}
  1287  		return nil
  1288  	}))
  1289  
  1290  	bp.MaxPullBlockRetries = 2
  1291  	// We don't expect to timeout in this test, so make the timeout large
  1292  	// to prevent flakes due to CPU starvation.
  1293  	bp.FetchTimeout = time.Hour
  1294  
  1295  	require.Nil(t, bp.PullBlock(uint64(1)))
  1296  
  1297  	bp.Close()
  1298  	dialer.assertAllConnectionsClosed(t)
  1299  	require.True(t, exhaustedRetryAttemptsLogged)
  1300  	require.True(t, couldNotConnectLogged)
  1301  }
  1302  
  1303  func TestBlockPuller_UpdateEndpoint(t *testing.T) {
  1304  	t.Run("HeightsByEndpoints are changing", func(t *testing.T) {
  1305  		// Scenario: We ask for the latest block from a set of ordering nodes.
  1306  		// We then add an endpoint and remove an endpoint, update the puller's endpoints, and ask again.
  1307  		osn1 := newClusterNode(t)
  1308  		defer osn1.stop()
  1309  		osn2 := newClusterNode(t)
  1310  		defer osn2.stop()
  1311  		osn3 := newClusterNode(t)
  1312  		defer osn3.stop()
  1313  
  1314  		dialer := newCountingDialer()
  1315  		bp := newBlockPuller(dialer, osn1.srv.Address(), osn2.srv.Address(), osn3.srv.Address())
  1316  
  1317  		// The first seek request asks for the latest block from some ordering node
  1318  		osn1.addExpectProbeAssert()
  1319  		osn2.addExpectProbeAssert()
  1320  		osn3.addExpectProbeAssert()
  1321  
  1322  		osn1.enqueueResponse(3)
  1323  		osn2.enqueueResponse(4)
  1324  		osn3.enqueueResponse(5)
  1325  
  1326  		res, err := bp.HeightsByEndpoints()
  1327  		require.NoError(t, err)
  1328  		expected := map[string]uint64{
  1329  			osn1.srv.Address(): 4,
  1330  			osn2.srv.Address(): 5,
  1331  			osn3.srv.Address(): 6,
  1332  		}
  1333  		require.Equal(t, expected, res)
  1334  
  1335  		osn4 := newClusterNode(t)
  1336  		defer osn4.stop()
  1337  
  1338  		bp.UpdateEndpoints(endpointCriteriaFromEndpoints(osn2.srv.Address(), osn3.srv.Address(), osn4.srv.Address()))
  1339  
  1340  		osn2.addExpectProbeAssert()
  1341  		osn3.addExpectProbeAssert()
  1342  		osn4.addExpectProbeAssert()
  1343  
  1344  		osn2.enqueueResponse(44)
  1345  		osn3.enqueueResponse(55)
  1346  		osn4.enqueueResponse(66)
  1347  
  1348  		res, err = bp.HeightsByEndpoints()
  1349  		require.NoError(t, err)
  1350  		expected = map[string]uint64{
  1351  			osn2.srv.Address(): 45,
  1352  			osn3.srv.Address(): 56,
  1353  			osn4.srv.Address(): 67,
  1354  		}
  1355  		require.Equal(t, expected, res)
  1356  
  1357  		bp.Close()
  1358  		dialer.assertAllConnectionsClosed(t)
  1359  	})
  1360  
  1361  	t.Run("switch pulling between endpoints", func(t *testing.T) {
  1362  		// Scenario:
  1363  		// The block puller is expected to pull blocks 1 to 2 from osn1.
  1364  		// After block 2 the endpoints are updated to osn2.
  1365  		// The block puller is expected to pull blocks 3 to 4 from osn2.
  1366  
  1367  		osn1 := newClusterNode(t)
  1368  		defer osn1.stop()
  1369  
  1370  		osn1.enqueueResponse(2)
  1371  		osn1.addExpectProbeAssert()
  1372  		// We send blocks 1 and 2
  1373  		osn1.addExpectPullAssert(1)
  1374  		osn1.enqueueResponse(1)
  1375  		osn1.enqueueResponse(2)
  1376  
  1377  		dialer := newCountingDialer()
  1378  		bp := newBlockPuller(dialer, osn1.srv.Address())
  1379  
  1380  		require.Equal(t, uint64(1), bp.PullBlock(uint64(1)).Header.Number)
  1381  		require.Equal(t, uint64(2), bp.PullBlock(uint64(2)).Header.Number)
  1382  
  1383  		osn2 := newClusterNode(t)
  1384  		defer osn2.stop()
  1385  
  1386  		osn2.enqueueResponse(4)
  1387  		osn2.addExpectProbeAssert()
   1388  		// We send blocks 3 and 4
  1389  		osn2.addExpectPullAssert(3)
  1390  		osn2.enqueueResponse(3)
  1391  		osn2.enqueueResponse(4)
  1392  
  1393  		// This will disconnect
  1394  		bp.UpdateEndpoints(endpointCriteriaFromEndpoints(osn2.srv.Address()))
  1395  
  1396  		require.Equal(t, uint64(3), bp.PullBlock(uint64(3)).Header.Number)
  1397  		require.Equal(t, uint64(4), bp.PullBlock(uint64(4)).Header.Number)
  1398  
  1399  		bp.Close()
  1400  		dialer.assertAllConnectionsClosed(t)
  1401  	})
  1402  	t.Run("update to a bad endpoint", func(t *testing.T) {
  1403  		// Scenario:
  1404  		// The block puller is expected to pull blocks 1 to 3. The target orderer delivers only blocks 1,2.
  1405  		// The puller pulls blocks 1 & 2 into the buffer. After block 1 the endpoints are updated to a bad endpoint.
  1406  		// The puller gets block 2 from the buffer. It should attempt to re-connect until the attempt number is exhausted,
  1407  		// after which it gives up, and nil is returned.
  1408  
  1409  		osn := newClusterNode(t)
  1410  
  1411  		osn.enqueueResponse(2)
  1412  		osn.addExpectProbeAssert()
  1413  		// We send blocks 1 and 2
  1414  		osn.addExpectPullAssert(1)
  1415  		osn.enqueueResponse(1)
  1416  		osn.enqueueResponse(2)
  1417  
  1418  		dialer := newCountingDialer()
  1419  		bp := newBlockPuller(dialer, osn.srv.Address())
  1420  
  1421  		var exhaustedRetryAttemptsLogged bool
  1422  		var couldNotConnectLogged bool
  1423  		bp.Logger = bp.Logger.WithOptions(zap.Hooks(func(entry zapcore.Entry) error {
  1424  			if entry.Message == "Failed pulling block [3]: retry count exhausted(2)" {
  1425  				exhaustedRetryAttemptsLogged = true
  1426  			}
  1427  			if entry.Message ==
  1428  				"Failed to connect to some endpoint, attempts exhausted(2), seq: 3, endpoints: [{\"CAs\":null,\"Endpoint\":\"10.10.10.10:666\"}]" {
  1429  				couldNotConnectLogged = true
  1430  			}
  1431  			return nil
  1432  		}))
  1433  
  1434  		bp.MaxPullBlockRetries = 2
  1435  		// We don't expect to timeout in this test, so make the timeout large
  1436  		// to prevent flakes due to CPU starvation.
  1437  		bp.FetchTimeout = time.Second
  1438  
  1439  		require.Equal(t, uint64(1), bp.PullBlock(uint64(1)).Header.Number)
  1440  		osn.stop()
  1441  		// This will disconnect
  1442  		bp.UpdateEndpoints(endpointCriteriaFromEndpoints("10.10.10.10:666"))
  1443  		// Block 2 from the buffer
  1444  		require.Equal(t, uint64(2), bp.PullBlock(uint64(2)).Header.Number)
  1445  		// Block 3 will never arrive
  1446  		require.Nil(t, bp.PullBlock(uint64(3)))
  1447  
  1448  		bp.Close()
  1449  		dialer.assertAllConnectionsClosed(t)
  1450  		require.True(t, exhaustedRetryAttemptsLogged)
  1451  		require.True(t, couldNotConnectLogged)
  1452  	})
  1453  }