github.com/anjalikarhana/fabric@v2.1.1+incompatible/orderer/common/cluster/deliver_test.go

     1  /*
     2  Copyright IBM Corp. 2018 All Rights Reserved.
     3  
     4  SPDX-License-Identifier: Apache-2.0
     5  */
     6  
     7  package cluster_test
     8  
     9  import (
    10  	"context"
    11  	"crypto/rand"
    12  	"encoding/hex"
    13  	"fmt"
    14  	"net"
    15  	"strconv"
    16  	"strings"
    17  	"sync"
    18  	"sync/atomic"
    19  	"testing"
    20  	"time"
    21  
    22  	"github.com/golang/protobuf/proto"
    23  	"github.com/hyperledger/fabric-protos-go/common"
    24  	"github.com/hyperledger/fabric-protos-go/orderer"
    25  	"github.com/hyperledger/fabric/bccsp/factory"
    26  	"github.com/hyperledger/fabric/common/flogging"
    27  	"github.com/hyperledger/fabric/internal/pkg/comm"
    28  	"github.com/hyperledger/fabric/internal/pkg/identity"
    29  	"github.com/hyperledger/fabric/orderer/common/cluster"
    30  	"github.com/hyperledger/fabric/orderer/common/cluster/mocks"
    31  	"github.com/hyperledger/fabric/protoutil"
    32  	"github.com/onsi/gomega"
    33  	"github.com/pkg/errors"
    34  	"github.com/stretchr/testify/assert"
    35  	"github.com/stretchr/testify/require"
    36  	"go.uber.org/zap"
    37  	"go.uber.org/zap/zapcore"
    38  	"google.golang.org/grpc"
    39  	"google.golang.org/grpc/balancer"
    40  	"google.golang.org/grpc/balancer/roundrobin"
    41  )
    42  
    43  // protects gRPC balancer registration
    44  var gRPCBalancerLock = sync.Mutex{}
    45  
    46  func init() {
    47  	factory.InitFactories(nil)
    48  }
    49  
    50  //go:generate counterfeiter -o mocks/signer_serializer.go --fake-name SignerSerializer . signerSerializer
    51  
    52  type signerSerializer interface {
    53  	identity.SignerSerializer
    54  }
    55  
    56  type wrappedBalancer struct {
    57  	balancer.Balancer
    58  	balancer.V2Balancer
    59  	cd *countingDialer
    60  }
    61  
    62  func (wb *wrappedBalancer) Close() {
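        	// Adding ^uint32(0) (i.e. MaxUint32) to the counter atomically
        	// decrements it by one, marking this connection as closed.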
    63  	defer atomic.AddUint32(&wb.cd.connectionCount, ^uint32(0))
    64  	wb.Balancer.Close()
    65  }
    66  
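        // countingDialer registers itself as a gRPC balancer builder under a
        // random name, and counts how many connections built through it are
        // still open.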
    67  type countingDialer struct {
    68  	name            string
    69  	baseBuilder     balancer.Builder
    70  	connectionCount uint32
    71  }
    72  
    73  func newCountingDialer() *countingDialer {
    74  	gRPCBalancerLock.Lock()
    75  	builder := balancer.Get(roundrobin.Name)
    76  	gRPCBalancerLock.Unlock()
    77  
    78  	buff := make([]byte, 16)
    79  	rand.Read(buff)
    80  	cb := &countingDialer{
    81  		name:        hex.EncodeToString(buff),
    82  		baseBuilder: builder,
    83  	}
    84  
    85  	gRPCBalancerLock.Lock()
    86  	balancer.Register(cb)
    87  	gRPCBalancerLock.Unlock()
    88  
    89  	return cb
    90  }
    91  
    92  func (d *countingDialer) Build(cc balancer.ClientConn, opts balancer.BuildOptions) balancer.Balancer {
    93  	defer atomic.AddUint32(&d.connectionCount, 1)
    94  	lb := d.baseBuilder.Build(cc, opts)
    95  	return &wrappedBalancer{
    96  		Balancer:   lb,
    97  		V2Balancer: lb.(balancer.V2Balancer),
    98  		cd:         d,
    99  	}
   100  }
   101  
   102  func (d *countingDialer) Name() string {
   103  	return d.name
   104  }
   105  
   106  func (d *countingDialer) assertAllConnectionsClosed(t *testing.T) {
   107  	timeLimit := time.Now().Add(timeout)
   108  	for atomic.LoadUint32(&d.connectionCount) != uint32(0) && time.Now().Before(timeLimit) {
   109  		time.Sleep(time.Millisecond)
   110  	}
   111  	assert.Equal(t, uint32(0), atomic.LoadUint32(&d.connectionCount))
   112  }
   113  
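        // Dial connects to the given endpoint through the dialer's registered
        // counting balancer, so that every connection it creates is counted.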
   114  func (d *countingDialer) Dial(address cluster.EndpointCriteria) (*grpc.ClientConn, error) {
   115  	ctx, cancel := context.WithTimeout(context.Background(), time.Millisecond*100)
   116  	defer cancel()
   117  
   118  	gRPCBalancerLock.Lock()
   119  	balancer := grpc.WithBalancerName(d.name)
   120  	gRPCBalancerLock.Unlock()
   121  	return grpc.DialContext(ctx, address.Endpoint, grpc.WithBlock(), grpc.WithInsecure(), balancer)
   122  }
   123  
   124  func noopBlockVerifierf(_ []*common.Block, _ string) error {
   125  	return nil
   126  }
   127  
   128  func readSeekEnvelope(stream orderer.AtomicBroadcast_DeliverServer) (*orderer.SeekInfo, string, error) {
   129  	env, err := stream.Recv()
   130  	if err != nil {
   131  		return nil, "", err
   132  	}
   133  	payload, err := protoutil.UnmarshalPayload(env.Payload)
   134  	if err != nil {
   135  		return nil, "", err
   136  	}
   137  	seekInfo := &orderer.SeekInfo{}
   138  	if err = proto.Unmarshal(payload.Data, seekInfo); err != nil {
   139  		return nil, "", err
   140  	}
   141  	chdr := &common.ChannelHeader{}
   142  	if err = proto.Unmarshal(payload.Header.ChannelHeader, chdr); err != nil {
   143  		return nil, "", err
   144  	}
   145  	return seekInfo, chdr.ChannelId, nil
   146  }
   147  
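        // deliverServer is an in-process mock ordering node that serves the
        // AtomicBroadcast Deliver API. Tests enqueue assertions on the expected
        // seek requests into seekAssertions, and the responses to stream back
        // into blockResponses.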
   148  type deliverServer struct {
   149  	t *testing.T
   150  	sync.Mutex
   151  	err            error
   152  	srv            *comm.GRPCServer
   153  	seekAssertions chan func(*orderer.SeekInfo, string)
   154  	blockResponses chan *orderer.DeliverResponse
   155  }
   156  
   157  func (ds *deliverServer) endpointCriteria() cluster.EndpointCriteria {
   158  	return cluster.EndpointCriteria{Endpoint: ds.srv.Address()}
   159  }
   160  
   161  func (ds *deliverServer) isFaulty() bool {
   162  	ds.Lock()
   163  	defer ds.Unlock()
   164  	return ds.err != nil
   165  }
   166  
   167  func (*deliverServer) Broadcast(orderer.AtomicBroadcast_BroadcastServer) error {
   168  	panic("implement me")
   169  }
   170  
   171  func (ds *deliverServer) Deliver(stream orderer.AtomicBroadcast_DeliverServer) error {
   172  	ds.Lock()
   173  	err := ds.err
   174  	ds.Unlock()
   175  
   176  	if err != nil {
   177  		return err
   178  	}
   179  	seekInfo, channel, err := readSeekEnvelope(stream)
   180  	if err != nil {
   181  		panic(err)
   182  	}
   183  
    184  	// FAB-16233 This timer mitigates the test hanging forever when
    185  	// seekAssertions never receives a value
   186  	timer := time.NewTimer(1 * time.Minute)
   187  	defer timer.Stop()
   188  
   189  	select {
   190  	case <-timer.C:
   191  		panic("timed out waiting for seek assertions to receive a value")
   192  	// Get the next seek assertion and ensure the next seek is of the expected type
   193  	case seekAssert := <-ds.seekAssertions:
   194  		seekAssert(seekInfo, channel)
   195  	}
   196  
   197  	if seekInfo.GetStart().GetSpecified() != nil {
   198  		return ds.deliverBlocks(stream)
   199  	}
   200  	if seekInfo.GetStart().GetNewest() != nil {
   201  		resp := <-ds.blocks()
   202  		if resp == nil {
   203  			return nil
   204  		}
   205  		return stream.Send(resp)
   206  	}
   207  	panic(fmt.Sprintf("expected either specified or newest seek but got %v", seekInfo.GetStart()))
   208  }
   209  
   210  func (ds *deliverServer) deliverBlocks(stream orderer.AtomicBroadcast_DeliverServer) error {
   211  	for {
   212  		blockChan := ds.blocks()
   213  		response := <-blockChan
    214  		// A nil response is a signal from the test to close the stream.
    215  		// This is needed to avoid reading from the block buffer and
    216  		// accidentally consuming a block that is meant to be pulled
    217  		// later in the test.
   218  		if response == nil {
   219  			return nil
   220  		}
   221  		if err := stream.Send(response); err != nil {
   222  			return err
   223  		}
   224  	}
   225  }
   226  
   227  func (ds *deliverServer) blocks() chan *orderer.DeliverResponse {
   228  	ds.Lock()
   229  	defer ds.Unlock()
   230  	blockChan := ds.blockResponses
   231  	return blockChan
   232  }
   233  
   234  func (ds *deliverServer) setBlocks(blocks chan *orderer.DeliverResponse) {
   235  	ds.Lock()
   236  	defer ds.Unlock()
   237  	ds.blockResponses = blocks
   238  }
   239  
   240  func (ds *deliverServer) port() int {
   241  	_, portStr, err := net.SplitHostPort(ds.srv.Address())
   242  	assert.NoError(ds.t, err)
   243  
   244  	port, err := strconv.ParseInt(portStr, 10, 32)
   245  	assert.NoError(ds.t, err)
   246  	return int(port)
   247  }
   248  
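        // resurrect re-creates the gRPC server on the same port, carrying over
        // any block responses that were already enqueued.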
   249  func (ds *deliverServer) resurrect() {
   250  	var err error
    251  	// Drain the closed responses channel into a fresh one
   252  	respChan := make(chan *orderer.DeliverResponse, 100)
   253  	for resp := range ds.blocks() {
   254  		respChan <- resp
   255  	}
   256  	ds.blockResponses = respChan
   257  	ds.srv.Stop()
   258  	// And re-create the gRPC server on that port
   259  	ds.srv, err = comm.NewGRPCServer(fmt.Sprintf("127.0.0.1:%d", ds.port()), comm.ServerConfig{})
   260  	assert.NoError(ds.t, err)
   261  	orderer.RegisterAtomicBroadcastServer(ds.srv.Server(), ds)
   262  	go ds.srv.Start()
   263  }
   264  
   265  func (ds *deliverServer) stop() {
   266  	ds.srv.Stop()
   267  	close(ds.blocks())
   268  }
   269  
   270  func (ds *deliverServer) enqueueResponse(seq uint64) {
   271  	ds.blocks() <- &orderer.DeliverResponse{
   272  		Type: &orderer.DeliverResponse_Block{Block: protoutil.NewBlock(seq, nil)},
   273  	}
   274  }
   275  
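        // addExpectProbeAssert expects the next seek request to be a probe for
        // the newest block, with a BEST_EFFORT error response policy.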
   276  func (ds *deliverServer) addExpectProbeAssert() {
   277  	ds.seekAssertions <- func(info *orderer.SeekInfo, _ string) {
   278  		assert.NotNil(ds.t, info.GetStart().GetNewest())
   279  		assert.Equal(ds.t, info.ErrorResponse, orderer.SeekInfo_BEST_EFFORT)
   280  	}
   281  }
   282  
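        // addExpectPullAssert expects the next seek request to ask for blocks
        // starting at the given sequence, with a BEST_EFFORT error response policy.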
   283  func (ds *deliverServer) addExpectPullAssert(seq uint64) {
   284  	ds.seekAssertions <- func(info *orderer.SeekInfo, _ string) {
   285  		seekPosition := info.GetStart()
   286  		require.NotNil(ds.t, seekPosition)
   287  		seekSpecified := seekPosition.GetSpecified()
   288  		require.NotNil(ds.t, seekSpecified)
   289  		assert.Equal(ds.t, seq, seekSpecified.Number)
   290  		assert.Equal(ds.t, info.ErrorResponse, orderer.SeekInfo_BEST_EFFORT)
   291  	}
   292  }
   293  
   294  func newClusterNode(t *testing.T) *deliverServer {
   295  	srv, err := comm.NewGRPCServer("127.0.0.1:0", comm.ServerConfig{})
   296  	if err != nil {
   297  		panic(err)
   298  	}
   299  	ds := &deliverServer{
   300  		t:              t,
   301  		seekAssertions: make(chan func(*orderer.SeekInfo, string), 100),
   302  		blockResponses: make(chan *orderer.DeliverResponse, 100),
   303  		srv:            srv,
   304  	}
   305  	orderer.RegisterAtomicBroadcastServer(srv.Server(), ds)
   306  	go srv.Start()
   307  	return ds
   308  }
   309  
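        // newBlockPuller returns a BlockPuller for "mychannel" that connects to
        // the given orderer endpoints via the counting dialer, with a no-op
        // block verifier and a short retry timeout.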
   310  func newBlockPuller(dialer *countingDialer, orderers ...string) *cluster.BlockPuller {
   311  	return &cluster.BlockPuller{
   312  		Dialer:              dialer,
   313  		Channel:             "mychannel",
   314  		Signer:              &mocks.SignerSerializer{},
   315  		Endpoints:           endpointCriteriaFromEndpoints(orderers...),
   316  		FetchTimeout:        time.Second * 10,
   317  		MaxTotalBufferBytes: 1024 * 1024, // 1MB
   318  		RetryTimeout:        time.Millisecond * 10,
   319  		VerifyBlockSequence: noopBlockVerifierf,
   320  		Logger:              flogging.MustGetLogger("test"),
   321  	}
   322  }
   323  
   324  func endpointCriteriaFromEndpoints(orderers ...string) []cluster.EndpointCriteria {
   325  	var res []cluster.EndpointCriteria
   326  	for _, orderer := range orderers {
   327  		res = append(res, cluster.EndpointCriteria{Endpoint: orderer})
   328  	}
   329  	return res
   330  }
   331  
   332  func TestBlockPullerBasicHappyPath(t *testing.T) {
   333  	// Scenario: Single ordering node,
   334  	// and the block puller pulls blocks 5 to 10.
   335  	osn := newClusterNode(t)
   336  	defer osn.stop()
   337  
   338  	dialer := newCountingDialer()
   339  	bp := newBlockPuller(dialer, osn.srv.Address())
   340  
   341  	// The first seek request asks for the latest block
   342  	osn.addExpectProbeAssert()
    343  	// The first response says that the latest block sequence is 10
   344  	osn.enqueueResponse(10)
   345  	// The next seek request is for block 5
   346  	osn.addExpectPullAssert(5)
    347  	// The later responses are the blocks themselves
   348  	for i := 5; i <= 10; i++ {
   349  		osn.enqueueResponse(uint64(i))
   350  	}
   351  
   352  	for i := 5; i <= 10; i++ {
   353  		assert.Equal(t, uint64(i), bp.PullBlock(uint64(i)).Header.Number)
   354  	}
   355  	assert.Len(t, osn.blockResponses, 0)
   356  
   357  	bp.Close()
   358  	dialer.assertAllConnectionsClosed(t)
   359  }
   360  
   361  func TestBlockPullerDuplicate(t *testing.T) {
   362  	// Scenario: The address of the ordering node
   363  	// is found twice in the configuration, but this
   364  	// doesn't create a problem.
   365  	osn := newClusterNode(t)
   366  	defer osn.stop()
   367  
   368  	dialer := newCountingDialer()
   369  	// Add the address twice
   370  	bp := newBlockPuller(dialer, osn.srv.Address(), osn.srv.Address())
   371  
   372  	// The first seek request asks for the latest block (twice)
   373  	osn.addExpectProbeAssert()
   374  	osn.addExpectProbeAssert()
    375  	// The first response says that the latest block sequence is 3
   376  	osn.enqueueResponse(3)
   377  	osn.enqueueResponse(3)
    378  	// The next seek request is for block 1, and comes from only one of the duplicate endpoints
   379  	osn.addExpectPullAssert(1)
    380  	// The later responses are the blocks themselves
   381  	for i := 1; i <= 3; i++ {
   382  		osn.enqueueResponse(uint64(i))
   383  	}
   384  
   385  	for i := 1; i <= 3; i++ {
   386  		assert.Equal(t, uint64(i), bp.PullBlock(uint64(i)).Header.Number)
   387  	}
   388  	assert.Len(t, osn.blockResponses, 0)
   389  
   390  	bp.Close()
   391  	dialer.assertAllConnectionsClosed(t)
   392  }
   393  
   394  func TestBlockPullerHeavyBlocks(t *testing.T) {
   395  	// Scenario: Single ordering node,
   396  	// and the block puller pulls 50 blocks, each
   397  	// weighing 1K, but the buffer can only contain
   398  	// 10K, so it should pull the 50 blocks in chunks of 10,
   399  	// and verify 5 sequences at a time.
   400  
   401  	osn := newClusterNode(t)
   402  	defer osn.stop()
   403  	osn.addExpectProbeAssert()
   404  	osn.addExpectPullAssert(1)
   405  	osn.enqueueResponse(100) // The last sequence is 100
   406  
   407  	enqueueBlockBatch := func(start, end uint64) {
   408  		for seq := start; seq <= end; seq++ {
   409  			resp := &orderer.DeliverResponse{
   410  				Type: &orderer.DeliverResponse_Block{
   411  					Block: protoutil.NewBlock(seq, nil),
   412  				},
   413  			}
   414  			data := resp.GetBlock().Data.Data
   415  			resp.GetBlock().Data.Data = append(data, make([]byte, 1024))
   416  			osn.blockResponses <- resp
   417  		}
   418  	}
   419  
   420  	dialer := newCountingDialer()
   421  	bp := newBlockPuller(dialer, osn.srv.Address())
   422  	var gotBlockMessageCount int
   423  	bp.Logger = bp.Logger.WithOptions(zap.Hooks(func(entry zapcore.Entry) error {
   424  		if strings.Contains(entry.Message, "Got block") {
   425  			gotBlockMessageCount++
   426  		}
   427  		return nil
   428  	}))
   429  	bp.MaxTotalBufferBytes = 1024 * 10 // 10K
   430  
   431  	// Enqueue only the next batch in the orderer node.
   432  	// This ensures that only 10 blocks are fetched into the buffer
   433  	// and not more.
   434  	for i := uint64(0); i < 5; i++ {
   435  		enqueueBlockBatch(i*10+uint64(1), i*10+uint64(10))
   436  		for seq := i*10 + uint64(1); seq <= i*10+uint64(10); seq++ {
   437  			assert.Equal(t, seq, bp.PullBlock(seq).Header.Number)
   438  		}
   439  	}
   440  
   441  	assert.Equal(t, 50, gotBlockMessageCount)
   442  	bp.Close()
   443  	dialer.assertAllConnectionsClosed(t)
   444  }
   445  
   446  func TestBlockPullerClone(t *testing.T) {
   447  	// Scenario: We have a block puller that is connected
   448  	// to an ordering node, and we clone it.
   449  	// We expect that the new block puller is a clean slate
   450  	// and doesn't share any internal state with its origin.
   451  	osn1 := newClusterNode(t)
   452  	defer osn1.stop()
   453  
   454  	osn1.addExpectProbeAssert()
   455  	osn1.addExpectPullAssert(1)
   456  	// last block sequence is 100
   457  	osn1.enqueueResponse(100)
   458  	osn1.enqueueResponse(1)
   459  	// The block puller is expected to disconnect after pulling
   460  	// a single block. So signal the server-side to avoid
   461  	// grabbing the next block after block 1 is pulled.
   462  	osn1.blockResponses <- nil
   463  
   464  	dialer := newCountingDialer()
   465  	bp := newBlockPuller(dialer, osn1.srv.Address())
   466  	bp.FetchTimeout = time.Millisecond * 100
   467  	// Pull a block at a time and don't buffer them
   468  	bp.MaxTotalBufferBytes = 1
   469  	// Clone the block puller
   470  	bpClone := bp.Clone()
   471  	// and override its channel
   472  	bpClone.Channel = "foo"
    473  	// Ensure the channel change isn't reflected in the original puller
   474  	assert.Equal(t, "mychannel", bp.Channel)
   475  
   476  	block := bp.PullBlock(1)
   477  	assert.Equal(t, uint64(1), block.Header.Number)
   478  
   479  	// After the original block puller is closed, the
   480  	// clone should not be affected
   481  	bp.Close()
   482  	dialer.assertAllConnectionsClosed(t)
   483  
   484  	// The clone block puller should not have cached the internal state
   485  	// from its origin block puller, thus it should probe again for the last block
   486  	// sequence as if it is a new block puller.
   487  	osn1.addExpectProbeAssert()
   488  	osn1.addExpectPullAssert(2)
   489  	osn1.enqueueResponse(100)
   490  	osn1.enqueueResponse(2)
   491  
   492  	block = bpClone.PullBlock(2)
   493  	assert.Equal(t, uint64(2), block.Header.Number)
   494  
   495  	bpClone.Close()
   496  	dialer.assertAllConnectionsClosed(t)
   497  }
   498  
   499  func TestBlockPullerHeightsByEndpoints(t *testing.T) {
   500  	// Scenario: We ask for the latest block from all the known ordering nodes.
   501  	// One ordering node is unavailable (offline).
   502  	// One ordering node doesn't have blocks for that channel.
   503  	// The remaining node returns the latest block.
   504  	osn1 := newClusterNode(t)
   505  
   506  	osn2 := newClusterNode(t)
   507  	defer osn2.stop()
   508  
   509  	osn3 := newClusterNode(t)
   510  	defer osn3.stop()
   511  
   512  	dialer := newCountingDialer()
   513  	bp := newBlockPuller(dialer, osn1.srv.Address(), osn2.srv.Address(), osn3.srv.Address())
   514  
    515  	// The probe requests ask every ordering node for its latest block
   516  	osn1.addExpectProbeAssert()
   517  	osn2.addExpectProbeAssert()
   518  	osn3.addExpectProbeAssert()
   519  
   520  	// The first ordering node is offline
   521  	osn1.stop()
   522  	// The second returns a bad answer
   523  	osn2.blockResponses <- &orderer.DeliverResponse{
   524  		Type: &orderer.DeliverResponse_Status{Status: common.Status_FORBIDDEN},
   525  	}
   526  	// The third returns the latest block
   527  	osn3.enqueueResponse(5)
   528  
   529  	res, err := bp.HeightsByEndpoints()
   530  	assert.NoError(t, err)
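        	// The latest block sequence on osn3 is 5, so its reported height is 6.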
   531  	expected := map[string]uint64{
   532  		osn3.srv.Address(): 6,
   533  	}
   534  	assert.Equal(t, expected, res)
   535  
   536  	bp.Close()
   537  	dialer.assertAllConnectionsClosed(t)
   538  }
   539  
   540  func TestBlockPullerMultipleOrderers(t *testing.T) {
   541  	// Scenario: 3 ordering nodes,
   542  	// and the block puller pulls blocks 3 to 5 from some
   543  	// orderer node.
    544  	// We ensure that exactly one orderer node sent blocks.
   545  
   546  	osn1 := newClusterNode(t)
   547  	defer osn1.stop()
   548  
   549  	osn2 := newClusterNode(t)
   550  	defer osn2.stop()
   551  
   552  	osn3 := newClusterNode(t)
   553  	defer osn3.stop()
   554  
   555  	dialer := newCountingDialer()
   556  	bp := newBlockPuller(dialer, osn1.srv.Address(), osn2.srv.Address(), osn3.srv.Address())
   557  
    558  	// The probe requests ask every ordering node for its latest block
   559  	osn1.addExpectProbeAssert()
   560  	osn2.addExpectProbeAssert()
   561  	osn3.addExpectProbeAssert()
    562  	// The first response says that the latest block sequence is 5
   563  	osn1.enqueueResponse(5)
   564  	osn2.enqueueResponse(5)
   565  	osn3.enqueueResponse(5)
   566  
   567  	// The next seek request is for block 3
   568  	osn1.addExpectPullAssert(3)
   569  	osn2.addExpectPullAssert(3)
   570  	osn3.addExpectPullAssert(3)
   571  
    572  	// The later responses are the blocks themselves
   573  	for i := 3; i <= 5; i++ {
   574  		osn1.enqueueResponse(uint64(i))
   575  		osn2.enqueueResponse(uint64(i))
   576  		osn3.enqueueResponse(uint64(i))
   577  	}
   578  
   579  	initialTotalBlockAmount := len(osn1.blockResponses) + len(osn2.blockResponses) + len(osn3.blockResponses)
   580  
   581  	for i := 3; i <= 5; i++ {
   582  		assert.Equal(t, uint64(i), bp.PullBlock(uint64(i)).Header.Number)
   583  	}
   584  
    585  	// Assert that the cumulative number of blocks in the OSNs went down by 6:
    586  	// blocks 3, 4, 5 were pulled from a single node - that's 3 blocks,
    587  	// and the probe phase consumed the enqueued block 5 from each of the 3 nodes.
   588  	finalTotalBlockAmount := len(osn1.blockResponses) + len(osn2.blockResponses) + len(osn3.blockResponses)
   589  	assert.Equal(t, initialTotalBlockAmount-6, finalTotalBlockAmount)
   590  
   591  	bp.Close()
   592  	dialer.assertAllConnectionsClosed(t)
   593  }
   594  
   595  func TestBlockPullerFailover(t *testing.T) {
   596  	// Scenario:
   597  	// The block puller is expected to pull blocks 1 to 3.
    598  	// There are two ordering nodes, but at first only node 1 is available.
    599  	// The block puller first connects to it, but as the first block is pulled,
    600  	// node 1 crashes.
   601  	// A second orderer is then spawned and the block puller is expected to
   602  	// connect to it and pull the rest of the blocks.
   603  
   604  	osn1 := newClusterNode(t)
   605  	osn1.addExpectProbeAssert()
   606  	osn1.addExpectPullAssert(1)
   607  	osn1.enqueueResponse(3)
   608  	osn1.enqueueResponse(1)
   609  
   610  	osn2 := newClusterNode(t)
   611  	defer osn2.stop()
   612  
   613  	osn2.addExpectProbeAssert()
   614  	osn2.addExpectPullAssert(1)
   615  	// First response is for the probe
   616  	osn2.enqueueResponse(3)
   617  	// Next three responses are for the pulling.
   618  	osn2.enqueueResponse(1)
   619  	osn2.enqueueResponse(2)
   620  	osn2.enqueueResponse(3)
   621  
   622  	// First we stop node 2 to make sure the block puller can't connect to it when it is created
   623  	osn2.stop()
   624  
   625  	dialer := newCountingDialer()
   626  	bp := newBlockPuller(dialer, osn1.srv.Address(), osn2.srv.Address())
   627  
   628  	// We don't want to rely on fetch timeout, but on pure failover logic,
   629  	// so make the fetch timeout huge
   630  	bp.FetchTimeout = time.Hour
   631  
   632  	// Configure the block puller logger to signal the wait group once the block puller
    633  	// receives the first block.
   634  	var pulledBlock1 sync.WaitGroup
   635  	pulledBlock1.Add(1)
   636  	var once sync.Once
   637  	bp.Logger = bp.Logger.WithOptions(zap.Hooks(func(entry zapcore.Entry) error {
   638  		if strings.Contains(entry.Message, "Got block [1] of size") {
   639  			once.Do(pulledBlock1.Done)
   640  		}
   641  		return nil
   642  	}))
   643  
   644  	go func() {
   645  		// Wait for the block puller to pull the first block
   646  		pulledBlock1.Wait()
   647  		// and now crash node1 and resurrect node 2
   648  		osn1.stop()
   649  		osn2.resurrect()
   650  	}()
   651  
   652  	// Assert reception of blocks 1 to 3
   653  	assert.Equal(t, uint64(1), bp.PullBlock(uint64(1)).Header.Number)
   654  	assert.Equal(t, uint64(2), bp.PullBlock(uint64(2)).Header.Number)
   655  	assert.Equal(t, uint64(3), bp.PullBlock(uint64(3)).Header.Number)
   656  
   657  	bp.Close()
   658  	dialer.assertAllConnectionsClosed(t)
   659  }
   660  
   661  func TestBlockPullerNoneResponsiveOrderer(t *testing.T) {
   662  	// Scenario: There are two ordering nodes, and the block puller
   663  	// connects to one of them.
    664  	// It fetches the first block, but then the second block isn't sent
    665  	// for too long, so the block puller should abort and try the other
    666  	// node.
   667  
   668  	osn1 := newClusterNode(t)
   669  	defer osn1.stop()
   670  
   671  	osn2 := newClusterNode(t)
   672  	defer osn2.stop()
   673  
   674  	osn1.addExpectProbeAssert()
   675  	osn2.addExpectProbeAssert()
   676  	osn1.enqueueResponse(3)
   677  	osn2.enqueueResponse(3)
   678  
   679  	dialer := newCountingDialer()
   680  	bp := newBlockPuller(dialer, osn1.srv.Address(), osn2.srv.Address())
   681  	bp.FetchTimeout = time.Millisecond * 100
   682  
   683  	notInUseOrdererNode := osn2
    684  	// Configure the logger to tell us which orderer node the block puller
    685  	// isn't connected to. This is done by intercepting the appropriate log message.
   686  	var waitForConnection sync.WaitGroup
   687  	waitForConnection.Add(1)
   688  	var once sync.Once
   689  	bp.Logger = bp.Logger.WithOptions(zap.Hooks(func(entry zapcore.Entry) error {
   690  		if !strings.Contains(entry.Message, "Sending request for block [1]") {
   691  			return nil
   692  		}
   693  		defer once.Do(waitForConnection.Done)
   694  		s := entry.Message[len("Sending request for block [1] to 127.0.0.1:"):]
   695  		port, err := strconv.ParseInt(s, 10, 32)
   696  		assert.NoError(t, err)
    697  		// If osn2 is the orderer we're currently connected to,
    698  		// then the orderer we're not connected to is osn1
   699  		if osn2.port() == int(port) {
   700  			notInUseOrdererNode = osn1
   701  			// Enqueue block 1 to the current orderer the block puller
   702  			// is connected to
   703  			osn2.enqueueResponse(1)
   704  			osn2.addExpectPullAssert(1)
   705  		} else {
   706  			// We're connected to osn1, so enqueue block 1
   707  			osn1.enqueueResponse(1)
   708  			osn1.addExpectPullAssert(1)
   709  		}
   710  		return nil
   711  	}))
   712  
   713  	go func() {
   714  		waitForConnection.Wait()
    715  		// Enqueue the latest sequence into the orderer we're not connected to
   716  		notInUseOrdererNode.enqueueResponse(3)
   717  		notInUseOrdererNode.addExpectProbeAssert()
   718  		// Enqueue blocks 1, 2, 3 to the orderer node we're not connected to.
   719  		notInUseOrdererNode.addExpectPullAssert(1)
   720  		notInUseOrdererNode.enqueueResponse(1)
   721  		notInUseOrdererNode.enqueueResponse(2)
   722  		notInUseOrdererNode.enqueueResponse(3)
   723  	}()
   724  
   725  	// Assert reception of blocks 1 to 3
   726  	assert.Equal(t, uint64(1), bp.PullBlock(uint64(1)).Header.Number)
   727  	assert.Equal(t, uint64(2), bp.PullBlock(uint64(2)).Header.Number)
   728  	assert.Equal(t, uint64(3), bp.PullBlock(uint64(3)).Header.Number)
   729  
   730  	bp.Close()
   731  	dialer.assertAllConnectionsClosed(t)
   732  }
   733  
   734  func TestBlockPullerNoOrdererAliveAtStartup(t *testing.T) {
   735  	// Scenario: Single ordering node, and when the block puller
   736  	// starts up - the orderer is nowhere to be found.
   737  	osn := newClusterNode(t)
   738  	osn.stop()
   739  	defer osn.stop()
   740  
   741  	dialer := newCountingDialer()
   742  	bp := newBlockPuller(dialer, osn.srv.Address())
   743  
    744  	// Configure the logger to wait until a failed connection attempt is made
   745  	var connectionAttempt sync.WaitGroup
   746  	connectionAttempt.Add(1)
   747  	bp.Logger = bp.Logger.WithOptions(zap.Hooks(func(entry zapcore.Entry) error {
   748  		if strings.Contains(entry.Message, "Failed connecting to") {
   749  			connectionAttempt.Done()
   750  		}
   751  		return nil
   752  	}))
   753  
   754  	go func() {
   755  		connectionAttempt.Wait()
   756  		osn.resurrect()
   757  		// The first seek request asks for the latest block
   758  		osn.addExpectProbeAssert()
   759  		// The first response says that the last sequence is 2
   760  		osn.enqueueResponse(2)
   761  		// The next seek request is for block 1
   762  		osn.addExpectPullAssert(1)
    763  		// And the orderer returns blocks 1 and 2
   764  		osn.enqueueResponse(1)
   765  		osn.enqueueResponse(2)
   766  	}()
   767  
   768  	assert.Equal(t, uint64(1), bp.PullBlock(1).Header.Number)
   769  	assert.Equal(t, uint64(2), bp.PullBlock(2).Header.Number)
   770  
   771  	bp.Close()
   772  	dialer.assertAllConnectionsClosed(t)
   773  }
   774  
   775  func TestBlockPullerFailures(t *testing.T) {
   776  	// Scenario: Single ordering node is faulty, but later
   777  	// on it recovers.
   778  	failureError := errors.New("oops, something went wrong")
   779  	failStream := func(osn *deliverServer, _ *cluster.BlockPuller) {
   780  		osn.Lock()
   781  		osn.err = failureError
   782  		osn.Unlock()
   783  	}
   784  
   785  	badSigErr := errors.New("bad signature")
   786  	malformBlockSignatureAndRecreateOSNBuffer := func(osn *deliverServer, bp *cluster.BlockPuller) {
   787  		bp.VerifyBlockSequence = func(_ []*common.Block, _ string) error {
   788  			close(osn.blocks())
   789  			// After failing once, recover and remove the bad signature error.
   790  			defer func() {
   791  				// Skip recovery if we already recovered.
   792  				if badSigErr == nil {
   793  					return
   794  				}
   795  				badSigErr = nil
   796  				osn.setBlocks(make(chan *orderer.DeliverResponse, 100))
   797  				osn.enqueueResponse(3)
   798  				osn.enqueueResponse(1)
   799  				osn.enqueueResponse(2)
   800  				osn.enqueueResponse(3)
   801  			}()
   802  			return badSigErr
   803  		}
   804  	}
   805  
   806  	noFailFunc := func(_ *deliverServer, _ *cluster.BlockPuller) {}
   807  
   808  	recover := func(osn *deliverServer, bp *cluster.BlockPuller) func(entry zapcore.Entry) error {
   809  		return func(entry zapcore.Entry) error {
   810  			if osn.isFaulty() && strings.Contains(entry.Message, failureError.Error()) {
   811  				osn.Lock()
   812  				osn.err = nil
   813  				osn.Unlock()
   814  			}
   815  			if strings.Contains(entry.Message, "Failed verifying") {
   816  				bp.VerifyBlockSequence = noopBlockVerifierf
   817  			}
   818  			return nil
   819  		}
   820  	}
   821  
   822  	failAfterConnection := func(osn *deliverServer, logTrigger string, failFunc func()) func(entry zapcore.Entry) error {
   823  		once := &sync.Once{}
   824  		return func(entry zapcore.Entry) error {
   825  			if !osn.isFaulty() && strings.Contains(entry.Message, logTrigger) {
   826  				once.Do(func() {
   827  					failFunc()
   828  				})
   829  			}
   830  			return nil
   831  		}
   832  	}
   833  
   834  	for _, testCase := range []struct {
   835  		name       string
   836  		logTrigger string
   837  		beforeFunc func(*deliverServer, *cluster.BlockPuller)
   838  		failFunc   func(*deliverServer, *cluster.BlockPuller)
   839  	}{
   840  		{
   841  			name:       "failure at probe",
   842  			logTrigger: "skip this for this test case",
   843  			beforeFunc: func(osn *deliverServer, bp *cluster.BlockPuller) {
   844  				failStream(osn, nil)
   845  				// The first seek request asks for the latest block but it fails
   846  				osn.addExpectProbeAssert()
   847  				// And the last block sequence is returned
   848  				osn.enqueueResponse(3)
   849  				// The next seek request is for block 1
   850  				osn.addExpectPullAssert(1)
   851  			},
   852  			failFunc: noFailFunc,
   853  		},
   854  		{
   855  			name:       "failure at pull",
   856  			logTrigger: "Sending request for block [1]",
   857  			beforeFunc: func(osn *deliverServer, bp *cluster.BlockPuller) {
   858  				// The first seek request asks for the latest block and succeeds
   859  				osn.addExpectProbeAssert()
   860  				// But as the block puller tries to pull, the stream fails, so it should reconnect.
   861  				osn.addExpectProbeAssert()
   862  				// We need to send the latest sequence twice because the stream fails after the first time.
   863  				osn.enqueueResponse(3)
   864  				osn.enqueueResponse(3)
   865  				osn.addExpectPullAssert(1)
   866  			},
   867  			failFunc: failStream,
   868  		},
   869  		{
   870  			name:       "failure at verifying pulled block",
   871  			logTrigger: "Sending request for block [1]",
   872  			beforeFunc: func(osn *deliverServer, bp *cluster.BlockPuller) {
   873  				// The first seek request asks for the latest block and succeeds
   874  				osn.addExpectProbeAssert()
   875  				osn.enqueueResponse(3)
   876  				// It then pulls all 3 blocks, but fails verifying them.
   877  				osn.addExpectPullAssert(1)
   878  				osn.enqueueResponse(1)
   879  				osn.enqueueResponse(2)
   880  				osn.enqueueResponse(3)
   881  				// So it probes again, and then requests block 1 again.
   882  				osn.addExpectProbeAssert()
   883  				osn.addExpectPullAssert(1)
   884  			},
   885  			failFunc: malformBlockSignatureAndRecreateOSNBuffer,
   886  		},
   887  	} {
   888  		testCase := testCase
   889  		t.Run(testCase.name, func(t *testing.T) {
   890  			osn := newClusterNode(t)
   891  			defer osn.stop()
   892  
   893  			dialer := newCountingDialer()
   894  			bp := newBlockPuller(dialer, osn.srv.Address())
   895  
   896  			testCase.beforeFunc(osn, bp)
   897  
   898  			// Configure the logger to trigger failure at the appropriate time.
   899  			fail := func() {
   900  				testCase.failFunc(osn, bp)
   901  			}
   902  			bp.Logger = bp.Logger.WithOptions(zap.Hooks(recover(osn, bp), failAfterConnection(osn, testCase.logTrigger, fail)))
   903  
   904  			// The orderer sends blocks 1 to 3
   905  			osn.enqueueResponse(1)
   906  			osn.enqueueResponse(2)
   907  			osn.enqueueResponse(3)
   908  
   909  			assert.Equal(t, uint64(1), bp.PullBlock(uint64(1)).Header.Number)
   910  			assert.Equal(t, uint64(2), bp.PullBlock(uint64(2)).Header.Number)
   911  			assert.Equal(t, uint64(3), bp.PullBlock(uint64(3)).Header.Number)
   912  
   913  			bp.Close()
   914  			dialer.assertAllConnectionsClosed(t)
   915  		})
   916  	}
   917  }
   918  
   919  func TestBlockPullerBadBlocks(t *testing.T) {
   920  	// Scenario: ordering node sends malformed blocks.
   921  
   922  	removeHeader := func(resp *orderer.DeliverResponse) *orderer.DeliverResponse {
   923  		resp.GetBlock().Header = nil
   924  		return resp
   925  	}
   926  
   927  	removeData := func(resp *orderer.DeliverResponse) *orderer.DeliverResponse {
   928  		resp.GetBlock().Data = nil
   929  		return resp
   930  	}
   931  
   932  	removeMetadata := func(resp *orderer.DeliverResponse) *orderer.DeliverResponse {
   933  		resp.GetBlock().Metadata = nil
   934  		return resp
   935  	}
   936  
   937  	changeType := func(resp *orderer.DeliverResponse) *orderer.DeliverResponse {
   938  		resp.Type = nil
   939  		return resp
   940  	}
   941  
   942  	statusType := func(resp *orderer.DeliverResponse) *orderer.DeliverResponse {
   943  		resp.Type = &orderer.DeliverResponse_Status{
   944  			Status: common.Status_INTERNAL_SERVER_ERROR,
   945  		}
   946  		return resp
   947  	}
   948  
   949  	changeSequence := func(resp *orderer.DeliverResponse) *orderer.DeliverResponse {
   950  		resp.GetBlock().Header.Number = 3
   951  		return resp
   952  	}
   953  
   954  	testcases := []struct {
   955  		name           string
   956  		corruptBlock   func(block *orderer.DeliverResponse) *orderer.DeliverResponse
   957  		expectedErrMsg string
   958  	}{
   959  		{
   960  			name:           "nil header",
   961  			corruptBlock:   removeHeader,
   962  			expectedErrMsg: "block header is nil",
   963  		},
   964  		{
   965  			name:           "nil data",
   966  			corruptBlock:   removeData,
   967  			expectedErrMsg: "block data is nil",
   968  		},
   969  		{
   970  			name:           "nil metadata",
   971  			corruptBlock:   removeMetadata,
   972  			expectedErrMsg: "block metadata is empty",
   973  		},
   974  		{
   975  			name:           "wrong type",
   976  			corruptBlock:   changeType,
   977  			expectedErrMsg: "response is of type <nil>, but expected a block",
   978  		},
   979  		{
   980  			name:           "bad type",
   981  			corruptBlock:   statusType,
   982  			expectedErrMsg: "faulty node, received: status:INTERNAL_SERVER_ERROR ",
   983  		},
   984  		{
   985  			name:           "wrong number",
   986  			corruptBlock:   changeSequence,
   987  			expectedErrMsg: "got unexpected sequence",
   988  		},
   989  	}
   990  
   991  	for _, testCase := range testcases {
   992  		testCase := testCase
   993  		t.Run(testCase.name, func(t *testing.T) {
   994  
   995  			osn := newClusterNode(t)
   996  			defer osn.stop()
   997  
   998  			osn.addExpectProbeAssert()
   999  			osn.addExpectPullAssert(10)
  1000  
  1001  			dialer := newCountingDialer()
  1002  			bp := newBlockPuller(dialer, osn.srv.Address())
  1003  
  1004  			osn.enqueueResponse(10)
  1005  			osn.enqueueResponse(10)
   1006  			// Fetch the first enqueued response
   1007  			block := <-osn.blockResponses
   1008  			// The remaining valid response at the head of the queue serves the probe;
   1009  			// re-insert the fetched response, after corrupting it, at the tail of the queue
   1010  			osn.blockResponses <- testCase.corruptBlock(block)
   1011  			// The queue is now [ 10  10* ]: a valid block followed by a corrupted one
  1012  			// Expect the block puller to retry and this time give it what it wants
  1013  			osn.addExpectProbeAssert()
  1014  			osn.addExpectPullAssert(10)
  1015  
  1016  			// Wait until the block is pulled and the malleability is detected
  1017  			var detectedBadBlock sync.WaitGroup
  1018  			detectedBadBlock.Add(1)
  1019  			bp.Logger = bp.Logger.WithOptions(zap.Hooks(func(entry zapcore.Entry) error {
  1020  				if strings.Contains(entry.Message, fmt.Sprintf("Failed pulling blocks: %s", testCase.expectedErrMsg)) {
  1021  					detectedBadBlock.Done()
  1022  					// Close the channel to make the current server-side deliver stream close
  1023  					close(osn.blocks())
   1024  				// And reset the block buffer to be able to write into it again
  1025  					osn.setBlocks(make(chan *orderer.DeliverResponse, 100))
   1026  				// Put correct responses back in: 1 for the probe and 1 for the fetch
  1027  					osn.enqueueResponse(10)
  1028  					osn.enqueueResponse(10)
  1029  				}
  1030  				return nil
  1031  			}))
  1032  
  1033  			bp.PullBlock(10)
  1034  			detectedBadBlock.Wait()
  1035  
  1036  			bp.Close()
  1037  			dialer.assertAllConnectionsClosed(t)
  1038  		})
  1039  	}
  1040  }
  1041  
  1042  func TestImpatientStreamFailure(t *testing.T) {
  1043  	osn := newClusterNode(t)
  1044  	dialer := newCountingDialer()
  1045  	defer dialer.assertAllConnectionsClosed(t)
  1046  	// Wait for OSN to start
  1047  	// by trying to connect to it
  1048  	var conn *grpc.ClientConn
  1049  	var err error
  1050  
  1051  	gt := gomega.NewGomegaWithT(t)
  1052  	gt.Eventually(func() (bool, error) {
  1053  		conn, err = dialer.Dial(osn.endpointCriteria())
  1054  		return true, err
  1055  	}).Should(gomega.BeTrue())
  1056  	newStream := cluster.NewImpatientStream(conn, time.Millisecond*100)
  1057  	defer conn.Close()
  1058  	// Shutdown the OSN
  1059  	osn.stop()
  1060  	// Ensure the port isn't open anymore
  1061  	gt.Eventually(func() (bool, error) {
  1062  		ctx, cancel := context.WithTimeout(context.Background(), time.Millisecond*10)
  1063  		defer cancel()
  1064  		conn, _ := grpc.DialContext(ctx, osn.srv.Address(), grpc.WithBlock(), grpc.WithInsecure())
  1065  		if conn != nil {
  1066  			conn.Close()
  1067  			return false, nil
  1068  		}
  1069  		return true, nil
  1070  	}).Should(gomega.BeTrue())
  1071  	stream, err := newStream()
  1072  	if err != nil {
  1073  		return
  1074  	}
  1075  	_, err = stream.Recv()
  1076  	assert.Error(t, err)
  1077  }
  1078  
  1079  func TestBlockPullerMaxRetriesExhausted(t *testing.T) {
  1080  	// Scenario:
  1081  	// The block puller is expected to pull blocks 1 to 3.
   1082  	// But the orderer only has blocks 1 and 2, and for some reason
   1083  	// it sends back block 2 twice (we do this so that we
   1084  	// don't rely on a timeout, because timeouts are flaky in tests).
   1085  	// The block puller should attempt to reconnect and re-send requests
   1086  	// until its retry count is exhausted, after which
   1087  	// it gives up, and nil is returned.
  1088  
  1089  	osn := newClusterNode(t)
  1090  	defer osn.stop()
  1091  
  1092  	// We report having up to block 3.
  1093  	osn.enqueueResponse(3)
  1094  	osn.addExpectProbeAssert()
   1095  	// We send block 1
  1096  	osn.addExpectPullAssert(1)
  1097  	osn.enqueueResponse(1)
  1098  	// And 2, twice.
  1099  	osn.enqueueResponse(2)
  1100  	osn.enqueueResponse(2)
   1101  	// A nil message signals the server side to close the deliver stream.
   1102  	// This lets the server side prepare for the new deliver
   1103  	// stream that the client is expected to open.
  1104  	osn.blockResponses <- nil
  1105  
  1106  	for i := 0; i < 2; i++ {
  1107  		// Therefore, the block puller should disconnect and reconnect.
  1108  		osn.addExpectProbeAssert()
  1109  		// We report having up to block 3.
  1110  		osn.enqueueResponse(3)
  1111  		// And we expect to be asked for block 3, since blocks 1, 2
  1112  		// have already been passed to the caller.
  1113  		osn.addExpectPullAssert(3)
  1114  		// Once again, we send 2 instead of 3
  1115  		osn.enqueueResponse(2)
  1116  		// The client disconnects again
  1117  		osn.blockResponses <- nil
  1118  	}
  1119  
  1120  	dialer := newCountingDialer()
  1121  	bp := newBlockPuller(dialer, osn.srv.Address())
  1122  
  1123  	var exhaustedRetryAttemptsLogged bool
  1124  
  1125  	bp.Logger = bp.Logger.WithOptions(zap.Hooks(func(entry zapcore.Entry) error {
  1126  		if entry.Message == "Failed pulling block [3]: retry count exhausted(2)" {
  1127  			exhaustedRetryAttemptsLogged = true
  1128  		}
  1129  		return nil
  1130  	}))
  1131  
  1132  	bp.MaxPullBlockRetries = 2
  1133  	// We don't expect to timeout in this test, so make the timeout large
  1134  	// to prevent flakes due to CPU starvation.
  1135  	bp.FetchTimeout = time.Hour
   1136  	// Make the buffer tiny, only a single byte - in order to deliver blocks
   1137  	// to the caller one by one and not store them in the buffer.
  1138  	bp.MaxTotalBufferBytes = 1
  1139  
  1140  	// Assert reception of blocks 1 to 3
  1141  	assert.Equal(t, uint64(1), bp.PullBlock(uint64(1)).Header.Number)
  1142  	assert.Equal(t, uint64(2), bp.PullBlock(uint64(2)).Header.Number)
  1143  	assert.Nil(t, bp.PullBlock(uint64(3)))
  1144  
  1145  	bp.Close()
  1146  	dialer.assertAllConnectionsClosed(t)
  1147  	assert.True(t, exhaustedRetryAttemptsLogged)
  1148  }