github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/sql/flowinfra/cluster_test.go

// Copyright 2016 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

package flowinfra

import (
	"context"
	"fmt"
	"io"
	"strings"
	"sync/atomic"
	"testing"

	"github.com/cockroachdb/cockroach/pkg/base"
	"github.com/cockroachdb/cockroach/pkg/keys"
	"github.com/cockroachdb/cockroach/pkg/kv"
	"github.com/cockroachdb/cockroach/pkg/kv/kvserver"
	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/kvserverbase"
	"github.com/cockroachdb/cockroach/pkg/roachpb"
	"github.com/cockroachdb/cockroach/pkg/rpc"
	"github.com/cockroachdb/cockroach/pkg/sql/execinfra"
	"github.com/cockroachdb/cockroach/pkg/sql/execinfrapb"
	"github.com/cockroachdb/cockroach/pkg/sql/sem/tree"
	"github.com/cockroachdb/cockroach/pkg/sql/sqlbase"
	"github.com/cockroachdb/cockroach/pkg/sql/types"
	"github.com/cockroachdb/cockroach/pkg/testutils"
	"github.com/cockroachdb/cockroach/pkg/testutils/serverutils"
	"github.com/cockroachdb/cockroach/pkg/testutils/sqlutils"
	"github.com/cockroachdb/cockroach/pkg/util/encoding"
	"github.com/cockroachdb/cockroach/pkg/util/leaktest"
	"github.com/cockroachdb/cockroach/pkg/util/log"
	"github.com/cockroachdb/cockroach/pkg/util/randutil"
	"github.com/cockroachdb/cockroach/pkg/util/uuid"
	opentracing "github.com/opentracing/opentracing-go"
	"github.com/stretchr/testify/require"
)

func TestClusterFlow(t *testing.T) {
	defer leaktest.AfterTest(t)()
	const numRows = 100

	args := base.TestClusterArgs{ReplicationMode: base.ReplicationManual}
	tc := serverutils.StartTestCluster(t, 3, args)
	defer tc.Stopper().Stop(context.Background())

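	// sumDigitsFn returns the sum of the decimal digits of row (e.g.
	// 99 -> 18); it gives the table's secondary index below many
	// duplicate values.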
	sumDigitsFn := func(row int) tree.Datum {
		sum := 0
		for row > 0 {
			sum += row % 10
			row /= 10
		}
		return tree.NewDInt(tree.DInt(sum))
	}

	sqlutils.CreateTable(t, tc.ServerConn(0), "t",
		"num INT PRIMARY KEY, digitsum INT, numstr STRING, INDEX s (digitsum)",
		numRows,
		sqlutils.ToRowFn(sqlutils.RowIdxFn, sumDigitsFn, sqlutils.RowEnglishFn))

	kvDB := tc.Server(0).DB()
	desc := sqlbase.GetTableDescriptor(kvDB, keys.SystemSQLCodec, "test", "t")
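	// makeIndexSpan returns a span over the digitsum index
	// (desc.Indexes[0], i.e. index s) covering values in [start, end).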
	makeIndexSpan := func(start, end int) execinfrapb.TableReaderSpan {
		var span roachpb.Span
		prefix := roachpb.Key(sqlbase.MakeIndexKeyPrefix(keys.SystemSQLCodec, desc, desc.Indexes[0].ID))
		span.Key = append(prefix, encoding.EncodeVarintAscending(nil, int64(start))...)
		span.EndKey = append(span.EndKey, prefix...)
		span.EndKey = append(span.EndKey, encoding.EncodeVarintAscending(nil, int64(end))...)
		return execinfrapb.TableReaderSpan{Span: span}
	}

	// Set up table readers on three hosts feeding data into a join reader on
	// the third host. This is a basic test for the distributed flow
	// infrastructure, including local and remote streams.
	//
	// Note that the ranges won't necessarily be local to the table readers, but
	// that doesn't matter for the purposes of this test.
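	//
	// Concretely, the flow assembled by the requests below is:
	//
	//   node 0: TableReader (digitsum in [0,8))    --remote stream 0--\
	//   node 1: TableReader (digitsum in [8,12))   --remote stream 1---> node 2
	//   node 2: TableReader (digitsum in [12,100)) --local stream 2---/
	//
	// where node 2 merges the three streams with an ordered sync (ordered
	// by column 1, the digit sum), looks up each row in the primary index
	// via a JoinReader, and returns the numstr column on the sync response
	// stream.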

	// Start a span (useful to look at spans using Lightstep).
	sp := tc.Server(0).ClusterSettings().Tracer.StartSpan("cluster test")
	ctx := opentracing.ContextWithSpan(context.Background(), sp)
	defer sp.Finish()

	now := tc.Server(0).Clock().Now()
	txnProto := roachpb.MakeTransaction(
		"cluster-test",
		nil, // baseKey
		roachpb.NormalUserPriority,
		now,
		0, // maxOffset
	)
	txn := kv.NewTxnFromProto(ctx, kvDB, tc.Server(0).NodeID(), now, kv.RootTxn, &txnProto)
	leafInputState := txn.GetLeafTxnInputState(ctx)
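	// The leaf txn input state is shipped to every node in the
	// SetupFlowRequests below, so that each node's flow can operate a leaf
	// txn tied to the root txn created above.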

	tr1 := execinfrapb.TableReaderSpec{
		Table:    *desc,
		IndexIdx: 1,
		Spans:    []execinfrapb.TableReaderSpan{makeIndexSpan(0, 8)},
	}

	tr2 := execinfrapb.TableReaderSpec{
		Table:    *desc,
		IndexIdx: 1,
		Spans:    []execinfrapb.TableReaderSpan{makeIndexSpan(8, 12)},
	}

	tr3 := execinfrapb.TableReaderSpec{
		Table:    *desc,
		IndexIdx: 1,
		Spans:    []execinfrapb.TableReaderSpan{makeIndexSpan(12, 100)},
	}

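	// All three SetupFlowRequests share the same FlowID; this is what lets
	// the inbound and outbound stream endpoints on the different nodes find
	// each other and connect into a single distributed flow.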
	fid := execinfrapb.FlowID{UUID: uuid.MakeV4()}

	req1 := &execinfrapb.SetupFlowRequest{
		Version:           execinfra.Version,
		LeafTxnInputState: &leafInputState,
		Flow: execinfrapb.FlowSpec{
			FlowID: fid,
			Processors: []execinfrapb.ProcessorSpec{{
				ProcessorID: 1,
				Core:        execinfrapb.ProcessorCoreUnion{TableReader: &tr1},
				Post: execinfrapb.PostProcessSpec{
					Projection:    true,
					OutputColumns: []uint32{0, 1},
				},
				Output: []execinfrapb.OutputRouterSpec{{
					Type: execinfrapb.OutputRouterSpec_PASS_THROUGH,
					Streams: []execinfrapb.StreamEndpointSpec{
						{Type: execinfrapb.StreamEndpointSpec_REMOTE, StreamID: 0, TargetNodeID: tc.Server(2).NodeID()},
					},
				}},
			}},
		},
	}

	req2 := &execinfrapb.SetupFlowRequest{
		Version:           execinfra.Version,
		LeafTxnInputState: &leafInputState,
		Flow: execinfrapb.FlowSpec{
			FlowID: fid,
			Processors: []execinfrapb.ProcessorSpec{{
				ProcessorID: 2,
				Core:        execinfrapb.ProcessorCoreUnion{TableReader: &tr2},
				Post: execinfrapb.PostProcessSpec{
					Projection:    true,
					OutputColumns: []uint32{0, 1},
				},
				Output: []execinfrapb.OutputRouterSpec{{
					Type: execinfrapb.OutputRouterSpec_PASS_THROUGH,
					Streams: []execinfrapb.StreamEndpointSpec{
						{Type: execinfrapb.StreamEndpointSpec_REMOTE, StreamID: 1, TargetNodeID: tc.Server(2).NodeID()},
					},
				}},
			}},
		},
	}

	req3 := &execinfrapb.SetupFlowRequest{
		Version:           execinfra.Version,
		LeafTxnInputState: &leafInputState,
		Flow: execinfrapb.FlowSpec{
			FlowID: fid,
			Processors: []execinfrapb.ProcessorSpec{
				{
					ProcessorID: 3,
					Core:        execinfrapb.ProcessorCoreUnion{TableReader: &tr3},
					Post: execinfrapb.PostProcessSpec{
						Projection:    true,
						OutputColumns: []uint32{0, 1},
					},
					Output: []execinfrapb.OutputRouterSpec{{
						Type: execinfrapb.OutputRouterSpec_PASS_THROUGH,
						Streams: []execinfrapb.StreamEndpointSpec{
							{Type: execinfrapb.StreamEndpointSpec_LOCAL, StreamID: 2},
						},
					}},
				},
				{
					ProcessorID: 4,
					Input: []execinfrapb.InputSyncSpec{{
						Type: execinfrapb.InputSyncSpec_ORDERED,
						Ordering: execinfrapb.Ordering{Columns: []execinfrapb.Ordering_Column{
							{ColIdx: 1, Direction: execinfrapb.Ordering_Column_ASC}}},
						Streams: []execinfrapb.StreamEndpointSpec{
							{Type: execinfrapb.StreamEndpointSpec_REMOTE, StreamID: 0},
							{Type: execinfrapb.StreamEndpointSpec_REMOTE, StreamID: 1},
							{Type: execinfrapb.StreamEndpointSpec_LOCAL, StreamID: 2},
						},
						ColumnTypes: sqlbase.TwoIntCols,
					}},
					Core: execinfrapb.ProcessorCoreUnion{JoinReader: &execinfrapb.JoinReaderSpec{Table: *desc}},
					Post: execinfrapb.PostProcessSpec{
						Projection:    true,
						OutputColumns: []uint32{2},
					},
					Output: []execinfrapb.OutputRouterSpec{{
						Type:    execinfrapb.OutputRouterSpec_PASS_THROUGH,
						Streams: []execinfrapb.StreamEndpointSpec{{Type: execinfrapb.StreamEndpointSpec_SYNC_RESPONSE}},
					}},
				},
			},
		},
	}

	var clients []execinfrapb.DistSQLClient
	for i := 0; i < 3; i++ {
		s := tc.Server(i)
		conn, err := s.RPCContext().GRPCDialNode(s.ServingRPCAddr(), s.NodeID(),
			rpc.DefaultClass).Connect(ctx)
		if err != nil {
			t.Fatal(err)
		}
		clients = append(clients, execinfrapb.NewDistSQLClient(conn))
	}

	log.Infof(ctx, "Setting up flow on 0")
	if resp, err := clients[0].SetupFlow(ctx, req1); err != nil {
		t.Fatal(err)
	} else if resp.Error != nil {
		t.Fatal(resp.Error)
	}

	log.Infof(ctx, "Setting up flow on 1")
	if resp, err := clients[1].SetupFlow(ctx, req2); err != nil {
		t.Fatal(err)
	} else if resp.Error != nil {
		t.Fatal(resp.Error)
	}

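	// The flow on node 2 is a "sync flow": rather than being set up via a
	// SetupFlow RPC, it is set up by sending the SetupFlowRequest as the
	// first message on a RunSyncFlow stream, and its results come back on
	// the same stream.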
	log.Infof(ctx, "Running flow on 2")
	stream, err := clients[2].RunSyncFlow(ctx)
	if err != nil {
		t.Fatal(err)
	}
	err = stream.Send(&execinfrapb.ConsumerSignal{SetupFlowRequest: req3})
	if err != nil {
		t.Fatal(err)
	}

	var decoder StreamDecoder
	var rows sqlbase.EncDatumRows
	var metas []execinfrapb.ProducerMetadata
	for {
		msg, err := stream.Recv()
		if err != nil {
			if err == io.EOF {
				break
			}
			t.Fatal(err)
		}
		err = decoder.AddMessage(ctx, msg)
		if err != nil {
			t.Fatal(err)
		}
		rows, metas = testGetDecodedRows(t, &decoder, rows, metas)
	}
	metas = ignoreMisplannedRanges(metas)
	metas = ignoreLeafTxnState(metas)
	metas = ignoreMetricsMeta(metas)
	if len(metas) != 0 {
		t.Fatalf("unexpected metadata (%d): %+v", len(metas), metas)
	}
	// The result should be all the numbers in string form, ordered by the
	// digit sum (and then by number).
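	// For example, rows 1, 10, and 100 (digit sum 1) come first, in
	// numeric order, followed by 2, 11, and 20 (digit sum 2), and so on.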
	var results []string
	for sum := 1; sum <= 50; sum++ {
		for i := 1; i <= numRows; i++ {
			if int(tree.MustBeDInt(sumDigitsFn(i))) == sum {
				results = append(results, fmt.Sprintf("['%s']", sqlutils.IntToEnglish(i)))
			}
		}
	}
	expected := strings.Join(results, " ")
	expected = "[" + expected + "]"
	if rowStr := rows.String([]*types.T{types.String}); rowStr != expected {
		t.Errorf("Result: %s\n Expected: %s\n", rowStr, expected)
	}
}

// ignoreMisplannedRanges takes a slice of metadata and returns the entries that
// are not about range info from mis-planned ranges.
func ignoreMisplannedRanges(metas []execinfrapb.ProducerMetadata) []execinfrapb.ProducerMetadata {
	res := make([]execinfrapb.ProducerMetadata, 0)
	for _, m := range metas {
		if len(m.Ranges) == 0 {
			res = append(res, m)
		}
	}
	return res
}

// ignoreLeafTxnState takes a slice of metadata and returns the
// entries excluding the leaf txn state.
func ignoreLeafTxnState(metas []execinfrapb.ProducerMetadata) []execinfrapb.ProducerMetadata {
	res := make([]execinfrapb.ProducerMetadata, 0)
	for _, m := range metas {
		if m.LeafTxnFinalState == nil {
			res = append(res, m)
		}
	}
	return res
}
// ignoreMetricsMeta takes a slice of metadata and returns the entries
// excluding the metrics about a node's goodput.
func ignoreMetricsMeta(metas []execinfrapb.ProducerMetadata) []execinfrapb.ProducerMetadata {
	res := make([]execinfrapb.ProducerMetadata, 0)
	for _, m := range metas {
		if m.Metrics == nil {
			res = append(res, m)
		}
	}
	return res
}

// TestLimitedBufferingDeadlock sets up a scenario which leads to deadlock if
// a single consumer can block the entire router (#17097).
func TestLimitedBufferingDeadlock(t *testing.T) {
	defer leaktest.AfterTest(t)()

	tc := serverutils.StartTestCluster(t, 1, base.TestClusterArgs{})
	defer tc.Stopper().Stop(context.Background())

	// Set up the following network - a simplification of the one described in
	// #17097 (the numbers on the streams are the StreamIDs in the spec below):
	//
	//
	//  +----------+        +----------+
	//  |  Values  |        |  Values  |
	//  +----------+        +-+------+-+
	//         |              | hash |
	//         |              +------+
	//       1 |               |    |
	//         |             2 |    |
	//         v               v    |
	//      +-------------------+   |
	//      |     MergeJoin     |   |
	//      +-------------------+   | 3
	//                |             |
	//                |             |
	//              4 |             |
	//                |             |
	//                v             v
	//              +-----------------+
	//              |  ordered sync   |
	//            +-+-----------------+-+
	//            |       Response      |
	//            +---------------------+
	//
	//
	// This is not something we would end up with from a real SQL query but it's
	// simple and exposes the deadlock: if the hash router outputs a large set of
	// consecutive rows to the left side (which we can ensure by having a bunch of
	// identical rows), the MergeJoiner would be blocked trying to write to the
	// ordered sync, which in turn would block because it's trying to read from
	// the other stream from the hash router. The other stream is blocked because
	// the hash router is already in the process of pushing a row, and we have a
	// deadlock.
	//
	// We set up the left Values processor to emit rows with consecutive values,
	// and the right Values processor to emit groups of identical rows for each
	// value.

	// All our rows have a single integer column.
	typs := []*types.T{types.Int}

	// The left values rows are consecutive values.
	leftRows := make(sqlbase.EncDatumRows, 20)
	for i := range leftRows {
		leftRows[i] = sqlbase.EncDatumRow{
			sqlbase.DatumToEncDatum(typs[0], tree.NewDInt(tree.DInt(i))),
		}
	}
	leftValuesSpec, err := execinfra.GenerateValuesSpec(typs, leftRows, 10 /* rows per chunk */)
	if err != nil {
		t.Fatal(err)
	}

	// The right values rows have groups of identical values (ensuring that large
	// groups of rows go to the same hash bucket).
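	// Each group has 4*RowChannelBufSize rows, several times more than a
	// router output can buffer, so a hash router output is guaranteed to
	// fill up and block.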
	rightRows := make(sqlbase.EncDatumRows, 0)
	for i := 1; i <= 20; i++ {
		for j := 1; j <= 4*execinfra.RowChannelBufSize; j++ {
			rightRows = append(rightRows, sqlbase.EncDatumRow{
				sqlbase.DatumToEncDatum(typs[0], tree.NewDInt(tree.DInt(i))),
			})
		}
	}

	rightValuesSpec, err := execinfra.GenerateValuesSpec(typs, rightRows, 10 /* rows per chunk */)
	if err != nil {
		t.Fatal(err)
	}

	joinerSpec := execinfrapb.MergeJoinerSpec{
		LeftOrdering: execinfrapb.Ordering{
			Columns: []execinfrapb.Ordering_Column{{ColIdx: 0, Direction: execinfrapb.Ordering_Column_ASC}},
		},
		RightOrdering: execinfrapb.Ordering{
			Columns: []execinfrapb.Ordering_Column{{ColIdx: 0, Direction: execinfrapb.Ordering_Column_ASC}},
		},
		Type: sqlbase.InnerJoin,
	}

	now := tc.Server(0).Clock().Now()
	txnProto := roachpb.MakeTransaction(
		"deadlock-test",
		nil, // baseKey
		roachpb.NormalUserPriority,
		now,
		0, // maxOffset
	)
	txn := kv.NewTxnFromProto(
		context.Background(), tc.Server(0).DB(), tc.Server(0).NodeID(),
		now, kv.RootTxn, &txnProto)
	leafInputState := txn.GetLeafTxnInputState(context.Background())

	req := execinfrapb.SetupFlowRequest{
		Version:           execinfra.Version,
		LeafTxnInputState: &leafInputState,
		Flow: execinfrapb.FlowSpec{
			FlowID: execinfrapb.FlowID{UUID: uuid.MakeV4()},
			// The left-hand Values processor in the diagram above.
			Processors: []execinfrapb.ProcessorSpec{
				{
					Core: execinfrapb.ProcessorCoreUnion{Values: &leftValuesSpec},
					Output: []execinfrapb.OutputRouterSpec{{
						Type: execinfrapb.OutputRouterSpec_PASS_THROUGH,
						Streams: []execinfrapb.StreamEndpointSpec{
							{Type: execinfrapb.StreamEndpointSpec_LOCAL, StreamID: 1},
						},
					}},
				},
				// The right-hand Values processor in the diagram above.
				{
					Core: execinfrapb.ProcessorCoreUnion{Values: &rightValuesSpec},
					Output: []execinfrapb.OutputRouterSpec{{
						Type:        execinfrapb.OutputRouterSpec_BY_HASH,
						HashColumns: []uint32{0},
						Streams: []execinfrapb.StreamEndpointSpec{
							{Type: execinfrapb.StreamEndpointSpec_LOCAL, StreamID: 2},
							{Type: execinfrapb.StreamEndpointSpec_LOCAL, StreamID: 3},
						},
					}},
				},
				// The MergeJoin processor.
				{
					Input: []execinfrapb.InputSyncSpec{
						{
							Type:        execinfrapb.InputSyncSpec_UNORDERED,
							Streams:     []execinfrapb.StreamEndpointSpec{{Type: execinfrapb.StreamEndpointSpec_LOCAL, StreamID: 1}},
							ColumnTypes: typs,
						},
						{
							Type:        execinfrapb.InputSyncSpec_UNORDERED,
							Streams:     []execinfrapb.StreamEndpointSpec{{Type: execinfrapb.StreamEndpointSpec_LOCAL, StreamID: 2}},
							ColumnTypes: typs,
						},
					},
					Core: execinfrapb.ProcessorCoreUnion{MergeJoiner: &joinerSpec},
					Post: execinfrapb.PostProcessSpec{
						// Output only one (the left) column.
						Projection:    true,
						OutputColumns: []uint32{0},
					},
					Output: []execinfrapb.OutputRouterSpec{{
						Type: execinfrapb.OutputRouterSpec_PASS_THROUGH,
						Streams: []execinfrapb.StreamEndpointSpec{
							{Type: execinfrapb.StreamEndpointSpec_LOCAL, StreamID: 4},
						},
					}},
				},
				// The final (Response) processor.
				{
					Input: []execinfrapb.InputSyncSpec{{
						Type: execinfrapb.InputSyncSpec_ORDERED,
						Ordering: execinfrapb.Ordering{Columns: []execinfrapb.Ordering_Column{
							{ColIdx: 0, Direction: execinfrapb.Ordering_Column_ASC}}},
						Streams: []execinfrapb.StreamEndpointSpec{
							{Type: execinfrapb.StreamEndpointSpec_LOCAL, StreamID: 4},
							{Type: execinfrapb.StreamEndpointSpec_LOCAL, StreamID: 3},
						},
						ColumnTypes: typs,
					}},
					Core: execinfrapb.ProcessorCoreUnion{Noop: &execinfrapb.NoopCoreSpec{}},
					Output: []execinfrapb.OutputRouterSpec{{
						Type:    execinfrapb.OutputRouterSpec_PASS_THROUGH,
						Streams: []execinfrapb.StreamEndpointSpec{{Type: execinfrapb.StreamEndpointSpec_SYNC_RESPONSE}},
					}},
				},
			},
		},
	}
	s := tc.Server(0)
	conn, err := s.RPCContext().GRPCDialNode(s.ServingRPCAddr(), s.NodeID(),
		rpc.DefaultClass).Connect(context.Background())
	if err != nil {
		t.Fatal(err)
	}

	stream, err := execinfrapb.NewDistSQLClient(conn).RunSyncFlow(context.Background())
	if err != nil {
		t.Fatal(err)
	}
	err = stream.Send(&execinfrapb.ConsumerSignal{SetupFlowRequest: &req})
	if err != nil {
		t.Fatal(err)
	}

	var decoder StreamDecoder
	var rows sqlbase.EncDatumRows
	var metas []execinfrapb.ProducerMetadata
	for {
		msg, err := stream.Recv()
		if err != nil {
			if err == io.EOF {
				break
			}
			t.Fatal(err)
		}
		err = decoder.AddMessage(context.Background(), msg)
		if err != nil {
			t.Fatal(err)
		}
		rows, metas = testGetDecodedRows(t, &decoder, rows, metas)
	}
	metas = ignoreMisplannedRanges(metas)
	metas = ignoreLeafTxnState(metas)
	metas = ignoreMetricsMeta(metas)
	if len(metas) != 0 {
		t.Errorf("unexpected metadata (%d): %+v", len(metas), metas)
	}
	// TODO(radu): verify the results (they should be the same as rightRows).
}

// Test that DistSQL reads fill the BatchRequest.Header.GatewayNodeID field with
// the ID of the gateway (as opposed to the ID of the node that created the
// batch). This is important for follow-the-workload lease transfers.
func TestDistSQLReadsFillGatewayID(t *testing.T) {
	defer leaktest.AfterTest(t)()

	// We're going to distribute a table and then read it, and we'll expect all
	// the ScanRequests (produced by the different nodes) to identify the one and
	// only gateway.

	var foundReq int64 // written atomically
	var expectedGateway roachpb.NodeID

	tc := serverutils.StartTestCluster(t, 3, /* numNodes */
		base.TestClusterArgs{
			ReplicationMode: base.ReplicationManual,
			ServerArgs: base.TestServerArgs{
				UseDatabase: "test",
				Knobs: base.TestingKnobs{Store: &kvserver.StoreTestingKnobs{
					EvalKnobs: kvserverbase.BatchEvalTestingKnobs{
						TestingEvalFilter: func(filterArgs kvserverbase.FilterArgs) *roachpb.Error {
							scanReq, ok := filterArgs.Req.(*roachpb.ScanRequest)
							if !ok {
								return nil
							}
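							// Only look at scans of the test table, whose ID
							// is expected to be 53 here (the first ID after
							// the system tables and the test database).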
							if !strings.HasPrefix(scanReq.Key.String(), "/Table/53/1") {
								return nil
							}

							atomic.StoreInt64(&foundReq, 1)
							if gw := filterArgs.Hdr.GatewayNodeID; gw != expectedGateway {
								return roachpb.NewErrorf(
									"expected all scans to have gateway %d, found: %d",
									expectedGateway, gw)
							}
							return nil
						},
					}},
				},
			},
		})
	defer tc.Stopper().Stop(context.Background())

	db := tc.ServerConn(0)
	sqlutils.CreateTable(t, db, "t",
		"num INT PRIMARY KEY",
		0, /* numRows */
		sqlutils.ToRowFn(sqlutils.RowIdxFn))

	if _, err := db.Exec(`
ALTER TABLE t SPLIT AT VALUES (1), (2), (3);
ALTER TABLE t EXPERIMENTAL_RELOCATE VALUES (ARRAY[2], 1), (ARRAY[1], 2), (ARRAY[3], 3);
`); err != nil {
		t.Fatal(err)
	}

	expectedGateway = tc.Server(2).NodeID()
	if _, err := tc.ServerConn(2).Exec("SELECT * FROM t"); err != nil {
		t.Fatal(err)
	}
	if atomic.LoadInt64(&foundReq) != 1 {
		t.Fatal("TestingEvalFilter failed to find any requests")
	}
}

// Test that we can evaluate built-in functions that use the txn on remote
// nodes. There was a bug where the EvalCtx.Txn field was only correctly
// populated on the gateway.
func TestEvalCtxTxnOnRemoteNodes(t *testing.T) {
	defer leaktest.AfterTest(t)()
	ctx := context.Background()

	tc := serverutils.StartTestCluster(t, 2, /* numNodes */
		base.TestClusterArgs{
			ReplicationMode: base.ReplicationManual,
			ServerArgs: base.TestServerArgs{
				UseDatabase: "test",
			},
		})
	defer tc.Stopper().Stop(ctx)

	db := tc.ServerConn(0)
	sqlutils.CreateTable(t, db, "t",
		"num INT PRIMARY KEY",
		1, /* numRows */
		sqlutils.ToRowFn(sqlutils.RowIdxFn))

	// Relocate the table to a remote node.
	_, err := db.Exec("ALTER TABLE t EXPERIMENTAL_RELOCATE VALUES (ARRAY[2], 1)")
	require.NoError(t, err)

	testutils.RunTrueAndFalse(t, "vectorize", func(t *testing.T, vectorize bool) {
		if vectorize {
			t.Skip("skipped because we can't yet vectorize queries using DECIMALs")
		}
		// We're going to use the first node as the gateway and expect everything to
		// be planned remotely.
		db := tc.ServerConn(0)
		var opt string
		if vectorize {
			opt = "experimental_always"
		} else {
			opt = "off"
		}
		_, err := db.Exec(fmt.Sprintf("set vectorize=%s", opt))
		require.NoError(t, err)

		// Query using a builtin function which uses the transaction (for example,
		// cluster_logical_timestamp()) and expect not to crash.
		_, err = db.Exec("SELECT cluster_logical_timestamp() FROM t")
		require.NoError(t, err)

		// Query again just in case the previous query executed on the gateway
		// because the leaseholder cache wasn't populated and we fooled ourselves.
		_, err = db.Exec("SELECT cluster_logical_timestamp() FROM t")
		require.NoError(t, err)
	})
}

// BenchmarkInfrastructure sets up a flow that doesn't use KV at all and runs it
// repeatedly. The intention is to profile the distsql infrastructure itself.
func BenchmarkInfrastructure(b *testing.B) {
	defer leaktest.AfterTest(b)()

	args := base.TestClusterArgs{ReplicationMode: base.ReplicationManual}
	tc := serverutils.StartTestCluster(b, 3, args)
	defer tc.Stopper().Stop(context.Background())

	for _, numNodes := range []int{1, 3} {
		b.Run(fmt.Sprintf("n%d", numNodes), func(b *testing.B) {
			for _, numRows := range []int{1, 100, 10000} {
				b.Run(fmt.Sprintf("r%d", numRows), func(b *testing.B) {
					// Generate some data sets, consisting of rows with three values; the first
					// value is increasing.
					rng, _ := randutil.NewPseudoRand()
					lastVal := 1
					valSpecs := make([]execinfrapb.ValuesCoreSpec, numNodes)
					for i := range valSpecs {
						se := StreamEncoder{}
						se.Init(sqlbase.ThreeIntCols)
						for j := 0; j < numRows; j++ {
							row := make(sqlbase.EncDatumRow, 3)
							lastVal += rng.Intn(10)
							row[0] = sqlbase.DatumToEncDatum(types.Int, tree.NewDInt(tree.DInt(lastVal)))
							row[1] = sqlbase.DatumToEncDatum(types.Int, tree.NewDInt(tree.DInt(rng.Intn(100000))))
							row[2] = sqlbase.DatumToEncDatum(types.Int, tree.NewDInt(tree.DInt(rng.Intn(100000))))
							if err := se.AddRow(row); err != nil {
								b.Fatal(err)
							}
						}
						msg := se.FormMessage(context.Background())
						valSpecs[i] = execinfrapb.ValuesCoreSpec{
							Columns:  msg.Typing,
							RawBytes: [][]byte{msg.Data.RawBytes},
						}
					}

					// Set up the following network:
					//
					//         Node 0              Node 1          ...
					//
					//      +----------+        +----------+
					//      |  Values  |        |  Values  |       ...
					//      +----------+        +----------+
					//          |                  |
					//          |       stream 1   |
					// stream 0 |   /-------------/                ...
					//          |  |
					//          v  v
					//     +---------------+
					//     | ordered* sync |
					//  +--+---------------+--+
					//  |        No-op        |
					//  +---------------------+
					//
					// *unordered if we have a single node.

					reqs := make([]execinfrapb.SetupFlowRequest, numNodes)
					streamType := func(i int) execinfrapb.StreamEndpointSpec_Type {
						if i == 0 {
							return execinfrapb.StreamEndpointSpec_LOCAL
						}
						return execinfrapb.StreamEndpointSpec_REMOTE
					}
					now := tc.Server(0).Clock().Now()
					txnProto := roachpb.MakeTransaction(
						"cluster-test",
						nil, // baseKey
						roachpb.NormalUserPriority,
						now,
						0, // maxOffset
					)
					txn := kv.NewTxnFromProto(
						context.Background(), tc.Server(0).DB(), tc.Server(0).NodeID(),
						now, kv.RootTxn, &txnProto)
					leafInputState := txn.GetLeafTxnInputState(context.Background())
					for i := range reqs {
						reqs[i] = execinfrapb.SetupFlowRequest{
							Version:           execinfra.Version,
							LeafTxnInputState: &leafInputState,
							Flow: execinfrapb.FlowSpec{
								Processors: []execinfrapb.ProcessorSpec{{
									Core: execinfrapb.ProcessorCoreUnion{Values: &valSpecs[i]},
									Output: []execinfrapb.OutputRouterSpec{{
										Type: execinfrapb.OutputRouterSpec_PASS_THROUGH,
										Streams: []execinfrapb.StreamEndpointSpec{
											{Type: streamType(i), StreamID: execinfrapb.StreamID(i), TargetNodeID: tc.Server(0).NodeID()},
										},
									}},
								}},
							},
						}
					}

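					// Node 0's stream is local, so rebuild it without the
					// TargetNodeID that the loop above set.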
					reqs[0].Flow.Processors[0].Output[0].Streams[0] = execinfrapb.StreamEndpointSpec{
						Type:     execinfrapb.StreamEndpointSpec_LOCAL,
						StreamID: 0,
					}
					inStreams := make([]execinfrapb.StreamEndpointSpec, numNodes)
					for i := range inStreams {
						inStreams[i].Type = streamType(i)
						inStreams[i].StreamID = execinfrapb.StreamID(i)
					}

					lastProc := execinfrapb.ProcessorSpec{
						Input: []execinfrapb.InputSyncSpec{{
							Type: execinfrapb.InputSyncSpec_ORDERED,
							Ordering: execinfrapb.Ordering{Columns: []execinfrapb.Ordering_Column{
								{ColIdx: 0, Direction: execinfrapb.Ordering_Column_ASC}}},
							Streams:     inStreams,
							ColumnTypes: sqlbase.ThreeIntCols,
						}},
						Core: execinfrapb.ProcessorCoreUnion{Noop: &execinfrapb.NoopCoreSpec{}},
						Output: []execinfrapb.OutputRouterSpec{{
							Type:    execinfrapb.OutputRouterSpec_PASS_THROUGH,
							Streams: []execinfrapb.StreamEndpointSpec{{Type: execinfrapb.StreamEndpointSpec_SYNC_RESPONSE}},
						}},
					}
					if numNodes == 1 {
						lastProc.Input[0].Type = execinfrapb.InputSyncSpec_UNORDERED
						lastProc.Input[0].Ordering = execinfrapb.Ordering{}
					}
					reqs[0].Flow.Processors = append(reqs[0].Flow.Processors, lastProc)

					var clients []execinfrapb.DistSQLClient
					for i := 0; i < numNodes; i++ {
						s := tc.Server(i)
						conn, err := s.RPCContext().GRPCDialNode(s.ServingRPCAddr(), s.NodeID(),
							rpc.DefaultClass).Connect(context.Background())
						if err != nil {
							b.Fatal(err)
						}
						clients = append(clients, execinfrapb.NewDistSQLClient(conn))
					}

					b.ResetTimer()
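					// Each iteration runs a fresh flow end-to-end: a new
					// FlowID, remote setup via SetupFlow, then a sync flow on
					// node 0 that streams back the merged result.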
					for repeat := 0; repeat < b.N; repeat++ {
						fid := execinfrapb.FlowID{UUID: uuid.MakeV4()}
						for i := range reqs {
							reqs[i].Flow.FlowID = fid
						}

						for i := 1; i < numNodes; i++ {
							if resp, err := clients[i].SetupFlow(context.Background(), &reqs[i]); err != nil {
								b.Fatal(err)
							} else if resp.Error != nil {
								b.Fatal(resp.Error)
							}
						}
						stream, err := clients[0].RunSyncFlow(context.Background())
						if err != nil {
							b.Fatal(err)
						}
						err = stream.Send(&execinfrapb.ConsumerSignal{SetupFlowRequest: &reqs[0]})
						if err != nil {
							b.Fatal(err)
						}

						var decoder StreamDecoder
						var rows sqlbase.EncDatumRows
						var metas []execinfrapb.ProducerMetadata
						for {
							msg, err := stream.Recv()
							if err != nil {
								if err == io.EOF {
									break
								}
								b.Fatal(err)
							}
							err = decoder.AddMessage(context.Background(), msg)
							if err != nil {
								b.Fatal(err)
							}
							rows, metas = testGetDecodedRows(b, &decoder, rows, metas)
						}
						metas = ignoreMisplannedRanges(metas)
						metas = ignoreLeafTxnState(metas)
						metas = ignoreMetricsMeta(metas)
						if len(metas) != 0 {
							b.Fatalf("unexpected metadata (%d): %+v", len(metas), metas)
						}
						if len(rows) != numNodes*numRows {
							b.Errorf("got %d rows, expected %d", len(rows), numNodes*numRows)
						}
						var a sqlbase.DatumAlloc
						for i := range rows {
							if err := rows[i][0].EnsureDecoded(types.Int, &a); err != nil {
								b.Fatal(err)
							}
							if i > 0 {
								last := *rows[i-1][0].Datum.(*tree.DInt)
								curr := *rows[i][0].Datum.(*tree.DInt)
								if last > curr {
									b.Errorf("rows not ordered correctly (%d after %d, row %d)", curr, last, i)
									break
								}
							}
						}
					}
				})
			}
		})
	}
}