github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/cmd/roachtest/cluster_test.go

// Copyright 2018 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

package main

import (
	"context"
	"fmt"
	"os"
	"regexp"
	"runtime"
	"testing"
	"time"

	"github.com/cockroachdb/cockroach/pkg/testutils"
	"github.com/cockroachdb/cockroach/pkg/util/timeutil"
	"github.com/cockroachdb/errors"
	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"
)

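// TestClusterNodes exercises cluster.makeNodes. As the table below shows,
// node selections are merged into a normalized spec string: overlapping or
// adjacent ranges collapse into one (e.g. 2-5 followed by 6-8 yields ":2-8"),
// while disjoint selections stay comma-separated.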
func TestClusterNodes(t *testing.T) {
	c := &cluster{spec: makeClusterSpec(10)}
	opts := func(opts ...option) []option {
		return opts
	}
	testCases := []struct {
		opts     []option
		expected string
	}{
		{opts(), ""},
		{opts(c.All()), ":1-10"},
		{opts(c.Range(1, 2)), ":1-2"},
		{opts(c.Range(2, 5)), ":2-5"},
		{opts(c.All(), c.Range(2, 5)), ":1-10"},
		{opts(c.Range(2, 5), c.Range(7, 9)), ":2-5,7-9"},
		{opts(c.Range(2, 5), c.Range(6, 8)), ":2-8"},
		{opts(c.Node(2), c.Node(4), c.Node(6)), ":2,4,6"},
		{opts(c.Node(2), c.Node(3), c.Node(4)), ":2-4"},
	}
	for _, tc := range testCases {
		t.Run("", func(t *testing.T) {
			nodes := c.makeNodes(tc.opts...)
			if tc.expected != nodes {
				t.Fatalf("expected %s, but found %s", tc.expected, nodes)
			}
		})
	}
}

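// testWrapper wraps *testing.T for use as the cluster's test handle,
// stubbing out the artifacts directory and logger accessors.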
type testWrapper struct {
	*testing.T
}

// ArtifactsDir returns an empty string; these unit tests write no artifacts.
func (t testWrapper) ArtifactsDir() string {
	return ""
}

// logger returns nil; callers in these tests supply their own logger.
func (t testWrapper) logger() *logger {
	return nil
}

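// TestExecCmd checks execCmd/execCmdEx: stdout is captured on success and on
// failure, and canceling the context terminates a long-running command
// promptly, even when it has forked a stalling subprocess it is waiting on.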
func TestExecCmd(t *testing.T) {
	cfg := &loggerConfig{stdout: os.Stdout, stderr: os.Stderr}
	logger, err := cfg.newLogger("" /* path */)
	if err != nil {
		t.Fatal(err)
	}

	t.Run(`success`, func(t *testing.T) {
		res := execCmdEx(context.Background(), logger, "/bin/bash", "-c", "echo guacamole")
		require.NoError(t, res.err)
		require.Contains(t, res.stdout, "guacamole")
	})

	t.Run(`error`, func(t *testing.T) {
		res := execCmdEx(context.Background(), logger, "/bin/bash", "-c", "echo burrito; false")
		require.Error(t, res.err)
		require.Contains(t, res.stdout, "burrito")
	})

	t.Run(`returns-on-cancel`, func(t *testing.T) {
		ctx, cancel := context.WithCancel(context.Background())
		go func() {
			time.Sleep(10 * time.Millisecond)
			cancel()
		}()
		tBegin := timeutil.Now()
		require.Error(t, execCmd(ctx, logger, "/bin/bash", "-c", "sleep 100"))
		if max, act := 99*time.Second, timeutil.Since(tBegin); max < act {
			t.Fatalf("took %s despite cancellation", act)
		}
	})

	t.Run(`returns-on-cancel-subprocess`, func(t *testing.T) {
		// The tricky version of the preceding test. The difference is that the
		// process spawns a stalling subprocess and then waits for it. See
		// execCmdEx for a detailed discussion of how this is made to work.
		ctx, cancel := context.WithCancel(context.Background())
		go func() {
			time.Sleep(10 * time.Millisecond)
			cancel()
		}()
		tBegin := timeutil.Now()
		require.Error(t, execCmd(ctx, logger, "/bin/bash", "-c", "sleep 100& wait"))
		if max, act := 99*time.Second, timeutil.Since(tBegin); max < act {
			t.Fatalf("took %s despite cancellation", act)
		}
	})
}

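// TestClusterMonitor exercises the monitor returned by newMonitor: worker
// errors, "dead" node output from the monitoring command, and failures of
// the wait command itself all surface through m.wait, and a wait command
// that terminates cleanly cancels the outstanding workers.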
func TestClusterMonitor(t *testing.T) {
	cfg := &loggerConfig{stdout: os.Stdout, stderr: os.Stderr}
	logger, err := cfg.newLogger("" /* path */)
	if err != nil {
		t.Fatal(err)
	}

	t.Run(`success`, func(t *testing.T) {
		c := &cluster{t: testWrapper{t}, l: logger}
		m := newMonitor(context.Background(), c)
		m.Go(func(context.Context) error { return nil })
		if err := m.wait(`echo`, `1`); err != nil {
			t.Fatal(err)
		}
	})

	t.Run(`dead`, func(t *testing.T) {
		c := &cluster{t: testWrapper{t}, l: logger}
		m := newMonitor(context.Background(), c)
		m.Go(func(ctx context.Context) error {
			<-ctx.Done()
			fmt.Printf("worker done\n")
			return ctx.Err()
		})

		err := m.wait(`echo`, "1: 100\n1: dead")
		expectedErr := `dead`
		if !testutils.IsError(err, expectedErr) {
			t.Errorf(`expected %s err got: %+v`, expectedErr, err)
		}
	})

	t.Run(`worker-fail`, func(t *testing.T) {
		c := &cluster{t: testWrapper{t}, l: logger}
		m := newMonitor(context.Background(), c)
		m.Go(func(context.Context) error {
			return errors.New(`worker-fail`)
		})
		m.Go(func(ctx context.Context) error {
			<-ctx.Done()
			return ctx.Err()
		})

		err := m.wait(`sleep`, `100`)
		expectedErr := `worker-fail`
		if !testutils.IsError(err, expectedErr) {
			t.Errorf(`expected %s err got: %+v`, expectedErr, err)
		}
	})

	t.Run(`wait-fail`, func(t *testing.T) {
		c := &cluster{t: testWrapper{t}, l: logger}
		m := newMonitor(context.Background(), c)
		m.Go(func(ctx context.Context) error {
			<-ctx.Done()
			return ctx.Err()
		})
		m.Go(func(ctx context.Context) error {
			<-ctx.Done()
			return ctx.Err()
		})

		// The returned error should be the one from the wait command.
		err := m.wait(`false`)
		expectedErr := `exit status`
		if !testutils.IsError(err, expectedErr) {
			t.Errorf(`expected %s err got: %+v`, expectedErr, err)
		}
	})

	t.Run(`wait-ok`, func(t *testing.T) {
		c := &cluster{t: testWrapper{t}, l: logger}
		m := newMonitor(context.Background(), c)
		m.Go(func(ctx context.Context) error {
			<-ctx.Done()
			return ctx.Err()
		})
		m.Go(func(ctx context.Context) error {
			<-ctx.Done()
			return ctx.Err()
		})

		// If the wait command terminates, the context gets canceled.
		err := m.wait(`true`)
		if !errors.Is(err, context.Canceled) {
			t.Errorf(`expected context canceled, got: %+v`, err)
		}
	})

	// NB: the forked sleeps in these tests actually get leaked, so it's
	// important to let them finish pretty soon (think stress testing). As a
	// matter of fact, `make test` waits for these child goroutines to finish
	// (so these tests take seconds).
	t.Run(`worker-fd-error`, func(t *testing.T) {
		c := &cluster{t: testWrapper{t}, l: logger}
		m := newMonitor(context.Background(), c)
		m.Go(func(ctx context.Context) error {
			defer func() {
				fmt.Println("sleep returns")
			}()
			return execCmd(ctx, logger, "/bin/bash", "-c", "sleep 3& wait")
		})
		m.Go(func(ctx context.Context) error {
			defer func() {
				fmt.Println("failure returns")
			}()
			time.Sleep(30 * time.Millisecond)
			return execCmd(ctx, logger, "/bin/bash", "-c", "echo hi && notthere")
		})
		expectedErr := regexp.QuoteMeta(`exit status 127`)
		if err := m.wait("sleep", "100"); !testutils.IsError(err, expectedErr) {
			t.Logf("error details: %+v", err)
			t.Error(err)
		}
	})

	t.Run(`worker-fd-fatal`, func(t *testing.T) {
		c := &cluster{t: testWrapper{t}, l: logger}
		m := newMonitor(context.Background(), c)
		m.Go(func(ctx context.Context) error {
			err := execCmd(ctx, logger, "/bin/bash", "-c", "echo foo && sleep 3& wait")
			return err
		})
		m.Go(func(ctx context.Context) error {
			time.Sleep(30 * time.Millisecond)
			// Simulate c.t.Fatal, for which there isn't enough mocking here.
			// In reality, t.Fatal adds text that is returned when the test
			// fails, so the failing goroutine will be referenced (unlike in
			// the expected error below, where all you see is the other
			// goroutine being canceled).
			runtime.Goexit()
			return errors.New("unreachable")
		})
		expectedErr := regexp.QuoteMeta(`Goexit() was called`)
		if err := m.wait("sleep", "100"); !testutils.IsError(err, expectedErr) {
			t.Logf("error details: %+v", err)
			t.Error(err)
		}
	})
}

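// TestClusterMachineType checks that MachineTypeToCPUs derives the expected
// vCPU count from AWS (m5, m5d, c5d) and GCE (n1-standard) machine type
// names.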
func TestClusterMachineType(t *testing.T) {
	testCases := []struct {
		machineType      string
		expectedCPUCount int
	}{
		// AWS machine types
		{"m5.large", 2},
		{"m5.xlarge", 4},
		{"m5.2xlarge", 8},
		{"m5.4xlarge", 16},
		{"m5.12xlarge", 48},
		{"m5.24xlarge", 96},
		{"m5d.large", 2},
		{"m5d.xlarge", 4},
		{"m5d.2xlarge", 8},
		{"m5d.4xlarge", 16},
		{"m5d.12xlarge", 48},
		{"m5d.24xlarge", 96},
		{"c5d.large", 2},
		{"c5d.xlarge", 4},
		{"c5d.2xlarge", 8},
		{"c5d.4xlarge", 16},
		{"c5d.9xlarge", 36},
		{"c5d.18xlarge", 72},
		// GCE machine types
		{"n1-standard-1", 1},
		{"n1-standard-2", 2},
		{"n1-standard-4", 4},
		{"n1-standard-8", 8},
		{"n1-standard-16", 16},
		{"n1-standard-32", 32},
		{"n1-standard-64", 64},
		{"n1-standard-96", 96},
	}
	for _, tc := range testCases {
		t.Run(tc.machineType, func(t *testing.T) {
			cpuCount := MachineTypeToCPUs(tc.machineType)
			if tc.expectedCPUCount != cpuCount {
				t.Fatalf("expected %d CPUs, but found %d", tc.expectedCPUCount, cpuCount)
			}
		})
	}
}

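// TestLoadGroups verifies that makeLoadGroups distributes the CockroachDB
// nodes and the load-generator nodes across the requested number of zones,
// and that it panics when the load nodes cannot be spread evenly over the
// zones.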
func TestLoadGroups(t *testing.T) {
	cfg := &loggerConfig{stdout: os.Stdout, stderr: os.Stderr}
	logger, err := cfg.newLogger("" /* path */)
	if err != nil {
		t.Fatal(err)
	}
	for _, tc := range []struct {
		numZones, numRoachNodes, numLoadNodes int
		loadGroups                            loadGroupList
	}{
		{
			3, 9, 3,
			loadGroupList{
				{
					nodeListOption{1, 2, 3},
					nodeListOption{4},
				},
				{
					nodeListOption{5, 6, 7},
					nodeListOption{8},
				},
				{
					nodeListOption{9, 10, 11},
					nodeListOption{12},
				},
			},
		},
		{
			3, 9, 1,
			loadGroupList{
				{
					nodeListOption{1, 2, 3, 4, 5, 6, 7, 8, 9},
					nodeListOption{10},
				},
			},
		},
		{
			4, 8, 2,
			loadGroupList{
				{
					nodeListOption{1, 2, 3, 4},
					nodeListOption{9},
				},
				{
					nodeListOption{5, 6, 7, 8},
					nodeListOption{10},
				},
			},
		},
	} {
		t.Run(fmt.Sprintf("%d/%d/%d", tc.numZones, tc.numRoachNodes, tc.numLoadNodes),
			func(t *testing.T) {
				c := &cluster{t: testWrapper{t}, l: logger, spec: makeClusterSpec(tc.numRoachNodes + tc.numLoadNodes)}
				lg := makeLoadGroups(c, tc.numZones, tc.numRoachNodes, tc.numLoadNodes)
				// Expected value comes first, then the actual value.
				require.EqualValues(t, tc.loadGroups, lg)
			})
	}
	t.Run("panics with too many load nodes", func(t *testing.T) {
		require.Panics(t, func() {
			numZones, numRoachNodes, numLoadNodes := 2, 4, 3
			makeLoadGroups(nil, numZones, numRoachNodes, numLoadNodes)
		}, "Failed to panic when number of load nodes exceeded number of zones")
	})
	t.Run("panics with unequal zones per load node", func(t *testing.T) {
		require.Panics(t, func() {
			numZones, numRoachNodes, numLoadNodes := 4, 4, 3
			makeLoadGroups(nil, numZones, numRoachNodes, numLoadNodes)
		}, "Failed to panic when number of zones is not divisible by number of load nodes")
	})
}

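// TestCmdLogFileName checks the log file name derived from a command run:
// run_<HHMMSS.mmm>_n<compressed node list>_<first two words of the command>,
// with any leading "./" stripped from the binary name.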
func TestCmdLogFileName(t *testing.T) {
	ts := time.Date(2000, 1, 1, 15, 4, 12, 0, time.Local)

	const exp = `run_150412.000_n1,3-4,9_cockroach_bla`
	nodes := nodeListOption{1, 3, 4, 9}
	assert.Equal(t,
		exp,
		cmdLogFileName(ts, nodes, "./cockroach", "bla", "--foo", "bar"),
	)
	assert.Equal(t,
		exp,
		cmdLogFileName(ts, nodes, "./cockroach bla --foo bar"),
	)
}