github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/cmd/roachtest/cluster_test.go

// Copyright 2018 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

package main

import (
	"context"
	"fmt"
	"os"
	"regexp"
	"runtime"
	"testing"
	"time"

	"github.com/cockroachdb/cockroach/pkg/testutils"
	"github.com/cockroachdb/cockroach/pkg/util/timeutil"
	"github.com/cockroachdb/errors"
	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"
)

func TestClusterNodes(t *testing.T) {
	c := &cluster{spec: makeClusterSpec(10)}
	opts := func(opts ...option) []option {
		return opts
	}
	testCases := []struct {
		opts     []option
		expected string
	}{
		{opts(), ""},
		{opts(c.All()), ":1-10"},
		{opts(c.Range(1, 2)), ":1-2"},
		{opts(c.Range(2, 5)), ":2-5"},
		{opts(c.All(), c.Range(2, 5)), ":1-10"},
		{opts(c.Range(2, 5), c.Range(7, 9)), ":2-5,7-9"},
		{opts(c.Range(2, 5), c.Range(6, 8)), ":2-8"},
		{opts(c.Node(2), c.Node(4), c.Node(6)), ":2,4,6"},
		{opts(c.Node(2), c.Node(3), c.Node(4)), ":2-4"},
	}
	for _, tc := range testCases {
		t.Run("", func(t *testing.T) {
			nodes := c.makeNodes(tc.opts...)
			if tc.expected != nodes {
				t.Fatalf("expected %s, but found %s", tc.expected, nodes)
			}
		})
	}
}

type testWrapper struct {
	*testing.T
}

func (t testWrapper) ArtifactsDir() string {
	return ""
}

func (t testWrapper) logger() *logger {
	return nil
}

func TestExecCmd(t *testing.T) {
	cfg := &loggerConfig{stdout: os.Stdout, stderr: os.Stderr}
	logger, err := cfg.newLogger("" /* path */)
	if err != nil {
		t.Fatal(err)
	}

	t.Run(`success`, func(t *testing.T) {
		res := execCmdEx(context.Background(), logger, "/bin/bash", "-c", "echo guacamole")
		require.NoError(t, res.err)
		require.Contains(t, res.stdout, "guacamole")
	})

	t.Run(`error`, func(t *testing.T) {
		res := execCmdEx(context.Background(), logger, "/bin/bash", "-c", "echo burrito; false")
		require.Error(t, res.err)
		require.Contains(t, res.stdout, "burrito")
	})

	t.Run(`returns-on-cancel`, func(t *testing.T) {
		ctx, cancel := context.WithCancel(context.Background())
		go func() {
			time.Sleep(10 * time.Millisecond)
			cancel()
		}()
		tBegin := timeutil.Now()
		require.Error(t, execCmd(ctx, logger, "/bin/bash", "-c", "sleep 100"))
		if max, act := 99*time.Second, timeutil.Since(tBegin); max < act {
			t.Fatalf("took %s despite cancellation", act)
		}
	})

	t.Run(`returns-on-cancel-subprocess`, func(t *testing.T) {
		// The tricky version of the preceding test. The difference is that the
		// process spawns a stalling subprocess and then waits for it. See
		// execCmdEx for a detailed discussion of how this is made to work.
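		// (The `sleep 100& wait` invocation makes bash fork a grandchild that
		// inherits the command's stdout/stderr descriptors, so a naive
		// implementation that waits for those pipes to be closed would block
		// until the grandchild exits, long after the context is canceled.)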
		ctx, cancel := context.WithCancel(context.Background())
		go func() {
			time.Sleep(10 * time.Millisecond)
			cancel()
		}()
		tBegin := timeutil.Now()
		require.Error(t, execCmd(ctx, logger, "/bin/bash", "-c", "sleep 100& wait"))
		if max, act := 99*time.Second, timeutil.Since(tBegin); max < act {
			t.Fatalf("took %s despite cancellation", act)
		}
	})
}

func TestClusterMonitor(t *testing.T) {
	cfg := &loggerConfig{stdout: os.Stdout, stderr: os.Stderr}
	logger, err := cfg.newLogger("" /* path */)
	if err != nil {
		t.Fatal(err)
	}

	t.Run(`success`, func(t *testing.T) {
		c := &cluster{t: testWrapper{t}, l: logger}
		m := newMonitor(context.Background(), c)
		m.Go(func(context.Context) error { return nil })
		if err := m.wait(`echo`, `1`); err != nil {
			t.Fatal(err)
		}
	})

	t.Run(`dead`, func(t *testing.T) {
		c := &cluster{t: testWrapper{t}, l: logger}
		m := newMonitor(context.Background(), c)
		m.Go(func(ctx context.Context) error {
			<-ctx.Done()
			fmt.Printf("worker done\n")
			return ctx.Err()
		})

		err := m.wait(`echo`, "1: 100\n1: dead")
		expectedErr := `dead`
		if !testutils.IsError(err, expectedErr) {
			t.Errorf(`expected %s err got: %+v`, expectedErr, err)
		}
	})

	t.Run(`worker-fail`, func(t *testing.T) {
		c := &cluster{t: testWrapper{t}, l: logger}
		m := newMonitor(context.Background(), c)
		m.Go(func(context.Context) error {
			return errors.New(`worker-fail`)
		})
		m.Go(func(ctx context.Context) error {
			<-ctx.Done()
			return ctx.Err()
		})

		err := m.wait(`sleep`, `100`)
		expectedErr := `worker-fail`
		if !testutils.IsError(err, expectedErr) {
			t.Errorf(`expected %s err got: %+v`, expectedErr, err)
		}
	})

	t.Run(`wait-fail`, func(t *testing.T) {
		c := &cluster{t: testWrapper{t}, l: logger}
		m := newMonitor(context.Background(), c)
		m.Go(func(ctx context.Context) error {
			<-ctx.Done()
			return ctx.Err()
		})
		m.Go(func(ctx context.Context) error {
			<-ctx.Done()
			return ctx.Err()
		})

		// Returned error should be that from the wait command.
		err := m.wait(`false`)
		expectedErr := `exit status`
		if !testutils.IsError(err, expectedErr) {
			t.Errorf(`expected %s err got: %+v`, expectedErr, err)
		}
	})

	t.Run(`wait-ok`, func(t *testing.T) {
		c := &cluster{t: testWrapper{t}, l: logger}
		m := newMonitor(context.Background(), c)
		m.Go(func(ctx context.Context) error {
			<-ctx.Done()
			return ctx.Err()
		})
		m.Go(func(ctx context.Context) error {
			<-ctx.Done()
			return ctx.Err()
		})

		// If wait terminates, context gets canceled.
		err := m.wait(`true`)
		if !errors.Is(err, context.Canceled) {
			t.Errorf(`expected context canceled, got: %+v`, err)
		}
	})

	// NB: the forked sleeps in these tests actually get leaked, so it's
	// important to let them finish pretty soon (think stress testing). As a
	// matter of fact, `make test` waits for these child goroutines to finish
	// (so these tests take seconds).
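	//
	// In the `worker-fd-error` subtest below, one worker runs a short forked
	// sleep while the other runs a command that fails with exit status 127
	// (`notthere` is not a real binary); the monitor is expected to surface
	// that exit status rather than wait for `sleep 100` to finish.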
	t.Run(`worker-fd-error`, func(t *testing.T) {
		c := &cluster{t: testWrapper{t}, l: logger}
		m := newMonitor(context.Background(), c)
		m.Go(func(ctx context.Context) error {
			defer func() {
				fmt.Println("sleep returns")
			}()
			return execCmd(ctx, logger, "/bin/bash", "-c", "sleep 3& wait")
		})
		m.Go(func(ctx context.Context) error {
			defer func() {
				fmt.Println("failure returns")
			}()
			time.Sleep(30 * time.Millisecond)
			return execCmd(ctx, logger, "/bin/bash", "-c", "echo hi && notthere")
		})
		expectedErr := regexp.QuoteMeta(`exit status 127`)
		if err := m.wait("sleep", "100"); !testutils.IsError(err, expectedErr) {
			t.Logf("error details: %+v", err)
			t.Error(err)
		}
	})
	t.Run(`worker-fd-fatal`, func(t *testing.T) {
		c := &cluster{t: testWrapper{t}, l: logger}
		m := newMonitor(context.Background(), c)
		m.Go(func(ctx context.Context) error {
			err := execCmd(ctx, logger, "/bin/bash", "-c", "echo foo && sleep 3& wait")
			return err
		})
		m.Go(func(ctx context.Context) error {
			time.Sleep(30 * time.Millisecond)
			// Simulate c.t.Fatal for which there isn't enough mocking here.
			// In reality t.Fatal adds text that is returned when the test fails,
			// so the failing goroutine will be referenced (not like in the expected
			// error below, where all you see is the other one being canceled).
			runtime.Goexit()
			return errors.New("unreachable")
		})
		expectedErr := regexp.QuoteMeta(`Goexit() was called`)
		if err := m.wait("sleep", "100"); !testutils.IsError(err, expectedErr) {
			t.Logf("error details: %+v", err)
			t.Error(err)
		}
	})
}

func TestClusterMachineType(t *testing.T) {
	testCases := []struct {
		machineType      string
		expectedCPUCount int
	}{
		// AWS machine types
		{"m5.large", 2},
		{"m5.xlarge", 4},
		{"m5.2xlarge", 8},
		{"m5.4xlarge", 16},
		{"m5.12xlarge", 48},
		{"m5.24xlarge", 96},
		{"m5d.large", 2},
		{"m5d.xlarge", 4},
		{"m5d.2xlarge", 8},
		{"m5d.4xlarge", 16},
		{"m5d.12xlarge", 48},
		{"m5d.24xlarge", 96},
		{"c5d.large", 2},
		{"c5d.xlarge", 4},
		{"c5d.2xlarge", 8},
		{"c5d.4xlarge", 16},
		{"c5d.9xlarge", 36},
		{"c5d.18xlarge", 72},
		// GCE machine types
		{"n1-standard-1", 1},
		{"n1-standard-2", 2},
		{"n1-standard-4", 4},
		{"n1-standard-8", 8},
		{"n1-standard-16", 16},
		{"n1-standard-32", 32},
		{"n1-standard-64", 64},
		{"n1-standard-96", 96},
	}
	for _, tc := range testCases {
		t.Run(tc.machineType, func(t *testing.T) {
			cpuCount := MachineTypeToCPUs(tc.machineType)
			if tc.expectedCPUCount != cpuCount {
				t.Fatalf("expected %d CPUs, but found %d", tc.expectedCPUCount, cpuCount)
			}
		})
	}
}

func TestLoadGroups(t *testing.T) {
	cfg := &loggerConfig{stdout: os.Stdout, stderr: os.Stderr}
	logger, err := cfg.newLogger("" /* path */)
	if err != nil {
		t.Fatal(err)
	}
	for _, tc := range []struct {
		numZones, numRoachNodes, numLoadNodes int
		loadGroups                            loadGroupList
	}{
		{
			3, 9, 3,
			loadGroupList{
				{
					nodeListOption{1, 2, 3},
					nodeListOption{4},
				},
				{
					nodeListOption{5, 6, 7},
					nodeListOption{8},
				},
				{
					nodeListOption{9, 10, 11},
					nodeListOption{12},
				},
			},
		},
		{
			3, 9, 1,
			loadGroupList{
				{
					nodeListOption{1, 2, 3, 4, 5, 6, 7, 8, 9},
					nodeListOption{10},
				},
			},
		},
		{
			4, 8, 2,
			loadGroupList{
				{
					nodeListOption{1, 2, 3, 4},
					nodeListOption{9},
				},
				{
					nodeListOption{5, 6, 7, 8},
					nodeListOption{10},
				},
			},
		},
	} {
		t.Run(fmt.Sprintf("%d/%d/%d", tc.numZones, tc.numRoachNodes, tc.numLoadNodes),
			func(t *testing.T) {
				c := &cluster{t: testWrapper{t}, l: logger, spec: makeClusterSpec(tc.numRoachNodes + tc.numLoadNodes)}
				lg := makeLoadGroups(c, tc.numZones, tc.numRoachNodes, tc.numLoadNodes)
				require.EqualValues(t, lg, tc.loadGroups)
			})
	}
	t.Run("panics with too many load nodes", func(t *testing.T) {
		require.Panics(t, func() {
			numZones, numRoachNodes, numLoadNodes := 2, 4, 3
			makeLoadGroups(nil, numZones, numRoachNodes, numLoadNodes)
		}, "Failed to panic when number of load nodes exceeded number of zones")
	})
	t.Run("panics with unequal zones per load node", func(t *testing.T) {
		require.Panics(t, func() {
			numZones, numRoachNodes, numLoadNodes := 4, 4, 3
			makeLoadGroups(nil, numZones, numRoachNodes, numLoadNodes)
		}, "Failed to panic when number of zones is not divisible by number of load nodes")
	})
}

func TestCmdLogFileName(t *testing.T) {
	ts := time.Date(2000, 1, 1, 15, 4, 12, 0, time.Local)

	const exp = `run_150412.000_n1,3-4,9_cockroach_bla`
	nodes := nodeListOption{1, 3, 4, 9}
	assert.Equal(t,
		exp,
		cmdLogFileName(ts, nodes, "./cockroach", "bla", "--foo", "bar"),
	)
	assert.Equal(t,
		exp,
		cmdLogFileName(ts, nodes, "./cockroach bla --foo bar"),
	)
}
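
// The function below is not part of the original file. It is a minimal,
// hypothetical sketch (the name exampleMonitorUsage is made up) of the monitor
// pattern exercised in TestClusterMonitor above, using only calls that appear
// in this file: newMonitor, m.Go, and m.wait. Per the `success` and `wait-ok`
// subtests, wait returns nil when the workers and the wait command succeed,
// and worker contexts are canceled once the wait command terminates.
func exampleMonitorUsage(ctx context.Context, c *cluster) error {
	m := newMonitor(ctx, c)
	m.Go(func(ctx context.Context) error {
		// A real worker would do its work here, respecting ctx cancellation.
		return nil
	})
	// Runs `sleep 1` as the wait command; returns nil unless a worker or the
	// command fails.
	return m.wait(`sleep`, `1`)
}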