github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/cmd/roachtest/gossip.go (about) 1 // Copyright 2018 The Cockroach Authors. 2 // 3 // Use of this software is governed by the Business Source License 4 // included in the file licenses/BSL.txt. 5 // 6 // As of the Change Date specified in that file, in accordance with 7 // the Business Source License, use of this software will be governed 8 // by the Apache License, Version 2.0, included in the file 9 // licenses/APL.txt. 10 11 package main 12 13 import ( 14 "context" 15 gosql "database/sql" 16 "fmt" 17 "net" 18 "net/http" 19 "net/url" 20 "strconv" 21 "strings" 22 "time" 23 "unicode" 24 25 "github.com/cockroachdb/cockroach/pkg/gossip" 26 "github.com/cockroachdb/cockroach/pkg/util" 27 "github.com/cockroachdb/cockroach/pkg/util/httputil" 28 "github.com/cockroachdb/cockroach/pkg/util/retry" 29 "github.com/cockroachdb/cockroach/pkg/util/timeutil" 30 "github.com/cockroachdb/errors" 31 ) 32 33 func registerGossip(r *testRegistry) { 34 runGossipChaos := func(ctx context.Context, t *test, c *cluster) { 35 args := startArgs("--args=--vmodule=*=1") 36 c.Put(ctx, cockroach, "./cockroach", c.All()) 37 c.Start(ctx, t, c.All(), args) 38 waitForFullReplication(t, c.Conn(ctx, 1)) 39 40 gossipNetwork := func(node int) string { 41 const query = ` 42 SELECT string_agg(source_id::TEXT || ':' || target_id::TEXT, ',') 43 FROM (SELECT * FROM crdb_internal.gossip_network ORDER BY source_id, target_id) 44 ` 45 46 db := c.Conn(ctx, node) 47 defer db.Close() 48 var s gosql.NullString 49 if err := db.QueryRow(query).Scan(&s); err != nil { 50 t.Fatal(err) 51 } 52 if s.Valid { 53 return s.String 54 } 55 return "" 56 } 57 58 var deadNode int 59 gossipOK := func(start time.Time) bool { 60 var expected string 61 var initialized bool 62 for i := 1; i <= c.spec.NodeCount; i++ { 63 if elapsed := timeutil.Since(start); elapsed >= 20*time.Second { 64 t.Fatalf("gossip did not stabilize in %.1fs", elapsed.Seconds()) 65 } 66 67 if i == deadNode { 68 continue 69 } 70 c.l.Printf("%d: checking gossip\n", i) 71 s := gossipNetwork(i) 72 if !initialized { 73 deadNodeStr := fmt.Sprint(deadNode) 74 split := func(c rune) bool { 75 return !unicode.IsNumber(c) 76 } 77 for _, id := range strings.FieldsFunc(s, split) { 78 if id == deadNodeStr { 79 c.l.Printf("%d: gossip not ok (dead node %d present): %s (%.0fs)\n", 80 i, deadNode, s, timeutil.Since(start).Seconds()) 81 return false 82 } 83 } 84 initialized = true 85 expected = s 86 continue 87 } 88 if expected != s { 89 c.l.Printf("%d: gossip not ok: %s != %s (%.0fs)\n", 90 i, expected, s, timeutil.Since(start).Seconds()) 91 return false 92 } 93 } 94 c.l.Printf("gossip ok: %s (%0.0fs)\n", expected, timeutil.Since(start).Seconds()) 95 return true 96 } 97 98 waitForGossip := func() { 99 t.Status("waiting for gossip to stabilize") 100 start := timeutil.Now() 101 for { 102 if gossipOK(start) { 103 return 104 } 105 time.Sleep(time.Second) 106 } 107 } 108 109 waitForGossip() 110 nodes := c.All() 111 for j := 0; j < 100; j++ { 112 deadNode = nodes.randNode()[0] 113 c.Stop(ctx, c.Node(deadNode)) 114 waitForGossip() 115 c.Start(ctx, t, c.Node(deadNode), args) 116 } 117 } 118 119 r.Add(testSpec{ 120 Name: fmt.Sprintf("gossip/chaos/nodes=9"), 121 Owner: OwnerKV, 122 Cluster: makeClusterSpec(9), 123 Run: func(ctx context.Context, t *test, c *cluster) { 124 runGossipChaos(ctx, t, c) 125 }, 126 }) 127 } 128 129 type gossipUtil struct { 130 waitTime time.Duration 131 urlMap map[int]string 132 conn func(ctx context.Context, i int) *gosql.DB 133 } 134 135 func newGossipUtil(ctx context.Context, c *cluster) *gossipUtil { 136 urlMap := make(map[int]string) 137 for i, addr := range c.ExternalAdminUIAddr(ctx, c.All()) { 138 urlMap[i+1] = `http://` + addr 139 } 140 return &gossipUtil{ 141 waitTime: 30 * time.Second, 142 urlMap: urlMap, 143 conn: c.Conn, 144 } 145 } 146 147 type checkGossipFunc func(map[string]gossip.Info) error 148 149 // checkGossip fetches the gossip infoStore from each node and invokes the 150 // given function. The test passes if the function returns 0 for every node, 151 // retrying for up to the given duration. 152 func (g *gossipUtil) check(ctx context.Context, c *cluster, f checkGossipFunc) error { 153 return retry.ForDuration(g.waitTime, func() error { 154 var infoStatus gossip.InfoStatus 155 for i := 1; i <= c.spec.NodeCount; i++ { 156 url := g.urlMap[i] + `/_status/gossip/local` 157 if err := httputil.GetJSON(http.Client{}, url, &infoStatus); err != nil { 158 return errors.Wrapf(err, "failed to get gossip status from node %d", i) 159 } 160 if err := f(infoStatus.Infos); err != nil { 161 return errors.Wrapf(err, "node %d", i) 162 } 163 } 164 165 return nil 166 }) 167 } 168 169 // hasPeers returns a checkGossipFunc that passes when the given number of 170 // peers are connected via gossip. 171 func (gossipUtil) hasPeers(expected int) checkGossipFunc { 172 return func(infos map[string]gossip.Info) error { 173 count := 0 174 for k := range infos { 175 if strings.HasPrefix(k, gossip.KeyNodeIDPrefix) { 176 count++ 177 } 178 } 179 if count != expected { 180 return errors.Errorf("expected %d peers, found %d", expected, count) 181 } 182 return nil 183 } 184 } 185 186 // hasSentinel is a checkGossipFunc that passes when the sentinel gossip is present. 187 func (gossipUtil) hasSentinel(infos map[string]gossip.Info) error { 188 if _, ok := infos[gossip.KeySentinel]; !ok { 189 return errors.Errorf("sentinel not found") 190 } 191 return nil 192 } 193 194 // hasClusterID is a checkGossipFunc that passes when the cluster ID gossip is present. 195 func (gossipUtil) hasClusterID(infos map[string]gossip.Info) error { 196 if _, ok := infos[gossip.KeyClusterID]; !ok { 197 return errors.Errorf("cluster ID not found") 198 } 199 return nil 200 } 201 202 func (g *gossipUtil) checkConnectedAndFunctional(ctx context.Context, t *test, c *cluster) { 203 t.l.Printf("waiting for gossip to be connected\n") 204 if err := g.check(ctx, c, g.hasPeers(c.spec.NodeCount)); err != nil { 205 t.Fatal(err) 206 } 207 if err := g.check(ctx, c, g.hasClusterID); err != nil { 208 t.Fatal(err) 209 } 210 if err := g.check(ctx, c, g.hasSentinel); err != nil { 211 t.Fatal(err) 212 } 213 214 for i := 1; i <= c.spec.NodeCount; i++ { 215 db := g.conn(ctx, i) 216 defer db.Close() 217 if i == 1 { 218 if _, err := db.Exec("CREATE DATABASE IF NOT EXISTS test"); err != nil { 219 t.Fatal(err) 220 } 221 if _, err := db.Exec("CREATE TABLE IF NOT EXISTS test.kv (k INT PRIMARY KEY, v INT)"); err != nil { 222 t.Fatal(err) 223 } 224 if _, err := db.Exec(`UPSERT INTO test.kv (k, v) VALUES (1, 0)`); err != nil { 225 t.Fatal(err) 226 } 227 } 228 rows, err := db.Query(`UPDATE test.kv SET v=v+1 WHERE k=1 RETURNING v`) 229 if err != nil { 230 t.Fatal(err) 231 } 232 defer rows.Close() 233 var count int 234 if rows.Next() { 235 if err := rows.Scan(&count); err != nil { 236 t.Fatal(err) 237 } 238 if count != i { 239 t.Fatalf("unexpected value %d for write #%d (expected %d)", count, i, i) 240 } 241 } else { 242 t.Fatalf("no results found from update") 243 } 244 } 245 } 246 247 func runGossipPeerings(ctx context.Context, t *test, c *cluster) { 248 c.Put(ctx, cockroach, "./cockroach") 249 c.Start(ctx, t) 250 251 // Repeatedly restart a random node and verify that all of the nodes are 252 // seeing the gossiped values. 253 254 g := newGossipUtil(ctx, c) 255 deadline := timeutil.Now().Add(time.Minute) 256 257 for i := 1; timeutil.Now().Before(deadline); i++ { 258 if err := g.check(ctx, c, g.hasPeers(c.spec.NodeCount)); err != nil { 259 t.Fatal(err) 260 } 261 if err := g.check(ctx, c, g.hasClusterID); err != nil { 262 t.Fatal(err) 263 } 264 if err := g.check(ctx, c, g.hasSentinel); err != nil { 265 t.Fatal(err) 266 } 267 t.l.Printf("%d: OK\n", i) 268 269 // Restart a random node. 270 node := c.All().randNode() 271 t.l.Printf("%d: restarting node %d\n", i, node[0]) 272 c.Stop(ctx, node) 273 c.Start(ctx, t, node) 274 } 275 } 276 277 func runGossipRestart(ctx context.Context, t *test, c *cluster) { 278 c.Put(ctx, cockroach, "./cockroach") 279 c.Start(ctx, t) 280 281 // Repeatedly stop and restart a cluster and verify that we can perform basic 282 // operations. This is stressing the gossiping of the first range descriptor 283 // which is required for any node to be able do even the most basic 284 // operations on a cluster. 285 286 g := newGossipUtil(ctx, c) 287 deadline := timeutil.Now().Add(time.Minute) 288 289 for i := 1; timeutil.Now().Before(deadline); i++ { 290 g.checkConnectedAndFunctional(ctx, t, c) 291 t.l.Printf("%d: OK\n", i) 292 293 t.l.Printf("%d: killing all nodes\n", i) 294 c.Stop(ctx) 295 296 t.l.Printf("%d: restarting all nodes\n", i) 297 c.Start(ctx, t) 298 } 299 } 300 301 func runGossipRestartNodeOne(ctx context.Context, t *test, c *cluster) { 302 args := startArgs("--env=COCKROACH_SCAN_MAX_IDLE_TIME=5ms", "--encrypt=false") 303 c.Put(ctx, cockroach, "./cockroach") 304 // Reduce the scan max idle time to speed up evacuation of node 1. 305 c.Start(ctx, t, racks(c.spec.NodeCount), args) 306 307 db := c.Conn(ctx, 1) 308 defer db.Close() 309 310 run := func(stmtStr string) { 311 stmt := fmt.Sprintf(stmtStr, "", "=") 312 t.l.Printf("%s\n", stmt) 313 _, err := db.ExecContext(ctx, stmt) 314 if err != nil && strings.Contains(err.Error(), "syntax error") { 315 // Pre-2.1 was EXPERIMENTAL. 316 // TODO(knz): Remove this in 2.2. 317 stmt = fmt.Sprintf(stmtStr, "EXPERIMENTAL", "") 318 t.l.Printf("%s\n", stmt) 319 _, err = db.ExecContext(ctx, stmt) 320 } 321 if err != nil { 322 t.Fatal(err) 323 } 324 } 325 326 // Wait for gossip to propagate - otherwise attempting to set zone 327 // constraints can fail with an error about how the constraint doesn't match 328 // any nodes in the cluster (#30220). 329 var lastNodeCount int 330 if err := retry.ForDuration(30*time.Second, func() error { 331 const query = `SELECT count(*) FROM crdb_internal.gossip_nodes` 332 var count int 333 if err := db.QueryRow(query).Scan(&count); err != nil { 334 t.Fatal(err) 335 } 336 if count <= 1 { 337 err := errors.Errorf("node 1 still only knows about %d node%s", 338 count, util.Pluralize(int64(count))) 339 if count != lastNodeCount { 340 lastNodeCount = count 341 t.l.Printf("%s\n", err) 342 } 343 return err 344 } 345 return nil 346 }); err != nil { 347 t.Fatal(err) 348 } 349 350 // Evacuate all of the ranges off node 1 with zone config constraints. See 351 // the racks setting specified when the cluster was started. 352 run(`ALTER RANGE default %[1]s CONFIGURE ZONE %[2]s 'constraints: {"-rack=0"}'`) 353 run(`ALTER RANGE system %[1]s CONFIGURE ZONE %[2]s 'constraints: {"-rack=0"}'`) 354 run(`ALTER DATABASE system %[1]s CONFIGURE ZONE %[2]s 'constraints: {"-rack=0"}'`) 355 run(`ALTER RANGE meta %[1]s CONFIGURE ZONE %[2]s 'constraints: {"-rack=0"}'`) 356 run(`ALTER RANGE liveness %[1]s CONFIGURE ZONE %[2]s 'constraints: {"-rack=0"}'`) 357 // TODO(andrei): Changing the constraints for the system tables shouldn't be 358 // needed given that we've changed them for the system zone. What's going on? 359 // #40921. 360 run(`ALTER TABLE system.jobs %[1]s CONFIGURE ZONE %[2]s 'constraints: {"-rack=0"}'`) 361 if t.IsBuildVersion("v19.2.0") { 362 run(`ALTER TABLE system.replication_stats %[1]s CONFIGURE ZONE %[2]s 'constraints: {"-rack=0"}'`) 363 run(`ALTER TABLE system.replication_constraint_stats %[1]s CONFIGURE ZONE %[2]s 'constraints: {"-rack=0"}'`) 364 } 365 366 var lastReplCount int 367 if err := retry.ForDuration(2*time.Minute, func() error { 368 const query = ` 369 SELECT count(replicas) 370 FROM crdb_internal.ranges 371 WHERE array_position(replicas, 1) IS NOT NULL 372 ` 373 var count int 374 if err := db.QueryRow(query).Scan(&count); err != nil { 375 t.Fatal(err) 376 } 377 if count > 0 { 378 err := errors.Errorf("node 1 still has %d replicas", count) 379 if count != lastReplCount { 380 lastReplCount = count 381 t.l.Printf("%s\n", err) 382 } 383 return err 384 } 385 return nil 386 }); err != nil { 387 t.Fatal(err) 388 } 389 390 t.l.Printf("killing all nodes\n") 391 c.Stop(ctx) 392 393 // Restart node 1, but have it listen on a different port for internal 394 // connections. This will require node 1 to reach out to the other nodes in 395 // the cluster for gossip info. 396 err := c.RunE(ctx, c.Node(1), 397 `./cockroach start --insecure --background --store={store-dir} `+ 398 `--log-dir={log-dir} --cache=10% --max-sql-memory=10% `+ 399 `--listen-addr=:$[{pgport:1}+10000] --http-port=$[{pgport:1}+1] `+ 400 `> {log-dir}/cockroach.stdout 2> {log-dir}/cockroach.stderr`) 401 if err != nil { 402 t.Fatal(err) 403 } 404 405 // Restart the other nodes. These nodes won't be able to talk to node 1 until 406 // node 1 talks to it (they have out of date address info). Node 1 needs 407 // incoming gossip info in order to determine where range 1 is. 408 c.Start(ctx, t, c.Range(2, c.spec.NodeCount), args) 409 410 // We need to override DB connection creation to use the correct port for 411 // node 1. This is more complicated than it should be and a limitation of the 412 // current infrastructure which doesn't know about cockroach nodes started on 413 // non-standard ports. 414 g := newGossipUtil(ctx, c) 415 g.conn = func(ctx context.Context, i int) *gosql.DB { 416 if i != 1 { 417 return c.Conn(ctx, i) 418 } 419 url, err := url.Parse(c.ExternalPGUrl(ctx, c.Node(1))[0]) 420 if err != nil { 421 t.Fatal(err) 422 } 423 host, port, err := net.SplitHostPort(url.Host) 424 if err != nil { 425 t.Fatal(err) 426 } 427 v, err := strconv.Atoi(port) 428 if err != nil { 429 t.Fatal(err) 430 } 431 url.Host = fmt.Sprintf("%s:%d", host, v+10000) 432 db, err := gosql.Open("postgres", url.String()) 433 if err != nil { 434 t.Fatal(err) 435 } 436 return db 437 } 438 439 g.checkConnectedAndFunctional(ctx, t, c) 440 441 // Stop our special snowflake process which won't be recognized by the test 442 // harness, and start it again on the regular. 443 c.Stop(ctx, c.Node(1)) 444 c.Start(ctx, t, c.Node(1)) 445 } 446 447 func runCheckLocalityIPAddress(ctx context.Context, t *test, c *cluster) { 448 c.Put(ctx, cockroach, "./cockroach") 449 450 externalIP := c.ExternalIP(ctx, c.Range(1, c.spec.NodeCount)) 451 452 for i := 1; i <= c.spec.NodeCount; i++ { 453 if local { 454 externalIP[i-1] = "localhost" 455 } 456 extAddr := externalIP[i-1] 457 458 c.Start(ctx, t, c.Node(i), startArgs("--racks=1", 459 fmt.Sprintf("--args=--locality-advertise-addr=rack=0@%s", extAddr))) 460 } 461 462 rowCount := 0 463 464 for i := 1; i <= c.spec.NodeCount; i++ { 465 db := c.Conn(ctx, 1) 466 defer db.Close() 467 468 rows, err := db.Query( 469 `SELECT node_id, advertise_address FROM crdb_internal.gossip_nodes`, 470 ) 471 if err != nil { 472 t.Fatal(err) 473 } 474 475 for rows.Next() { 476 rowCount++ 477 var nodeID int 478 var advertiseAddress string 479 if err := rows.Scan(&nodeID, &advertiseAddress); err != nil { 480 t.Fatal(err) 481 } 482 483 if local { 484 if !strings.Contains(advertiseAddress, "localhost") { 485 t.Fatal("Expected connect address to contain localhost") 486 } 487 } else if exp := c.ExternalAddr(ctx, c.Node(nodeID))[0]; exp != advertiseAddress { 488 t.Fatalf("Connection address is %s but expected %s", advertiseAddress, exp) 489 } 490 } 491 } 492 if rowCount <= 0 { 493 t.Fatal("No results for " + 494 "SELECT node_id, advertise_address FROM crdb_internal.gossip_nodes") 495 } 496 }