github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/cmd/roachtest/gossip.go (about)

     1  // Copyright 2018 The Cockroach Authors.
     2  //
     3  // Use of this software is governed by the Business Source License
     4  // included in the file licenses/BSL.txt.
     5  //
     6  // As of the Change Date specified in that file, in accordance with
     7  // the Business Source License, use of this software will be governed
     8  // by the Apache License, Version 2.0, included in the file
     9  // licenses/APL.txt.
    10  
    11  package main
    12  
    13  import (
    14  	"context"
    15  	gosql "database/sql"
    16  	"fmt"
    17  	"net"
    18  	"net/http"
    19  	"net/url"
    20  	"strconv"
    21  	"strings"
    22  	"time"
    23  	"unicode"
    24  
    25  	"github.com/cockroachdb/cockroach/pkg/gossip"
    26  	"github.com/cockroachdb/cockroach/pkg/util"
    27  	"github.com/cockroachdb/cockroach/pkg/util/httputil"
    28  	"github.com/cockroachdb/cockroach/pkg/util/retry"
    29  	"github.com/cockroachdb/cockroach/pkg/util/timeutil"
    30  	"github.com/cockroachdb/errors"
    31  )
    32  
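         // registerGossip registers the gossip/chaos roachtest: it repeatedly kills
         // and restarts a random node and waits for every live node to converge on
         // the same view of the gossip network.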
    33  func registerGossip(r *testRegistry) {
    34  	runGossipChaos := func(ctx context.Context, t *test, c *cluster) {
    35  		args := startArgs("--args=--vmodule=*=1")
    36  		c.Put(ctx, cockroach, "./cockroach", c.All())
    37  		c.Start(ctx, t, c.All(), args)
    38  		waitForFullReplication(t, c.Conn(ctx, 1))
    39  
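         		// gossipNetwork returns the gossip network seen by the given node as a
         		// comma-separated list of "source:target" pairs, ordered by source and
         		// target, read from crdb_internal.gossip_network.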
    40  		gossipNetwork := func(node int) string {
    41  			const query = `
    42  SELECT string_agg(source_id::TEXT || ':' || target_id::TEXT, ',')
    43    FROM (SELECT * FROM crdb_internal.gossip_network ORDER BY source_id, target_id)
    44  `
    45  
    46  			db := c.Conn(ctx, node)
    47  			defer db.Close()
    48  			var s gosql.NullString
    49  			if err := db.QueryRow(query).Scan(&s); err != nil {
    50  				t.Fatal(err)
    51  			}
    52  			if s.Valid {
    53  				return s.String
    54  			}
    55  			return ""
    56  		}
    57  
    58  		var deadNode int
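         		// gossipOK returns true once every live node reports the same gossip
         		// network, one that no longer references the dead node. It fails the
         		// test if gossip has not stabilized within 20 seconds of start.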
    59  		gossipOK := func(start time.Time) bool {
    60  			var expected string
    61  			var initialized bool
    62  			for i := 1; i <= c.spec.NodeCount; i++ {
    63  				if elapsed := timeutil.Since(start); elapsed >= 20*time.Second {
    64  					t.Fatalf("gossip did not stabilize in %.1fs", elapsed.Seconds())
    65  				}
    66  
    67  				if i == deadNode {
    68  					continue
    69  				}
    70  				c.l.Printf("%d: checking gossip\n", i)
    71  				s := gossipNetwork(i)
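         				// On the first live node checked, adopt its network as the expected
         				// value, but only once it no longer mentions the dead node.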
    72  				if !initialized {
    73  					deadNodeStr := fmt.Sprint(deadNode)
    74  					split := func(c rune) bool {
    75  						return !unicode.IsNumber(c)
    76  					}
    77  					for _, id := range strings.FieldsFunc(s, split) {
    78  						if id == deadNodeStr {
    79  							c.l.Printf("%d: gossip not ok (dead node %d present): %s (%.0fs)\n",
    80  								i, deadNode, s, timeutil.Since(start).Seconds())
    81  							return false
    82  						}
    83  					}
    84  					initialized = true
    85  					expected = s
    86  					continue
    87  				}
    88  				if expected != s {
    89  					c.l.Printf("%d: gossip not ok: %s != %s (%.0fs)\n",
    90  						i, expected, s, timeutil.Since(start).Seconds())
    91  					return false
    92  				}
    93  			}
    94  			c.l.Printf("gossip ok: %s (%.0fs)\n", expected, timeutil.Since(start).Seconds())
    95  			return true
    96  		}
    97  
    98  		waitForGossip := func() {
    99  			t.Status("waiting for gossip to stabilize")
   100  			start := timeutil.Now()
   101  			for {
   102  				if gossipOK(start) {
   103  					return
   104  				}
   105  				time.Sleep(time.Second)
   106  			}
   107  		}
   108  
   109  		waitForGossip()
   110  		nodes := c.All()
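         		// Chaos loop: kill a random node, wait for the remaining nodes to agree
         		// on a gossip network that excludes it, then restart it.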
   111  		for j := 0; j < 100; j++ {
   112  			deadNode = nodes.randNode()[0]
   113  			c.Stop(ctx, c.Node(deadNode))
   114  			waitForGossip()
   115  			c.Start(ctx, t, c.Node(deadNode), args)
   116  		}
   117  	}
   118  
   119  	r.Add(testSpec{
    120  		Name:    "gossip/chaos/nodes=9",
   121  		Owner:   OwnerKV,
   122  		Cluster: makeClusterSpec(9),
   123  		Run: func(ctx context.Context, t *test, c *cluster) {
   124  			runGossipChaos(ctx, t, c)
   125  		},
   126  	})
   127  }
   128  
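         // gossipUtil provides helpers for inspecting a cluster's gossip state via
         // each node's HTTP status endpoint and SQL connection.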
   129  type gossipUtil struct {
   130  	waitTime time.Duration
   131  	urlMap   map[int]string
   132  	conn     func(ctx context.Context, i int) *gosql.DB
   133  }
   134  
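         // newGossipUtil returns a gossipUtil whose URL map points at each node's
         // external admin UI address and which uses the cluster's default SQL
         // connections.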
   135  func newGossipUtil(ctx context.Context, c *cluster) *gossipUtil {
   136  	urlMap := make(map[int]string)
   137  	for i, addr := range c.ExternalAdminUIAddr(ctx, c.All()) {
   138  		urlMap[i+1] = `http://` + addr
   139  	}
   140  	return &gossipUtil{
   141  		waitTime: 30 * time.Second,
   142  		urlMap:   urlMap,
   143  		conn:     c.Conn,
   144  	}
   145  }
   146  
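         // checkGossipFunc inspects a node's gossip infoStore and returns an error
         // describing why the check does not (yet) pass.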
   147  type checkGossipFunc func(map[string]gossip.Info) error
   148  
    149  // check fetches the gossip infoStore from each node and invokes the given
    150  // function. The check passes once the function returns nil for every node,
    151  // retrying for up to g.waitTime.
   152  func (g *gossipUtil) check(ctx context.Context, c *cluster, f checkGossipFunc) error {
   153  	return retry.ForDuration(g.waitTime, func() error {
   154  		var infoStatus gossip.InfoStatus
   155  		for i := 1; i <= c.spec.NodeCount; i++ {
   156  			url := g.urlMap[i] + `/_status/gossip/local`
   157  			if err := httputil.GetJSON(http.Client{}, url, &infoStatus); err != nil {
   158  				return errors.Wrapf(err, "failed to get gossip status from node %d", i)
   159  			}
   160  			if err := f(infoStatus.Infos); err != nil {
   161  				return errors.Wrapf(err, "node %d", i)
   162  			}
   163  		}
   164  
   165  		return nil
   166  	})
   167  }
   168  
   169  // hasPeers returns a checkGossipFunc that passes when the given number of
   170  // peers are connected via gossip.
   171  func (gossipUtil) hasPeers(expected int) checkGossipFunc {
   172  	return func(infos map[string]gossip.Info) error {
   173  		count := 0
   174  		for k := range infos {
   175  			if strings.HasPrefix(k, gossip.KeyNodeIDPrefix) {
   176  				count++
   177  			}
   178  		}
   179  		if count != expected {
   180  			return errors.Errorf("expected %d peers, found %d", expected, count)
   181  		}
   182  		return nil
   183  	}
   184  }
   185  
   186  // hasSentinel is a checkGossipFunc that passes when the sentinel gossip is present.
   187  func (gossipUtil) hasSentinel(infos map[string]gossip.Info) error {
   188  	if _, ok := infos[gossip.KeySentinel]; !ok {
   189  		return errors.Errorf("sentinel not found")
   190  	}
   191  	return nil
   192  }
   193  
   194  // hasClusterID is a checkGossipFunc that passes when the cluster ID gossip is present.
   195  func (gossipUtil) hasClusterID(infos map[string]gossip.Info) error {
   196  	if _, ok := infos[gossip.KeyClusterID]; !ok {
   197  		return errors.Errorf("cluster ID not found")
   198  	}
   199  	return nil
   200  }
   201  
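         // checkConnectedAndFunctional waits until gossip is fully connected on every
         // node and then verifies basic SQL functionality by having each node
         // increment a counter row in test.kv.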
   202  func (g *gossipUtil) checkConnectedAndFunctional(ctx context.Context, t *test, c *cluster) {
   203  	t.l.Printf("waiting for gossip to be connected\n")
   204  	if err := g.check(ctx, c, g.hasPeers(c.spec.NodeCount)); err != nil {
   205  		t.Fatal(err)
   206  	}
   207  	if err := g.check(ctx, c, g.hasClusterID); err != nil {
   208  		t.Fatal(err)
   209  	}
   210  	if err := g.check(ctx, c, g.hasSentinel); err != nil {
   211  		t.Fatal(err)
   212  	}
   213  
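         	// Node 1 creates test.kv and seeds a counter row; every node then
         	// increments it, so after node i's update the counter must equal i.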
   214  	for i := 1; i <= c.spec.NodeCount; i++ {
   215  		db := g.conn(ctx, i)
   216  		defer db.Close()
   217  		if i == 1 {
   218  			if _, err := db.Exec("CREATE DATABASE IF NOT EXISTS test"); err != nil {
   219  				t.Fatal(err)
   220  			}
   221  			if _, err := db.Exec("CREATE TABLE IF NOT EXISTS test.kv (k INT PRIMARY KEY, v INT)"); err != nil {
   222  				t.Fatal(err)
   223  			}
   224  			if _, err := db.Exec(`UPSERT INTO test.kv (k, v) VALUES (1, 0)`); err != nil {
   225  				t.Fatal(err)
   226  			}
   227  		}
   228  		rows, err := db.Query(`UPDATE test.kv SET v=v+1 WHERE k=1 RETURNING v`)
   229  		if err != nil {
   230  			t.Fatal(err)
   231  		}
   232  		defer rows.Close()
   233  		var count int
   234  		if rows.Next() {
   235  			if err := rows.Scan(&count); err != nil {
   236  				t.Fatal(err)
   237  			}
   238  			if count != i {
   239  				t.Fatalf("unexpected value %d for write #%d (expected %d)", count, i, i)
   240  			}
   241  		} else {
   242  			t.Fatalf("no results found from update")
   243  		}
   244  	}
   245  }
   246  
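         // runGossipPeerings repeatedly restarts a random node and checks that the
         // gossip network becomes fully connected again after each restart.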
   247  func runGossipPeerings(ctx context.Context, t *test, c *cluster) {
   248  	c.Put(ctx, cockroach, "./cockroach")
   249  	c.Start(ctx, t)
   250  
   251  	// Repeatedly restart a random node and verify that all of the nodes are
   252  	// seeing the gossiped values.
   253  
   254  	g := newGossipUtil(ctx, c)
   255  	deadline := timeutil.Now().Add(time.Minute)
   256  
   257  	for i := 1; timeutil.Now().Before(deadline); i++ {
   258  		if err := g.check(ctx, c, g.hasPeers(c.spec.NodeCount)); err != nil {
   259  			t.Fatal(err)
   260  		}
   261  		if err := g.check(ctx, c, g.hasClusterID); err != nil {
   262  			t.Fatal(err)
   263  		}
   264  		if err := g.check(ctx, c, g.hasSentinel); err != nil {
   265  			t.Fatal(err)
   266  		}
   267  		t.l.Printf("%d: OK\n", i)
   268  
   269  		// Restart a random node.
   270  		node := c.All().randNode()
   271  		t.l.Printf("%d: restarting node %d\n", i, node[0])
   272  		c.Stop(ctx, node)
   273  		c.Start(ctx, t, node)
   274  	}
   275  }
   276  
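         // runGossipRestart repeatedly stops and restarts the entire cluster and
         // verifies that gossip reconnects and basic SQL works after every restart.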
   277  func runGossipRestart(ctx context.Context, t *test, c *cluster) {
   278  	c.Put(ctx, cockroach, "./cockroach")
   279  	c.Start(ctx, t)
   280  
    281  	// Repeatedly stop and restart the cluster and verify that we can perform
    282  	// basic operations. This stresses the gossiping of the first range
    283  	// descriptor, which is required for any node to be able to do even the most
    284  	// basic operations on a cluster.
   285  
   286  	g := newGossipUtil(ctx, c)
   287  	deadline := timeutil.Now().Add(time.Minute)
   288  
   289  	for i := 1; timeutil.Now().Before(deadline); i++ {
   290  		g.checkConnectedAndFunctional(ctx, t, c)
   291  		t.l.Printf("%d: OK\n", i)
   292  
   293  		t.l.Printf("%d: killing all nodes\n", i)
   294  		c.Stop(ctx)
   295  
   296  		t.l.Printf("%d: restarting all nodes\n", i)
   297  		c.Start(ctx, t)
   298  	}
   299  }
   300  
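         // runGossipRestartNodeOne moves all replicas off node 1 using zone config
         // constraints, then restarts node 1 on a different internal port so that it
         // must rely on incoming gossip from the other nodes to rediscover the cluster.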
   301  func runGossipRestartNodeOne(ctx context.Context, t *test, c *cluster) {
   302  	args := startArgs("--env=COCKROACH_SCAN_MAX_IDLE_TIME=5ms", "--encrypt=false")
   303  	c.Put(ctx, cockroach, "./cockroach")
   304  	// Reduce the scan max idle time to speed up evacuation of node 1.
   305  	c.Start(ctx, t, racks(c.spec.NodeCount), args)
   306  
   307  	db := c.Conn(ctx, 1)
   308  	defer db.Close()
   309  
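         	// run executes the given zone config statement using the current CONFIGURE
         	// ZONE syntax, falling back to the pre-2.1 EXPERIMENTAL form on a syntax
         	// error.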
   310  	run := func(stmtStr string) {
   311  		stmt := fmt.Sprintf(stmtStr, "", "=")
   312  		t.l.Printf("%s\n", stmt)
   313  		_, err := db.ExecContext(ctx, stmt)
   314  		if err != nil && strings.Contains(err.Error(), "syntax error") {
   315  			// Pre-2.1 was EXPERIMENTAL.
   316  			// TODO(knz): Remove this in 2.2.
   317  			stmt = fmt.Sprintf(stmtStr, "EXPERIMENTAL", "")
   318  			t.l.Printf("%s\n", stmt)
   319  			_, err = db.ExecContext(ctx, stmt)
   320  		}
   321  		if err != nil {
   322  			t.Fatal(err)
   323  		}
   324  	}
   325  
    326  	// Wait for gossip to propagate; otherwise attempting to set zone
    327  	// constraints can fail with an error saying the constraint doesn't match
    328  	// any nodes in the cluster (#30220).
   329  	var lastNodeCount int
   330  	if err := retry.ForDuration(30*time.Second, func() error {
   331  		const query = `SELECT count(*) FROM crdb_internal.gossip_nodes`
   332  		var count int
   333  		if err := db.QueryRow(query).Scan(&count); err != nil {
   334  			t.Fatal(err)
   335  		}
   336  		if count <= 1 {
   337  			err := errors.Errorf("node 1 still only knows about %d node%s",
   338  				count, util.Pluralize(int64(count)))
   339  			if count != lastNodeCount {
   340  				lastNodeCount = count
   341  				t.l.Printf("%s\n", err)
   342  			}
   343  			return err
   344  		}
   345  		return nil
   346  	}); err != nil {
   347  		t.Fatal(err)
   348  	}
   349  
   350  	// Evacuate all of the ranges off node 1 with zone config constraints. See
   351  	// the racks setting specified when the cluster was started.
   352  	run(`ALTER RANGE default %[1]s CONFIGURE ZONE %[2]s 'constraints: {"-rack=0"}'`)
   353  	run(`ALTER RANGE system %[1]s CONFIGURE ZONE %[2]s 'constraints: {"-rack=0"}'`)
   354  	run(`ALTER DATABASE system %[1]s CONFIGURE ZONE %[2]s 'constraints: {"-rack=0"}'`)
   355  	run(`ALTER RANGE meta %[1]s CONFIGURE ZONE %[2]s 'constraints: {"-rack=0"}'`)
   356  	run(`ALTER RANGE liveness %[1]s CONFIGURE ZONE %[2]s 'constraints: {"-rack=0"}'`)
   357  	// TODO(andrei): Changing the constraints for the system tables shouldn't be
   358  	// needed given that we've changed them for the system zone. What's going on?
   359  	// #40921.
   360  	run(`ALTER TABLE system.jobs %[1]s CONFIGURE ZONE %[2]s 'constraints: {"-rack=0"}'`)
   361  	if t.IsBuildVersion("v19.2.0") {
   362  		run(`ALTER TABLE system.replication_stats %[1]s CONFIGURE ZONE %[2]s 'constraints: {"-rack=0"}'`)
   363  		run(`ALTER TABLE system.replication_constraint_stats %[1]s CONFIGURE ZONE %[2]s 'constraints: {"-rack=0"}'`)
   364  	}
   365  
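         	// Wait for all replicas to move off node 1.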
   366  	var lastReplCount int
   367  	if err := retry.ForDuration(2*time.Minute, func() error {
   368  		const query = `
   369  SELECT count(replicas)
   370    FROM crdb_internal.ranges
   371   WHERE array_position(replicas, 1) IS NOT NULL
   372  `
   373  		var count int
   374  		if err := db.QueryRow(query).Scan(&count); err != nil {
   375  			t.Fatal(err)
   376  		}
   377  		if count > 0 {
   378  			err := errors.Errorf("node 1 still has %d replicas", count)
   379  			if count != lastReplCount {
   380  				lastReplCount = count
   381  				t.l.Printf("%s\n", err)
   382  			}
   383  			return err
   384  		}
   385  		return nil
   386  	}); err != nil {
   387  		t.Fatal(err)
   388  	}
   389  
   390  	t.l.Printf("killing all nodes\n")
   391  	c.Stop(ctx)
   392  
   393  	// Restart node 1, but have it listen on a different port for internal
   394  	// connections. This will require node 1 to reach out to the other nodes in
   395  	// the cluster for gossip info.
   396  	err := c.RunE(ctx, c.Node(1),
   397  		`./cockroach start --insecure --background --store={store-dir} `+
   398  			`--log-dir={log-dir} --cache=10% --max-sql-memory=10% `+
   399  			`--listen-addr=:$[{pgport:1}+10000] --http-port=$[{pgport:1}+1] `+
   400  			`> {log-dir}/cockroach.stdout 2> {log-dir}/cockroach.stderr`)
   401  	if err != nil {
   402  		t.Fatal(err)
   403  	}
   404  
    405  	// Restart the other nodes. These nodes won't be able to talk to node 1 until
    406  	// node 1 talks to them (they have out-of-date address info for it). Node 1
    407  	// needs incoming gossip info in order to determine where range 1 is.
   408  	c.Start(ctx, t, c.Range(2, c.spec.NodeCount), args)
   409  
    410  	// We need to override DB connection creation to use the correct port for
    411  	// node 1. This is more complicated than it should be and reflects a
    412  	// limitation of the current infrastructure, which doesn't know about
    413  	// cockroach nodes started on non-standard ports.
   414  	g := newGossipUtil(ctx, c)
   415  	g.conn = func(ctx context.Context, i int) *gosql.DB {
   416  		if i != 1 {
   417  			return c.Conn(ctx, i)
   418  		}
   419  		url, err := url.Parse(c.ExternalPGUrl(ctx, c.Node(1))[0])
   420  		if err != nil {
   421  			t.Fatal(err)
   422  		}
   423  		host, port, err := net.SplitHostPort(url.Host)
   424  		if err != nil {
   425  			t.Fatal(err)
   426  		}
   427  		v, err := strconv.Atoi(port)
   428  		if err != nil {
   429  			t.Fatal(err)
   430  		}
   431  		url.Host = fmt.Sprintf("%s:%d", host, v+10000)
   432  		db, err := gosql.Open("postgres", url.String())
   433  		if err != nil {
   434  			t.Fatal(err)
   435  		}
   436  		return db
   437  	}
   438  
   439  	g.checkConnectedAndFunctional(ctx, t, c)
   440  
    441  	// Stop our special snowflake process, which won't be recognized by the test
    442  	// harness, and start it again the regular way.
   443  	c.Stop(ctx, c.Node(1))
   444  	c.Start(ctx, t, c.Node(1))
   445  }
   446  
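         // runCheckLocalityIPAddress starts each node with a locality-scoped advertise
         // address and verifies that crdb_internal.gossip_nodes reports the expected
         // advertise address for every node.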
   447  func runCheckLocalityIPAddress(ctx context.Context, t *test, c *cluster) {
   448  	c.Put(ctx, cockroach, "./cockroach")
   449  
   450  	externalIP := c.ExternalIP(ctx, c.Range(1, c.spec.NodeCount))
   451  
   452  	for i := 1; i <= c.spec.NodeCount; i++ {
   453  		if local {
   454  			externalIP[i-1] = "localhost"
   455  		}
   456  		extAddr := externalIP[i-1]
   457  
   458  		c.Start(ctx, t, c.Node(i), startArgs("--racks=1",
   459  			fmt.Sprintf("--args=--locality-advertise-addr=rack=0@%s", extAddr)))
   460  	}
   461  
   462  	rowCount := 0
   463  
   464  	for i := 1; i <= c.spec.NodeCount; i++ {
    465  		db := c.Conn(ctx, i)
   466  		defer db.Close()
   467  
   468  		rows, err := db.Query(
   469  			`SELECT node_id, advertise_address FROM crdb_internal.gossip_nodes`,
   470  		)
   471  		if err != nil {
   472  			t.Fatal(err)
   473  		}
   474  
   475  		for rows.Next() {
   476  			rowCount++
   477  			var nodeID int
   478  			var advertiseAddress string
   479  			if err := rows.Scan(&nodeID, &advertiseAddress); err != nil {
   480  				t.Fatal(err)
   481  			}
   482  
   483  			if local {
   484  				if !strings.Contains(advertiseAddress, "localhost") {
    485  					t.Fatal("Expected advertise address to contain localhost")
   486  				}
   487  			} else if exp := c.ExternalAddr(ctx, c.Node(nodeID))[0]; exp != advertiseAddress {
    488  				t.Fatalf("Advertise address is %s but expected %s", advertiseAddress, exp)
   489  			}
   490  		}
   491  	}
   492  	if rowCount <= 0 {
   493  		t.Fatal("No results for " +
   494  			"SELECT node_id, advertise_address FROM crdb_internal.gossip_nodes")
   495  	}
   496  }