github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/cmd/roachtest/network.go

     1  // Copyright 2018 The Cockroach Authors.
     2  //
     3  // Use of this software is governed by the Business Source License
     4  // included in the file licenses/BSL.txt.
     5  //
     6  // As of the Change Date specified in that file, in accordance with
     7  // the Business Source License, use of this software will be governed
     8  // by the Apache License, Version 2.0, included in the file
     9  // licenses/APL.txt.
    10  
    11  package main
    12  
    13  import (
    14  	"bytes"
    15  	"context"
    16  	"fmt"
    17  	"io/ioutil"
    18  	"time"
    19  
    20  	toxiproxy "github.com/Shopify/toxiproxy/client"
    21  	"github.com/cockroachdb/cockroach/pkg/util/httputil"
    22  	_ "github.com/lib/pq"
    23  )
    24  
    25  // runNetworkSanity is a sanity check that toxiproxy is set up correctly. It
    26  // injects latency between the nodes and verifies, by running `SELECT 1` on
    27  // each node, that the latency does not show up on the client connections.
    28  func runNetworkSanity(ctx context.Context, t *test, origC *cluster, nodes int) {
    29  	origC.Put(ctx, cockroach, "./cockroach", origC.All())
    30  	c, err := Toxify(ctx, origC, origC.All())
    31  	if err != nil {
    32  		t.Fatal(err)
    33  	}
    34  
    35  	c.Start(ctx, t, c.All())
    36  
    37  	db := c.Conn(ctx, 1) // unaffected by toxiproxy
    38  	defer db.Close()
    39  	waitForFullReplication(t, db)
    40  
    41  	// NB: we're generous with latency in this test because we're checking that
    42  	// the upstream connections aren't affected by latency below, but the fixed
    43  	// cost of starting the binary and processing the query is already close to
    44  	// 100ms.
    45  	const latency = 300 * time.Millisecond
    46  	for i := 1; i <= nodes; i++ {
    47  		// NB: these latencies only apply to connections *to* the node on
    48  		// which the toxic is active. That is, if n1 has a (down- or upstream)
    49  		// latency toxic of 100ms, then none of its outbound connections are
    50  		// affected, but any connections made to it by other nodes will be.
    51  		// In particular, it's difficult to simulate intricate network partitions
    52  		// as there's no way to activate toxics only for certain peers.
    53  		proxy := c.Proxy(i)
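        		// toxiproxy's latency toxic takes its "latency" attribute in
        		// milliseconds; half of the budget goes to each direction so that a
        		// round trip through the proxy pays roughly the full `latency`.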
    54  		if _, err := proxy.AddToxic("", "latency", "downstream", 1, toxiproxy.Attributes{
    55  			"latency": latency / (2 * time.Millisecond), // ms
    56  		}); err != nil {
    57  			t.Fatal(err)
    58  		}
    59  		if _, err := proxy.AddToxic("", "latency", "upstream", 1, toxiproxy.Attributes{
    60  			"latency": latency / (2 * time.Millisecond), // ms
    61  		}); err != nil {
    62  			t.Fatal(err)
    63  		}
    64  	}
    65  
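        	// The monitor watches the cluster nodes for unexpected deaths while the
        	// worker goroutine below runs the SQL workload.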
    66  	m := newMonitor(ctx, c.cluster, c.All())
    67  	m.Go(func(ctx context.Context) error {
    68  		c.Measure(ctx, 1, `SET CLUSTER SETTING trace.debug.enable = true`)
    69  		c.Measure(ctx, 1, "CREATE DATABASE test")
    70  		c.Measure(ctx, 1, `CREATE TABLE test.commit (a INT, b INT, v INT, PRIMARY KEY (a, b))`)
    71  
    72  		for i := 0; i < 10; i++ {
    73  			duration := c.Measure(ctx, 1, fmt.Sprintf(
    74  				"BEGIN; INSERT INTO test.commit VALUES (2, %[1]d), (1, %[1]d), (3, %[1]d); COMMIT",
    75  				i,
    76  			))
    77  			c.l.Printf("%s\n", duration)
    78  		}
    79  
    80  		c.Measure(ctx, 1, `
    81  SET TRACING = on;
    82  INSERT INTO test.commit VALUES (3, 1000), (1, 1000), (2, 1000);
    83  SELECT age, message FROM [SHOW TRACE FOR SESSION];
    84  `)
    85  
    86  		for i := 1; i <= origC.spec.NodeCount; i++ {
    87  			if dur := c.Measure(ctx, i, `SELECT 1`); dur > latency {
    88  				t.Fatalf("node %d unexpectedly affected by latency: select 1 took %.2fs", i, dur.Seconds())
    89  			}
    90  		}
    91  
    92  		return nil
    93  	})
    94  
    95  	m.Wait()
    96  }
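
        // The loop in runNetworkSanity splits the latency budget between a downstream
        // and an upstream toxic so that a full round trip through a node's proxy pays
        // roughly the whole budget. The helper below is a minimal illustrative sketch
        // of that pattern; it is hypothetical (not used by the tests in this file) and
        // relies only on the toxiproxy client calls already exercised above.
        func addSymmetricLatency(proxy *toxiproxy.Proxy, total time.Duration) error {
        	// toxiproxy's latency toxic expects milliseconds.
        	half := total / (2 * time.Millisecond)
        	for _, direction := range []string{"downstream", "upstream"} {
        		// An empty name lets toxiproxy assign a default name to the toxic.
        		if _, err := proxy.AddToxic("", "latency", direction, 1, toxiproxy.Attributes{
        			"latency": half,
        		}); err != nil {
        			return err
        		}
        	}
        	return nil
        }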
    97  
    98  func runNetworkTPCC(ctx context.Context, t *test, origC *cluster, nodes int) {
    99  	n := origC.spec.NodeCount
   100  	serverNodes, workerNode := origC.Range(1, n-1), origC.Node(n)
   101  	origC.Put(ctx, cockroach, "./cockroach", origC.All())
   102  	origC.Put(ctx, workload, "./workload", origC.All())
   103  
   104  	c, err := Toxify(ctx, origC, serverNodes)
   105  	if err != nil {
   106  		t.Fatal(err)
   107  	}
   108  
   109  	const warehouses = 1
   110  	c.Start(ctx, t, serverNodes)
   111  	c.Run(ctx, workerNode, fmt.Sprintf(
   112  		`./workload fixtures load tpcc --warehouses=%d {pgurl:1}`, warehouses,
   113  	))
   114  
   115  	db := c.Conn(ctx, 1)
   116  	defer db.Close()
   117  	waitForFullReplication(t, db)
   118  
   119  	duration := time.Hour
   120  	if local {
   121  		// NB: with this duration we're really just testing the test; it won't
   122  		// be able to detect slow goroutine leaks.
   123  		duration = 5 * time.Minute
   124  	}
   125  
   126  	// Run TPCC, but don't give it the first node, which will be partitioned below (or it basically won't do anything).
   127  	m := newMonitor(ctx, c.cluster, serverNodes)
   128  
   129  	m.Go(func(ctx context.Context) error {
   130  		t.WorkerStatus("running tpcc")
   131  
   132  		cmd := fmt.Sprintf(
   133  			"./workload run tpcc --warehouses=%d --wait=false"+
   134  				" --histograms="+perfArtifactsDir+"/stats.json"+
   135  				" --duration=%s {pgurl:2-%d}",
   136  			warehouses, duration, c.spec.NodeCount-1)
   137  		return c.RunE(ctx, workerNode, cmd)
   138  	})
   139  
   140  	checkGoroutines := func(ctx context.Context) int {
   141  		// NB: at the time of writing, the goroutine count would quickly
   142  		// stabilize near 230 when the network is partitioned, and around 270
   143  		// when it isn't. Experimentally a past "slow" goroutine leak leaked ~3
   144  		// goroutines every minute (though it would likely be more with the tpcc
   145  		// workload above), which over the duration of an hour would easily push
   146  		// us over the threshold.
   147  		const thresh = 350
   148  
   149  		uiAddrs := c.ExternalAdminUIAddr(ctx, serverNodes)
   150  		var maxSeen int
   151  		// The goroutine dump may take a while to generate, maybe more
   152  		// than the 3 second timeout of the default http client.
   153  		httpClient := httputil.NewClientWithTimeout(15 * time.Second)
   154  		for _, addr := range uiAddrs {
   155  			url := "http://" + addr + "/debug/pprof/goroutine?debug=2"
   156  			resp, err := httpClient.Get(ctx, url)
   157  			if err != nil {
   158  				t.Fatal(err)
   159  			}
   160  			content, err := ioutil.ReadAll(resp.Body)
   161  			resp.Body.Close()
   162  			if err != nil {
   163  				t.Fatal(err)
   164  			}
   165  			numGoroutines := bytes.Count(content, []byte("goroutine "))
   166  			if numGoroutines >= thresh {
   167  				t.Fatalf("%s shows %d goroutines (expected <%d)", url, numGoroutines, thresh)
   168  			}
   169  			if maxSeen < numGoroutines {
   170  				maxSeen = numGoroutines
   171  			}
   172  		}
   173  		return maxSeen
   174  	}
   175  
   176  	m.Go(func(ctx context.Context) error {
   177  		time.Sleep(10 * time.Second) // give tpcc a head start
   178  		// Give n1 a network partition from the remainder of the cluster. Note that even though it affects
   179  		// both the "upstream" and "downstream" directions, this is in fact an asymmetric partition since
   180  		// it only affects connections *to* the node. n1 itself can connect to the cluster just fine.
   181  		proxy := c.Proxy(1)
   182  		c.l.Printf("letting inbound traffic to first node time out")
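        		// A timeout toxic with timeout=0 delays all data indefinitely without
        		// closing the connection, so inbound connections to n1 hang rather than
        		// erroring out quickly.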
   183  		for _, direction := range []string{"upstream", "downstream"} {
   184  			if _, err := proxy.AddToxic("", "timeout", direction, 1, toxiproxy.Attributes{
   185  				"timeout": 0, // forever
   186  			}); err != nil {
   187  				t.Fatal(err)
   188  			}
   189  		}
   190  
   191  		t.WorkerStatus("checking goroutines")
   192  		done := time.After(duration)
   193  		var maxSeen int
   194  		for {
   195  			cur := checkGoroutines(ctx)
   196  			if maxSeen < cur {
   197  				c.l.Printf("new goroutine peak: %d", cur)
   198  				maxSeen = cur
   199  			}
   200  
   201  			select {
   202  			case <-done:
   203  				c.l.Printf("done checking goroutines, repairing network")
   204  				// Repair the network. Note that the TPCC workload would never
   205  				// finish (despite the duration) without this. In particular,
   206  				// we don't want to m.Wait() before we do this.
   207  				toxics, err := proxy.Toxics()
   208  				if err != nil {
   209  					t.Fatal(err)
   210  				}
   211  				for _, toxic := range toxics {
   212  					if err := proxy.RemoveToxic(toxic.Name); err != nil {
   213  						t.Fatal(err)
   214  					}
   215  				}
   216  				c.l.Printf("network is repaired")
   217  
   218  				// Verify that goroutine count doesn't spike.
   219  				for i := 0; i < 20; i++ {
   220  					nowGoroutines := checkGoroutines(ctx)
   221  					c.l.Printf("currently at most %d goroutines per node", nowGoroutines)
   222  					time.Sleep(time.Second)
   223  				}
   224  
   225  				return nil
   226  			default:
   227  				time.Sleep(3 * time.Second)
   228  			}
   229  		}
   230  	})
   231  
   232  	m.Wait()
   233  }
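
        // For reference, the partition-and-repair cycle above boils down to the
        // pattern sketched below. The helper is hypothetical (not used by the test)
        // and assumes only the toxiproxy client calls already used in this file:
        // AddToxic, Toxics, and RemoveToxic.
        func partitionAndRepair(proxy *toxiproxy.Proxy, hold func()) error {
        	// Cut off all inbound traffic to the proxied node. A timeout of 0 holds
        	// data indefinitely instead of closing the connection.
        	for _, direction := range []string{"upstream", "downstream"} {
        		if _, err := proxy.AddToxic("", "timeout", direction, 1, toxiproxy.Attributes{
        			"timeout": 0,
        		}); err != nil {
        			return err
        		}
        	}

        	// Let the caller run its workload or checks while the partition holds.
        	hold()

        	// Repair the network by removing every installed toxic.
        	toxics, err := proxy.Toxics()
        	if err != nil {
        		return err
        	}
        	for _, toxic := range toxics {
        		if err := proxy.RemoveToxic(toxic.Name); err != nil {
        			return err
        		}
        	}
        	return nil
        }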
   234  
   235  func registerNetwork(r *testRegistry) {
   236  	const numNodes = 4
   237  
   238  	r.Add(testSpec{
   239  		Name:    fmt.Sprintf("network/sanity/nodes=%d", numNodes),
   240  		Owner:   OwnerKV,
   241  		Cluster: makeClusterSpec(numNodes),
   242  		Run: func(ctx context.Context, t *test, c *cluster) {
   243  			runNetworkSanity(ctx, t, c, numNodes)
   244  		},
   245  	})
   246  	r.Add(testSpec{
   247  		Name:    fmt.Sprintf("network/tpcc/nodes=%d", numNodes),
   248  		Owner:   OwnerKV,
   249  		Cluster: makeClusterSpec(numNodes),
   250  		Run: func(ctx context.Context, t *test, c *cluster) {
   251  			runNetworkTPCC(ctx, t, c, numNodes)
   252  		},
   253  	})
   254  }