github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/cmd/roachtest/network.go

// Copyright 2018 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

package main

import (
	"bytes"
	"context"
	"fmt"
	"io/ioutil"
	"time"

	toxiproxy "github.com/Shopify/toxiproxy/client"
	"github.com/cockroachdb/cockroach/pkg/util/httputil"
	_ "github.com/lib/pq"
)

// runNetworkSanity is just a sanity check to make sure we're setting up
// toxiproxy correctly. It injects latency between the nodes and verifies that
// the latency is not visible to the client connection running `SELECT 1` on
// each node.
func runNetworkSanity(ctx context.Context, t *test, origC *cluster, nodes int) {
	origC.Put(ctx, cockroach, "./cockroach", origC.All())
	c, err := Toxify(ctx, origC, origC.All())
	if err != nil {
		t.Fatal(err)
	}

	c.Start(ctx, t, c.All())

	db := c.Conn(ctx, 1) // unaffected by toxiproxy
	defer db.Close()
	waitForFullReplication(t, db)

	// NB: we're generous with latency in this test because we're checking
	// below that the upstream connections aren't affected by it, yet the fixed
	// cost of starting the binary and processing the query is already close to
	// 100ms.
	const latency = 300 * time.Millisecond
	for i := 1; i <= nodes; i++ {
		// NB: these latencies only apply to connections *to* the node on which
		// the toxic is active. That is, if n1 has a (down- or upstream) latency
		// toxic of 100ms, then none of its outbound connections are affected,
		// but any connection made to it by another node will be. In particular,
		// it's difficult to simulate intricate network partitions as there's no
		// way to activate toxics only for certain peers.
		proxy := c.Proxy(i)
		if _, err := proxy.AddToxic("", "latency", "downstream", 1, toxiproxy.Attributes{
			"latency": latency / (2 * time.Millisecond), // ms
		}); err != nil {
			t.Fatal(err)
		}
		if _, err := proxy.AddToxic("", "latency", "upstream", 1, toxiproxy.Attributes{
			"latency": latency / (2 * time.Millisecond), // ms
		}); err != nil {
			t.Fatal(err)
		}
	}

	m := newMonitor(ctx, c.cluster, c.All())
	m.Go(func(ctx context.Context) error {
		c.Measure(ctx, 1, `SET CLUSTER SETTING trace.debug.enable = true`)
		c.Measure(ctx, 1, "CREATE DATABASE test")
		c.Measure(ctx, 1, `CREATE TABLE test.commit (a INT, b INT, v INT, PRIMARY KEY (a, b))`)

		for i := 0; i < 10; i++ {
			duration := c.Measure(ctx, 1, fmt.Sprintf(
				"BEGIN; INSERT INTO test.commit VALUES (2, %[1]d), (1, %[1]d), (3, %[1]d); COMMIT",
				i,
			))
			c.l.Printf("%s\n", duration)
		}

		c.Measure(ctx, 1, `
set tracing=on;
insert into test.commit values(3,1000), (1,1000), (2,1000);
select age, message from [ show trace for session ];
`)

		for i := 1; i <= origC.spec.NodeCount; i++ {
			if dur := c.Measure(ctx, i, `SELECT 1`); dur > latency {
				t.Fatalf("node %d unexpectedly affected by latency: select 1 took %.2fs", i, dur.Seconds())
			}
		}

		return nil
	})

	m.Wait()
}

// runNetworkTPCC runs a TPC-C workload from the last node against all server
// nodes except the first, makes inbound connections to the first node time out
// via toxiproxy, and verifies that the goroutine count on the server nodes
// stays below a threshold both during the partition and after it is repaired.
func runNetworkTPCC(ctx context.Context, t *test, origC *cluster, nodes int) {
	n := origC.spec.NodeCount
	serverNodes, workerNode := origC.Range(1, n-1), origC.Node(n)
	origC.Put(ctx, cockroach, "./cockroach", origC.All())
	origC.Put(ctx, workload, "./workload", origC.All())

	c, err := Toxify(ctx, origC, serverNodes)
	if err != nil {
		t.Fatal(err)
	}

	const warehouses = 1
	c.Start(ctx, t, serverNodes)
	c.Run(ctx, workerNode, fmt.Sprintf(
		`./workload fixtures load tpcc --warehouses=%d {pgurl:1}`, warehouses,
	))

	db := c.Conn(ctx, 1)
	defer db.Close()
	waitForFullReplication(t, db)

	duration := time.Hour
	if local {
		// NB: with this duration we're really just testing the test; it won't
		// be able to detect slow goroutine leaks.
		duration = 5 * time.Minute
	}

	// Run TPCC, but don't give it the first node (or it basically won't do
	// anything).
	m := newMonitor(ctx, c.cluster, serverNodes)

	m.Go(func(ctx context.Context) error {
		t.WorkerStatus("running tpcc")

		cmd := fmt.Sprintf(
			"./workload run tpcc --warehouses=%d --wait=false"+
				" --histograms="+perfArtifactsDir+"/stats.json"+
				" --duration=%s {pgurl:2-%d}",
			warehouses, duration, c.spec.NodeCount-1)
		return c.RunE(ctx, workerNode, cmd)
	})

	checkGoroutines := func(ctx context.Context) int {
		// NB: at the time of writing, the goroutine count would quickly
		// stabilize near 230 when the network is partitioned, and around 270
		// when it isn't. Experimentally, a past "slow" goroutine leak leaked ~3
		// goroutines every minute (though it would likely be more with the tpcc
		// workload above), which over the duration of an hour would easily push
		// us over the threshold.
		const thresh = 350

		uiAddrs := c.ExternalAdminUIAddr(ctx, serverNodes)
		var maxSeen int
		// The goroutine dump may take a while to generate, maybe more
		// than the 3 second timeout of the default http client.
		httpClient := httputil.NewClientWithTimeout(15 * time.Second)
		for _, addr := range uiAddrs {
			url := "http://" + addr + "/debug/pprof/goroutine?debug=2"
			resp, err := httpClient.Get(ctx, url)
			if err != nil {
				t.Fatal(err)
			}
			content, err := ioutil.ReadAll(resp.Body)
			resp.Body.Close()
			if err != nil {
				t.Fatal(err)
			}
			numGoroutines := bytes.Count(content, []byte("goroutine "))
			if numGoroutines >= thresh {
				t.Fatalf("%s shows %d goroutines (expected <%d)", url, numGoroutines, thresh)
			}
			if maxSeen < numGoroutines {
				maxSeen = numGoroutines
			}
		}
		return maxSeen
	}

	m.Go(func(ctx context.Context) error {
		time.Sleep(10 * time.Second) // give tpcc a head start
		// Give n1 a network partition from the remainder of the cluster. Note
		// that even though it affects both the "upstream" and "downstream"
		// directions, this is in fact an asymmetric partition since it only
		// affects connections *to* the node. n1 itself can connect to the
		// cluster just fine.
		proxy := c.Proxy(1)
		c.l.Printf("letting inbound traffic to first node time out")
		for _, direction := range []string{"upstream", "downstream"} {
			if _, err := proxy.AddToxic("", "timeout", direction, 1, toxiproxy.Attributes{
				"timeout": 0, // forever
			}); err != nil {
				t.Fatal(err)
			}
		}

		t.WorkerStatus("checking goroutines")
		done := time.After(duration)
		var maxSeen int
		for {
			cur := checkGoroutines(ctx)
			if maxSeen < cur {
				c.l.Printf("new goroutine peak: %d", cur)
				maxSeen = cur
			}

			select {
			case <-done:
				c.l.Printf("done checking goroutines, repairing network")
				// Repair the network. Note that the TPCC workload would never
				// finish (despite the duration) without this. In particular,
				// we don't want to m.Wait() before we do this.
				toxics, err := proxy.Toxics()
				if err != nil {
					t.Fatal(err)
				}
				for _, toxic := range toxics {
					if err := proxy.RemoveToxic(toxic.Name); err != nil {
						t.Fatal(err)
					}
				}
				c.l.Printf("network is repaired")

				// Verify that goroutine count doesn't spike.
				for i := 0; i < 20; i++ {
					nowGoroutines := checkGoroutines(ctx)
					c.l.Printf("currently at most %d goroutines per node", nowGoroutines)
					time.Sleep(time.Second)
				}

				return nil
			default:
				time.Sleep(3 * time.Second)
			}
		}
	})

	m.Wait()
}

func registerNetwork(r *testRegistry) {
	const numNodes = 4

	r.Add(testSpec{
		Name:    fmt.Sprintf("network/sanity/nodes=%d", numNodes),
		Owner:   OwnerKV,
		Cluster: makeClusterSpec(numNodes),
		Run: func(ctx context.Context, t *test, c *cluster) {
			runNetworkSanity(ctx, t, c, numNodes)
		},
	})
	r.Add(testSpec{
		Name:    fmt.Sprintf("network/tpcc/nodes=%d", numNodes),
		Owner:   OwnerKV,
		Cluster: makeClusterSpec(numNodes),
		Run: func(ctx context.Context, t *test, c *cluster) {
			runNetworkTPCC(ctx, t, c, numNodes)
		},
	})
}
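
// exampleLatencyToxic is an illustrative sketch, not used by the tests above,
// of the raw Shopify toxiproxy client calls that the Toxify/c.Proxy helpers
// wrap. It assumes a toxiproxy daemon reachable at localhost:8474; the proxy
// name and the listen/upstream addresses are placeholders. It installs the
// same kind of symmetric latency toxic that runNetworkSanity adds on every
// node.
func exampleLatencyToxic(latency time.Duration) error {
	// Talk to the toxiproxy HTTP API (default port 8474).
	client := toxiproxy.NewClient("http://localhost:8474")
	// Route traffic arriving on :26267 to a hypothetical CockroachDB node
	// listening on :26257; peers would be pointed at the proxy port instead
	// of the node's own port.
	proxy, err := client.CreateProxy("crdb-n1", "localhost:26267", "localhost:26257")
	if err != nil {
		return err
	}
	// The latency attribute is in milliseconds; splitting the budget across
	// both directions mirrors the loop in runNetworkSanity.
	for _, direction := range []string{"downstream", "upstream"} {
		if _, err := proxy.AddToxic("", "latency", direction, 1, toxiproxy.Attributes{
			"latency": latency / (2 * time.Millisecond), // ms
		}); err != nil {
			return err
		}
	}
	return nil
}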