github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/cmd/roachtest/cluster_init.go (about) 1 // Copyright 2018 The Cockroach Authors. 2 // 3 // Use of this software is governed by the Business Source License 4 // included in the file licenses/BSL.txt. 5 // 6 // As of the Change Date specified in that file, in accordance with 7 // the Business Source License, use of this software will be governed 8 // by the Apache License, Version 2.0, included in the file 9 // licenses/APL.txt. 10 11 package main 12 13 import ( 14 "context" 15 gosql "database/sql" 16 "fmt" 17 "io/ioutil" 18 "net/http" 19 "strings" 20 "time" 21 22 "github.com/cockroachdb/cockroach/pkg/server" 23 "github.com/cockroachdb/cockroach/pkg/server/serverpb" 24 "github.com/cockroachdb/cockroach/pkg/util/httputil" 25 "github.com/cockroachdb/cockroach/pkg/util/retry" 26 "golang.org/x/sync/errgroup" 27 ) 28 29 func runClusterInit(ctx context.Context, t *test, c *cluster) { 30 c.Put(ctx, cockroach, "./cockroach") 31 32 addrs := c.InternalAddr(ctx, c.All()) 33 34 // TODO(tbg): this should never happen, but I saw it locally. The result 35 // is the test hanging forever, because all nodes will create their own 36 // single node cluster and waitForFullReplication never returns. 37 if addrs[0] == "" { 38 t.Fatal("no address for first node") 39 } 40 41 // Legacy-style init where we start node 1 without a join flag and then point 42 // the other nodes at it. 43 func() { 44 var g errgroup.Group 45 g.Go(func() error { 46 return c.RunE(ctx, c.Node(1), 47 `mkdir -p {log-dir} && `+ 48 `./cockroach start --insecure --background --store={store-dir} `+ 49 `--log-dir={log-dir} --cache=10% --max-sql-memory=10% `+ 50 `--listen-addr=:{pgport:1} --http-port=$[{pgport:1}+1] `+ 51 `> {log-dir}/cockroach.stdout 2> {log-dir}/cockroach.stderr`) 52 }) 53 for i := 2; i <= c.spec.NodeCount; i++ { 54 i := i 55 g.Go(func() error { 56 return c.RunE(ctx, c.Node(i), 57 fmt.Sprintf( 58 `mkdir -p {log-dir} && `+ 59 `./cockroach start --insecure --background --store={store-dir} `+ 60 `--log-dir={log-dir} --cache=10%% --max-sql-memory=10%% `+ 61 `--listen-addr=:{pgport:%[1]d} --http-port=$[{pgport:%[1]d}+1] `+ 62 `--join=`+addrs[0]+ 63 `> {log-dir}/cockroach.stdout 2> {log-dir}/cockroach.stderr`, i)) 64 }) 65 } 66 if err := g.Wait(); err != nil { 67 t.Fatal(err) 68 } 69 70 db := c.Conn(ctx, 1) 71 defer db.Close() 72 waitForFullReplication(t, db) 73 }() 74 75 // New-style init where we start all nodes with the same join flags and then 76 // issue an "init" command to one of the nodes. 77 for _, initNode := range []int{1, 2} { 78 c.Wipe(ctx) 79 80 func() { 81 var g errgroup.Group 82 for i := 1; i <= c.spec.NodeCount; i++ { 83 i := i 84 g.Go(func() error { 85 return c.RunE(ctx, c.Node(i), 86 fmt.Sprintf( 87 `mkdir -p {log-dir} && `+ 88 `./cockroach start --insecure --background --store={store-dir} `+ 89 `--log-dir={log-dir} --cache=10%% --max-sql-memory=10%% `+ 90 `--listen-addr=:{pgport:%[1]d} --http-port=$[{pgport:%[1]d}+1] `+ 91 `--join=`+strings.Join(addrs, ",")+ 92 `> {log-dir}/cockroach.stdout 2> {log-dir}/cockroach.stderr`, i)) 93 }) 94 } 95 96 urlMap := make(map[int]string) 97 for i, addr := range c.ExternalAdminUIAddr(ctx, c.All()) { 98 urlMap[i+1] = `http://` + addr 99 } 100 101 // Wait for the servers to bind their ports. 102 if err := retry.ForDuration(10*time.Second, func() error { 103 for i := 1; i <= c.spec.NodeCount; i++ { 104 resp, err := httputil.Get(ctx, urlMap[i]+"/health") 105 if err != nil { 106 return err 107 } 108 resp.Body.Close() 109 } 110 return nil 111 }); err != nil { 112 t.Fatal(err) 113 } 114 115 var dbs []*gosql.DB 116 for i := 1; i <= c.spec.NodeCount; i++ { 117 db := c.Conn(ctx, i) 118 defer db.Close() 119 dbs = append(dbs, db) 120 } 121 122 // Initially, we can connect to any node, but queries issued will hang. 123 errCh := make(chan error, len(dbs)) 124 for _, db := range dbs { 125 db := db 126 go func() { 127 var val int 128 errCh <- db.QueryRow("SELECT 1").Scan(&val) 129 }() 130 } 131 132 // Give them time to get a "connection refused" or similar error if 133 // the server isn't listening. 134 time.Sleep(time.Second) 135 select { 136 case err := <-errCh: 137 t.Fatalf("query finished prematurely with err %v", err) 138 default: 139 } 140 141 // Check that the /health endpoint is functional even before cluster init, 142 // whereas other debug endpoints return an appropriate error. 143 httpTests := []struct { 144 endpoint string 145 expectedStatus int 146 }{ 147 {"/health", http.StatusOK}, 148 {"/health?ready=1", http.StatusServiceUnavailable}, 149 {"/_status/nodes", http.StatusNotFound}, 150 } 151 for _, tc := range httpTests { 152 for _, withCookie := range []bool{false, true} { 153 req, err := http.NewRequest("GET", urlMap[1]+tc.endpoint, nil /* body */) 154 if err != nil { 155 t.Fatalf("unexpected error while constructing request for %s: %s", tc.endpoint, err) 156 } 157 if withCookie { 158 // Prevent regression of #25771 by also sending authenticated 159 // requests, like would be sent if an admin UI were open against 160 // this node while it booted. 161 cookie, err := server.EncodeSessionCookie(&serverpb.SessionCookie{ 162 // The actual contents of the cookie don't matter; the presence of 163 // a valid encoded cookie is enough to trigger the authentication 164 // code paths. 165 }, false /* forHTTPSOnly - cluster is insecure */) 166 if err != nil { 167 t.Fatal(err) 168 } 169 req.AddCookie(cookie) 170 } 171 resp, err := http.DefaultClient.Do(req) 172 if err != nil { 173 t.Fatalf("unexpected error hitting %s endpoint: %v", tc.endpoint, err) 174 } 175 defer resp.Body.Close() 176 if resp.StatusCode != tc.expectedStatus { 177 bodyBytes, _ := ioutil.ReadAll(resp.Body) 178 t.Fatalf("unexpected response code %d (expected %d) hitting %s endpoint: %v", 179 resp.StatusCode, tc.expectedStatus, tc.endpoint, string(bodyBytes)) 180 } 181 } 182 183 } 184 185 c.Run(ctx, c.Node(initNode), 186 fmt.Sprintf(`./cockroach init --insecure --port={pgport:%d}`, initNode)) 187 if err := g.Wait(); err != nil { 188 t.Fatal(err) 189 } 190 191 // This will only succeed if 3 nodes joined the cluster. 192 waitForFullReplication(t, dbs[0]) 193 194 execCLI := func(runNode int, extraArgs ...string) (string, error) { 195 args := []string{"./cockroach"} 196 args = append(args, extraArgs...) 197 args = append(args, "--insecure") 198 args = append(args, fmt.Sprintf("--port={pgport:%d}", runNode)) 199 buf, err := c.RunWithBuffer(ctx, c.l, c.Node(runNode), args...) 200 t.l.Printf("%s\n", buf) 201 return string(buf), err 202 } 203 204 { 205 // Make sure that running init again returns the expected error message and 206 // does not break the cluster. We have to use ExecCLI rather than OneShot in 207 // order to actually get the output from the command. 208 if output, err := execCLI(initNode, "init"); err == nil { 209 t.Fatalf("expected error running init command on initialized cluster\n%s", output) 210 } else if !strings.Contains(output, "cluster has already been initialized") { 211 t.Fatalf("unexpected output when running init command on initialized cluster: %v\n%s", 212 err, output) 213 } 214 } 215 216 // Once initialized, the queries we started earlier will finish. 217 deadline := time.After(10 * time.Second) 218 for i := 0; i < len(dbs); i++ { 219 select { 220 case err := <-errCh: 221 if err != nil { 222 t.Fatalf("querying node %d: %s", i, err) 223 } 224 case <-deadline: 225 t.Fatalf("timed out waiting for query %d", i) 226 } 227 } 228 229 // New queries will work too. 230 for i, db := range dbs { 231 var val int 232 if err := db.QueryRow("SELECT 1").Scan(&val); err != nil { 233 t.Fatalf("querying node %d: %s", i, err) 234 } 235 } 236 }() 237 } 238 }