github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/cmd/roachtest/cluster_init.go (about)

     1  // Copyright 2018 The Cockroach Authors.
     2  //
     3  // Use of this software is governed by the Business Source License
     4  // included in the file licenses/BSL.txt.
     5  //
     6  // As of the Change Date specified in that file, in accordance with
     7  // the Business Source License, use of this software will be governed
     8  // by the Apache License, Version 2.0, included in the file
     9  // licenses/APL.txt.
    10  
    11  package main
    12  
    13  import (
    14  	"context"
    15  	gosql "database/sql"
    16  	"fmt"
    17  	"io/ioutil"
    18  	"net/http"
    19  	"strings"
    20  	"time"
    21  
    22  	"github.com/cockroachdb/cockroach/pkg/server"
    23  	"github.com/cockroachdb/cockroach/pkg/server/serverpb"
    24  	"github.com/cockroachdb/cockroach/pkg/util/httputil"
    25  	"github.com/cockroachdb/cockroach/pkg/util/retry"
    26  	"golang.org/x/sync/errgroup"
    27  )
    28  
    29  func runClusterInit(ctx context.Context, t *test, c *cluster) {
    30  	c.Put(ctx, cockroach, "./cockroach")
    31  
    32  	addrs := c.InternalAddr(ctx, c.All())
    33  
    34  	// TODO(tbg): this should never happen, but I saw it locally. The result
    35  	// is the test hanging forever, because all nodes will create their own
    36  	// single node cluster and waitForFullReplication never returns.
    37  	if addrs[0] == "" {
    38  		t.Fatal("no address for first node")
    39  	}
    40  
    41  	// Legacy-style init where we start node 1 without a join flag and then point
    42  	// the other nodes at it.
    43  	func() {
    44  		var g errgroup.Group
    45  		g.Go(func() error {
    46  			return c.RunE(ctx, c.Node(1),
    47  				`mkdir -p {log-dir} && `+
    48  					`./cockroach start --insecure --background --store={store-dir} `+
    49  					`--log-dir={log-dir} --cache=10% --max-sql-memory=10% `+
    50  					`--listen-addr=:{pgport:1} --http-port=$[{pgport:1}+1] `+
    51  					`> {log-dir}/cockroach.stdout 2> {log-dir}/cockroach.stderr`)
    52  		})
    53  		for i := 2; i <= c.spec.NodeCount; i++ {
    54  			i := i
    55  			g.Go(func() error {
    56  				return c.RunE(ctx, c.Node(i),
    57  					fmt.Sprintf(
    58  						`mkdir -p {log-dir} && `+
    59  							`./cockroach start --insecure --background --store={store-dir} `+
    60  							`--log-dir={log-dir} --cache=10%% --max-sql-memory=10%% `+
    61  							`--listen-addr=:{pgport:%[1]d} --http-port=$[{pgport:%[1]d}+1] `+
    62  							`--join=`+addrs[0]+
    63  							`> {log-dir}/cockroach.stdout 2> {log-dir}/cockroach.stderr`, i))
    64  			})
    65  		}
    66  		if err := g.Wait(); err != nil {
    67  			t.Fatal(err)
    68  		}
    69  
    70  		db := c.Conn(ctx, 1)
    71  		defer db.Close()
    72  		waitForFullReplication(t, db)
    73  	}()
    74  
    75  	// New-style init where we start all nodes with the same join flags and then
    76  	// issue an "init" command to one of the nodes.
    77  	for _, initNode := range []int{1, 2} {
    78  		c.Wipe(ctx)
    79  
    80  		func() {
    81  			var g errgroup.Group
    82  			for i := 1; i <= c.spec.NodeCount; i++ {
    83  				i := i
    84  				g.Go(func() error {
    85  					return c.RunE(ctx, c.Node(i),
    86  						fmt.Sprintf(
    87  							`mkdir -p {log-dir} && `+
    88  								`./cockroach start --insecure --background --store={store-dir} `+
    89  								`--log-dir={log-dir} --cache=10%% --max-sql-memory=10%% `+
    90  								`--listen-addr=:{pgport:%[1]d} --http-port=$[{pgport:%[1]d}+1] `+
    91  								`--join=`+strings.Join(addrs, ",")+
    92  								`> {log-dir}/cockroach.stdout 2> {log-dir}/cockroach.stderr`, i))
    93  				})
    94  			}
    95  
    96  			urlMap := make(map[int]string)
    97  			for i, addr := range c.ExternalAdminUIAddr(ctx, c.All()) {
    98  				urlMap[i+1] = `http://` + addr
    99  			}
   100  
   101  			// Wait for the servers to bind their ports.
   102  			if err := retry.ForDuration(10*time.Second, func() error {
   103  				for i := 1; i <= c.spec.NodeCount; i++ {
   104  					resp, err := httputil.Get(ctx, urlMap[i]+"/health")
   105  					if err != nil {
   106  						return err
   107  					}
   108  					resp.Body.Close()
   109  				}
   110  				return nil
   111  			}); err != nil {
   112  				t.Fatal(err)
   113  			}
   114  
   115  			var dbs []*gosql.DB
   116  			for i := 1; i <= c.spec.NodeCount; i++ {
   117  				db := c.Conn(ctx, i)
   118  				defer db.Close()
   119  				dbs = append(dbs, db)
   120  			}
   121  
   122  			// Initially, we can connect to any node, but queries issued will hang.
   123  			errCh := make(chan error, len(dbs))
   124  			for _, db := range dbs {
   125  				db := db
   126  				go func() {
   127  					var val int
   128  					errCh <- db.QueryRow("SELECT 1").Scan(&val)
   129  				}()
   130  			}
   131  
   132  			// Give them time to get a "connection refused" or similar error if
   133  			// the server isn't listening.
   134  			time.Sleep(time.Second)
   135  			select {
   136  			case err := <-errCh:
   137  				t.Fatalf("query finished prematurely with err %v", err)
   138  			default:
   139  			}
   140  
   141  			// Check that the /health endpoint is functional even before cluster init,
   142  			// whereas other debug endpoints return an appropriate error.
   143  			httpTests := []struct {
   144  				endpoint       string
   145  				expectedStatus int
   146  			}{
   147  				{"/health", http.StatusOK},
   148  				{"/health?ready=1", http.StatusServiceUnavailable},
   149  				{"/_status/nodes", http.StatusNotFound},
   150  			}
   151  			for _, tc := range httpTests {
   152  				for _, withCookie := range []bool{false, true} {
   153  					req, err := http.NewRequest("GET", urlMap[1]+tc.endpoint, nil /* body */)
   154  					if err != nil {
   155  						t.Fatalf("unexpected error while constructing request for %s: %s", tc.endpoint, err)
   156  					}
   157  					if withCookie {
   158  						// Prevent regression of #25771 by also sending authenticated
   159  						// requests, like would be sent if an admin UI were open against
   160  						// this node while it booted.
   161  						cookie, err := server.EncodeSessionCookie(&serverpb.SessionCookie{
   162  							// The actual contents of the cookie don't matter; the presence of
   163  							// a valid encoded cookie is enough to trigger the authentication
   164  							// code paths.
   165  						}, false /* forHTTPSOnly - cluster is insecure */)
   166  						if err != nil {
   167  							t.Fatal(err)
   168  						}
   169  						req.AddCookie(cookie)
   170  					}
   171  					resp, err := http.DefaultClient.Do(req)
   172  					if err != nil {
   173  						t.Fatalf("unexpected error hitting %s endpoint: %v", tc.endpoint, err)
   174  					}
   175  					defer resp.Body.Close()
   176  					if resp.StatusCode != tc.expectedStatus {
   177  						bodyBytes, _ := ioutil.ReadAll(resp.Body)
   178  						t.Fatalf("unexpected response code %d (expected %d) hitting %s endpoint: %v",
   179  							resp.StatusCode, tc.expectedStatus, tc.endpoint, string(bodyBytes))
   180  					}
   181  				}
   182  
   183  			}
   184  
   185  			c.Run(ctx, c.Node(initNode),
   186  				fmt.Sprintf(`./cockroach init --insecure --port={pgport:%d}`, initNode))
   187  			if err := g.Wait(); err != nil {
   188  				t.Fatal(err)
   189  			}
   190  
   191  			// This will only succeed if 3 nodes joined the cluster.
   192  			waitForFullReplication(t, dbs[0])
   193  
   194  			execCLI := func(runNode int, extraArgs ...string) (string, error) {
   195  				args := []string{"./cockroach"}
   196  				args = append(args, extraArgs...)
   197  				args = append(args, "--insecure")
   198  				args = append(args, fmt.Sprintf("--port={pgport:%d}", runNode))
   199  				buf, err := c.RunWithBuffer(ctx, c.l, c.Node(runNode), args...)
   200  				t.l.Printf("%s\n", buf)
   201  				return string(buf), err
   202  			}
   203  
   204  			{
   205  				// Make sure that running init again returns the expected error message and
   206  				// does not break the cluster. We have to use ExecCLI rather than OneShot in
   207  				// order to actually get the output from the command.
   208  				if output, err := execCLI(initNode, "init"); err == nil {
   209  					t.Fatalf("expected error running init command on initialized cluster\n%s", output)
   210  				} else if !strings.Contains(output, "cluster has already been initialized") {
   211  					t.Fatalf("unexpected output when running init command on initialized cluster: %v\n%s",
   212  						err, output)
   213  				}
   214  			}
   215  
   216  			// Once initialized, the queries we started earlier will finish.
   217  			deadline := time.After(10 * time.Second)
   218  			for i := 0; i < len(dbs); i++ {
   219  				select {
   220  				case err := <-errCh:
   221  					if err != nil {
   222  						t.Fatalf("querying node %d: %s", i, err)
   223  					}
   224  				case <-deadline:
   225  					t.Fatalf("timed out waiting for query %d", i)
   226  				}
   227  			}
   228  
   229  			// New queries will work too.
   230  			for i, db := range dbs {
   231  				var val int
   232  				if err := db.QueryRow("SELECT 1").Scan(&val); err != nil {
   233  					t.Fatalf("querying node %d: %s", i, err)
   234  				}
   235  			}
   236  		}()
   237  	}
   238  }