github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/cmd/roachprod/install/cluster_synced.go

// Copyright 2018 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

package install

import (
	"bufio"
	"bytes"
	"context"
	"fmt"
	"io"
	"io/ioutil"
	"log"
	"math"
	"os"
	"os/exec"
	"os/signal"
	"path/filepath"
	"sort"
	"strings"
	"sync"
	"syscall"
	"text/template"
	"time"

	"github.com/cockroachdb/cockroach/pkg/cmd/roachprod/config"
	rperrors "github.com/cockroachdb/cockroach/pkg/cmd/roachprod/errors"
	"github.com/cockroachdb/cockroach/pkg/cmd/roachprod/ssh"
	"github.com/cockroachdb/cockroach/pkg/cmd/roachprod/ui"
	clog "github.com/cockroachdb/cockroach/pkg/util/log"
	"github.com/cockroachdb/cockroach/pkg/util/syncutil"
	"github.com/cockroachdb/cockroach/pkg/util/timeutil"
	"github.com/cockroachdb/errors"
	crdberrors "github.com/cockroachdb/errors"
	"golang.org/x/sync/errgroup"
)

// ClusterImpl TODO(peter): document
type ClusterImpl interface {
	Start(c *SyncedCluster, extraArgs []string)
	CertsDir(c *SyncedCluster, index int) string
	NodeDir(c *SyncedCluster, index int) string
	LogDir(c *SyncedCluster, index int) string
	NodeURL(c *SyncedCluster, host string, port int) string
	NodePort(c *SyncedCluster, index int) int
	NodeUIPort(c *SyncedCluster, index int) int
}

// A SyncedCluster is created from the information in the synced hosts file
// and is used as the target for installing and managing various software
// components.
//
// TODO(benesch): unify with CloudCluster.
type SyncedCluster struct {
	// name, vms, users, localities are populated at init time.
	Name       string
	VMs        []string
	Users      []string
	Localities []string
	VPCs       []string
	// all other fields are populated in newCluster.
	Nodes          []int
	Secure         bool
	Env            string
	Args           []string
	Tag            string
	Impl           ClusterImpl
	UseTreeDist    bool
	Quiet          bool
	MaxConcurrency int // used in Parallel
	// AuthorizedKeys is used by SetupSSH to add additional authorized keys.
	AuthorizedKeys []byte

	// Used to stash debug information.
	DebugDir string
}

// CmdKind is the kind of command passed to SyncedCluster.Run().
type CmdKind int

// The kinds of commands passed to SyncedCluster.Run().
const (
	// A cockroach command is passed.
	CockroachCmd CmdKind = iota

	// A non-classified command is passed.
	OtherCmd
)

func (ck CmdKind) classifyError(err error) rperrors.Error {
	if ck == CockroachCmd {
		return rperrors.ClassifyCockroachError(err)
	}

	return rperrors.ClassifyCmdError(err)
}

func (c *SyncedCluster) host(index int) string {
	return c.VMs[index-1]
}

func (c *SyncedCluster) user(index int) string {
	return c.Users[index-1]
}

func (c *SyncedCluster) locality(index int) string {
	return c.Localities[index-1]
}

// IsLocal TODO(peter): document
//
// TODO(tschottdorf): roachprod should cleanly encapsulate the home directory
// which is currently the biggest culprit for awkward one-offs.
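//
// A cluster named config.Local ("local") runs all of its nodes on the current
// machine under ${HOME}/local/<node>; the remote-only steps below (ssh
// sessions, scp, cert distribution) are short-circuited for it.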
func (c *SyncedCluster) IsLocal() bool {
	return c.Name == config.Local
}

// ServerNodes TODO(peter): document
func (c *SyncedCluster) ServerNodes() []int {
	return append([]int{}, c.Nodes...)
}

// GetInternalIP returns the internal IP address of the specified node.
func (c *SyncedCluster) GetInternalIP(index int) (string, error) {
	if c.IsLocal() {
		return c.host(index), nil
	}

	session, err := c.newSession(index)
	if err != nil {
		return "", errors.Wrapf(err, "GetInternalIP: failed dial %s:%d", c.Name, index)
	}
	defer session.Close()

	var stdout, stderr strings.Builder
	session.SetStdout(&stdout)
	session.SetStderr(&stderr)
	cmd := `hostname --all-ip-addresses`
	if err := session.Run(cmd); err != nil {
		return "", errors.Wrapf(err,
			"GetInternalIP: failed to execute hostname on %s:%d:\n(stdout) %s\n(stderr) %s",
			c.Name, index, stdout.String(), stderr.String())
	}
	ip := strings.TrimSpace(stdout.String())
	if ip == "" {
		return "", errors.Errorf(
			"empty internal IP returned, stdout:\n%s\nstderr:\n%s",
			stdout.String(), stderr.String(),
		)
	}
	return ip, nil
}

// Start TODO(peter): document
func (c *SyncedCluster) Start() {
	c.Impl.Start(c, c.Args)
}

func (c *SyncedCluster) newSession(i int) (session, error) {
	if c.IsLocal() {
		return newLocalSession(), nil
	}
	return newRemoteSession(c.user(i), c.host(i), c.DebugDir)
}

// Stop TODO(peter): document
func (c *SyncedCluster) Stop(sig int, wait bool) {
	display := fmt.Sprintf("%s: stopping", c.Name)
	if wait {
		display += " and waiting"
	}
	c.Parallel(display, len(c.Nodes), 0, func(i int) ([]byte, error) {
		sess, err := c.newSession(c.Nodes[i])
		if err != nil {
			return nil, err
		}
		defer sess.Close()

		var waitCmd string
		if wait {
			waitCmd = fmt.Sprintf(`
for pid in ${pids}; do
  echo "${pid}: checking" >> %[1]s/roachprod.log
  while kill -0 ${pid}; do
    kill -0 ${pid} >> %[1]s/roachprod.log 2>&1
    echo "${pid}: still alive [$?]" >> %[1]s/roachprod.log
    ps axeww -o pid -o command >> %[1]s/roachprod.log
    sleep 1
  done
  echo "${pid}: dead" >> %[1]s/roachprod.log
done
`, c.Impl.LogDir(c, c.Nodes[i]))
		}

		// NB: the awkward-looking `awk` invocation serves to avoid having the
		// awk process match its own output from `ps`.
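		//
		// The script below finds processes whose environment carries the
		// ROACHPROD=<node><tag> marker, sends them the requested signal, and,
		// when wait is set, appends waitCmd so we block until every matched
		// pid has actually exited.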
		cmd := fmt.Sprintf(`
mkdir -p logs
echo ">>> roachprod stop: $(date)" >> %[1]s/roachprod.log
ps axeww -o pid -o command >> %[1]s/roachprod.log
pids=$(ps axeww -o pid -o command | \
  sed 's/export ROACHPROD=//g' | \
  awk '/ROACHPROD=(%[2]d%[3]s)[ \/]/ { print $1 }')
if [ -n "${pids}" ]; then
  kill -%[4]d ${pids}
%[5]s
fi
`, c.Impl.LogDir(c, c.Nodes[i]), c.Nodes[i], c.escapedTag(), sig, waitCmd)
		return sess.CombinedOutput(cmd)
	})
}

// Wipe TODO(peter): document
func (c *SyncedCluster) Wipe(preserveCerts bool) {
	display := fmt.Sprintf("%s: wiping", c.Name)
	c.Stop(9, true /* wait */)
	c.Parallel(display, len(c.Nodes), 0, func(i int) ([]byte, error) {
		sess, err := c.newSession(c.Nodes[i])
		if err != nil {
			return nil, err
		}
		defer sess.Close()

		var cmd string
		if c.IsLocal() {
			// Not all shells like brace expansion, so we'll do it here
			dirs := []string{"data", "logs"}
			if !preserveCerts {
				dirs = append(dirs, "certs*")
			}
			for _, dir := range dirs {
				cmd += fmt.Sprintf(`rm -fr ${HOME}/local/%d/%s ;`, c.Nodes[i], dir)
			}
		} else {
			cmd = `sudo find /mnt/data* -maxdepth 1 -type f -exec rm -f {} \; &&
sudo rm -fr /mnt/data*/{auxiliary,local,tmp,cassandra,cockroach,cockroach-temp*,mongo-data} &&
sudo rm -fr logs &&
`
			if !preserveCerts {
				cmd += "sudo rm -fr certs* ;\n"
			}
		}
		return sess.CombinedOutput(cmd)
	})
}

// Status TODO(peter): document
func (c *SyncedCluster) Status() {
	display := fmt.Sprintf("%s: status", c.Name)
	results := make([]string, len(c.Nodes))
	c.Parallel(display, len(c.Nodes), 0, func(i int) ([]byte, error) {
		sess, err := c.newSession(c.Nodes[i])
		if err != nil {
			results[i] = err.Error()
			return nil, nil
		}
		defer sess.Close()

		binary := cockroachNodeBinary(c, c.Nodes[i])
		cmd := fmt.Sprintf(`out=$(ps axeww -o pid -o ucomm -o command | \
  sed 's/export ROACHPROD=//g' | \
  awk '/ROACHPROD=(%d%s)[ \/]/ {print $2, $1}'`,
			c.Nodes[i], c.escapedTag())
		cmd += ` | sort | uniq);
vers=$(` + binary + ` version 2>/dev/null | awk '/Build Tag:/ {print $NF}')
if [ -n "${out}" -a -n "${vers}" ]; then
  echo ${out} | sed "s/cockroach/cockroach-${vers}/g"
else
  echo ${out}
fi
`
		out, err := sess.CombinedOutput(cmd)
		var msg string
		if err != nil {
			return nil, errors.Wrapf(err, "~ %s\n%s", cmd, out)
		}
		msg = strings.TrimSpace(string(out))
		if msg == "" {
			msg = "not running"
		}
		results[i] = msg
		return nil, nil
	})

	for i, r := range results {
		fmt.Printf("  %2d: %s\n", c.Nodes[i], r)
	}
}

// NodeMonitorInfo is a message describing a cockroach process' status.
type NodeMonitorInfo struct {
	// The index of the node (in a SyncedCluster) at which the message originated.
	Index int
	// A message about the node. This is either a PID, "dead", "nc exited", or
	// "skipped".
	// Anything but a PID or "skipped" is an indication that there is some
	// problem with the node and that the process is not running.
	Msg string
	// Err is an error that may occur when trying to probe the status of the node.
	// If Err is non-nil, Msg is empty. After an error is returned, the node with
	// the given index will no longer be probed. Errors typically indicate networking
	// issues or nodes that have (physically) shut down.
	Err error
}

// Monitor writes NodeMonitorInfo for the cluster nodes to the returned channel.
// Infos sent to the channel always have the Index and exactly one of Msg or Err
// set.
//
// If oneShot is true, infos are retrieved only once for each node and the
// channel is subsequently closed; otherwise the process continues indefinitely
// (emitting new information as the status of the cockroach process changes).
//
// If ignoreEmptyNodes is true, nodes on which no CockroachDB data is found
// (in {store-dir}) will not be probed and a single message, "skipped", will
// be emitted for them.
func (c *SyncedCluster) Monitor(ignoreEmptyNodes bool, oneShot bool) chan NodeMonitorInfo {
	ch := make(chan NodeMonitorInfo)
	nodes := c.ServerNodes()
	var wg sync.WaitGroup

	for i := range nodes {
		wg.Add(1)
		go func(i int) {
			defer wg.Done()
			sess, err := c.newSession(nodes[i])
			if err != nil {
				ch <- NodeMonitorInfo{Index: nodes[i], Err: err}
				return
			}
			defer sess.Close()

			p, err := sess.StdoutPipe()
			if err != nil {
				ch <- NodeMonitorInfo{Index: nodes[i], Err: err}
				return
			}

			// On each monitored node, we loop looking for a cockroach process. In
			// order to avoid polling with lsof, if we find a live process we use nc
			// (netcat) to connect to the rpc port which will block until the server
			// either decides to kill the connection or the process is killed.
			// In one-shot we don't use nc and return after the first assessment
			// of the process' health.
			data := struct {
				OneShot     bool
				IgnoreEmpty bool
				Store       string
				Port        int
			}{
				OneShot:     oneShot,
				IgnoreEmpty: ignoreEmptyNodes,
				Store:       Cockroach{}.NodeDir(c, nodes[i]),
				Port:        Cockroach{}.NodePort(c, nodes[i]),
			}

			snippet := `
lastpid=0
{{ if .IgnoreEmpty}}
if [ ! -f "{{.Store}}/CURRENT" ]; then
  echo "skipped"
  exit 0
fi
{{- end}}
while :; do
  pid=$(lsof -i :{{.Port}} -sTCP:LISTEN | awk '!/COMMAND/ {print $2}')
  if [ "${pid}" != "${lastpid}" ]; then
    if [ -n "${lastpid}" -a -z "${pid}" ]; then
      echo dead
    fi
    lastpid=${pid}
    if [ -n "${pid}" ]; then
      echo ${pid}
    fi
  fi
  {{if .OneShot }}
  exit 0
  {{- end}}
  if [ -n "${lastpid}" ]; then
    while kill -0 "${lastpid}"; do
      sleep 1
    done
    echo "kill exited nonzero"
  else
    sleep 1
  fi
done
`

			t := template.Must(template.New("script").Parse(snippet))
			var buf bytes.Buffer
			if err := t.Execute(&buf, data); err != nil {
				ch <- NodeMonitorInfo{Index: nodes[i], Err: err}
				return
			}

			// Request a PTY so that the script will receive a SIGPIPE
			// when the session is closed.
			if err := sess.RequestPty(); err != nil {
				ch <- NodeMonitorInfo{Index: nodes[i], Err: err}
				return
			}
			// Give the session a valid stdin pipe so that nc won't exit immediately.
			// When nc does exit, we write to stdout, which has a side effect of
			// checking whether the stdout pipe has broken. This allows us to detect
			// when the roachprod process is killed.
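			// The stdin pipe is kept open (and closed via the defer below) so the
			// remote monitoring script stays attached to this roachprod process
			// for as long as the goroutine runs.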
			inPipe, err := sess.StdinPipe()
			if err != nil {
				ch <- NodeMonitorInfo{Index: nodes[i], Err: err}
				return
			}
			defer inPipe.Close()

			var readerWg sync.WaitGroup
			readerWg.Add(1)
			go func(p io.Reader) {
				defer readerWg.Done()
				r := bufio.NewReader(p)
				for {
					line, _, err := r.ReadLine()
					if err == io.EOF {
						return
					}
					ch <- NodeMonitorInfo{Index: nodes[i], Msg: string(line)}
				}
			}(p)

			if err := sess.Start(buf.String()); err != nil {
				ch <- NodeMonitorInfo{Index: nodes[i], Err: err}
				return
			}

			readerWg.Wait()
			// We must call `sess.Wait()` only after finishing reading from the stdout
			// pipe. Otherwise it can be closed under us, causing the reader to loop
			// infinitely receiving a non-`io.EOF` error.
			if err := sess.Wait(); err != nil {
				ch <- NodeMonitorInfo{Index: nodes[i], Err: err}
				return
			}
		}(i)
	}
	go func() {
		wg.Wait()
		close(ch)
	}()

	return ch
}

// Run a command on >= 1 node in the cluster.
//
// When running on just one node, the command output is streamed to stdout.
// When running on multiple nodes, the commands run in parallel, their output
// is cached and then emitted all together once all commands are completed.
//
// stdout: Where stdout messages are written
// stderr: Where stderr messages are written
// nodes: The cluster nodes where the command will be run.
// cmdKind: Which type of command is being run? This allows refined error reporting.
// title: A description of the command being run that is output to the logs.
// cmd: The command to run.
func (c *SyncedCluster) Run(
	stdout, stderr io.Writer, nodes []int, cmdKind CmdKind, title, cmd string,
) error {
	// Stream output if we're running the command on only 1 node.
	stream := len(nodes) == 1
	var display string
	if !stream {
		display = fmt.Sprintf("%s: %s", c.Name, title)
	}

	errors := make([]error, len(nodes))
	results := make([]string, len(nodes))
	c.Parallel(display, len(nodes), 0, func(i int) ([]byte, error) {
		sess, err := c.newSession(nodes[i])
		if err != nil {
			errors[i] = err
			results[i] = err.Error()
			return nil, nil
		}
		defer sess.Close()

		// Argument template expansion is node specific (e.g. for {store-dir}).
		e := expander{
			node: nodes[i],
		}
		expandedCmd, err := e.expand(c, cmd)
		if err != nil {
			return nil, err
		}

		// Be careful about changing these command strings. In particular, we need
		// to support running commands in the background on both local and remote
		// nodes. For example:
		//
		//   roachprod run cluster -- "sleep 60 &> /dev/null < /dev/null &"
		//
		// That command should return immediately. And a "roachprod status" should
		// reveal that the sleep command is running on the cluster.
		nodeCmd := fmt.Sprintf(`export ROACHPROD=%d%s GOTRACEBACK=crash && bash -c %s`,
			nodes[i], c.Tag, ssh.Escape1(expandedCmd))
		if c.IsLocal() {
			nodeCmd = fmt.Sprintf("cd ${HOME}/local/%d ; %s", nodes[i], nodeCmd)
		}

		if stream {
			sess.SetStdout(stdout)
			sess.SetStderr(stderr)
			errors[i] = sess.Run(nodeCmd)
			if errors[i] != nil {
				detailMsg := fmt.Sprintf("Node %d. Command with error:\n```\n%s\n```\n", nodes[i], cmd)
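				// Attach the failing command to the error and classify it so callers
				// can tell cockroach failures apart from generic command failures.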
				err = crdberrors.WithDetail(errors[i], detailMsg)
				err = cmdKind.classifyError(err)
				errors[i] = err
			}
			return nil, nil
		}

		out, err := sess.CombinedOutput(nodeCmd)
		msg := strings.TrimSpace(string(out))
		if err != nil {
			detailMsg := fmt.Sprintf("Node %d. Command with error:\n```\n%s\n```\n", nodes[i], cmd)
			err = crdberrors.WithDetail(err, detailMsg)
			err = cmdKind.classifyError(err)
			errors[i] = err
			msg += fmt.Sprintf("\n%v", err)
		}
		results[i] = msg
		return nil, nil
	})

	if !stream {
		for i, r := range results {
			fmt.Fprintf(stdout, "  %2d: %s\n", nodes[i], r)
		}
	}

	return rperrors.SelectPriorityError(errors)
}

// Wait TODO(peter): document
func (c *SyncedCluster) Wait() error {
	display := fmt.Sprintf("%s: waiting for nodes to start", c.Name)
	errs := make([]error, len(c.Nodes))
	c.Parallel(display, len(c.Nodes), 0, func(i int) ([]byte, error) {
		for j := 0; j < 600; j++ {
			sess, err := c.newSession(c.Nodes[i])
			if err != nil {
				time.Sleep(500 * time.Millisecond)
				continue
			}
			defer sess.Close()

			_, err = sess.CombinedOutput("test -e /mnt/data1/.roachprod-initialized")
			if err != nil {
				time.Sleep(500 * time.Millisecond)
				continue
			}
			return nil, nil
		}
		errs[i] = errors.New("timed out after 5m")
		return nil, nil
	})

	var foundErr bool
	for i, err := range errs {
		if err != nil {
			fmt.Printf("  %2d: %v\n", c.Nodes[i], err)
			foundErr = true
		}
	}
	if foundErr {
		return errors.New("not all nodes booted successfully")
	}
	return nil
}

// SetupSSH configures the cluster for use with SSH. This is generally run after
// the cloud.Cluster has been synced which resets the SSH credentials on the
// machines and sets them up for the current user. This method enables the
// hosts to talk to each other and optionally configures additional keys to be
// added to the hosts via the c.AuthorizedKeys field. It does so in the following
// steps:
//
//   1. Creates an ssh key pair on the first host to be used on all hosts if
//      none exists.
//   2. Distributes the public key, private key, and authorized_keys file from
//      the first host to the others.
//   3. Merges the data in c.AuthorizedKeys with the existing authorized_keys
//      files on all hosts.
//
// This call strives to be idempotent.
func (c *SyncedCluster) SetupSSH() error {
	if c.IsLocal() {
		return nil
	}

	if len(c.Nodes) == 0 || len(c.Users) == 0 || len(c.VMs) == 0 {
		return fmt.Errorf("%s: invalid cluster: nodes=%d users=%d hosts=%d",
			c.Name, len(c.Nodes), len(c.Users), len(c.VMs))
	}

	// Generate an ssh key that we'll distribute to all of the nodes in the
	// cluster in order to allow inter-node ssh.
	var sshTar []byte
	c.Parallel("generating ssh key", 1, 0, func(i int) ([]byte, error) {
		sess, err := c.newSession(1)
		if err != nil {
			return nil, err
		}
		defer sess.Close()

		// Create the ssh key and then tar up the public, private and
		// authorized_keys files and output them to stdout. We'll take this output
		// and pipe it back into tar on the other nodes in the cluster.
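		// ssh-keygen -N '' generates the key without a passphrase, and the
		// `test -f` guard keeps this step idempotent across reruns.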
		cmd := `
test -f .ssh/id_rsa || \
  (ssh-keygen -q -f .ssh/id_rsa -t rsa -N '' && \
   cat .ssh/id_rsa.pub >> .ssh/authorized_keys);
tar cf - .ssh/id_rsa .ssh/id_rsa.pub .ssh/authorized_keys
`

		var stdout bytes.Buffer
		var stderr bytes.Buffer
		sess.SetStdout(&stdout)
		sess.SetStderr(&stderr)

		if err := sess.Run(cmd); err != nil {
			return nil, errors.Wrapf(err, "%s: stderr:\n%s", cmd, stderr.String())
		}
		sshTar = stdout.Bytes()
		return nil, nil
	})

	// Skip the first node which is where we generated the key.
	nodes := c.Nodes[1:]
	c.Parallel("distributing ssh key", len(nodes), 0, func(i int) ([]byte, error) {
		sess, err := c.newSession(nodes[i])
		if err != nil {
			return nil, err
		}
		defer sess.Close()

		sess.SetStdin(bytes.NewReader(sshTar))
		cmd := `tar xf -`
		if out, err := sess.CombinedOutput(cmd); err != nil {
			return nil, errors.Wrapf(err, "%s: output:\n%s", cmd, out)
		}
		return nil, nil
	})

	// Populate the known_hosts file with both internal and external IPs of all
	// of the nodes in the cluster. Note that as a side effect, this creates the
	// known hosts file in unhashed format, working around a limitation of jsch
	// (which is used in jepsen tests).
	ips := make([]string, len(c.Nodes), len(c.Nodes)*2)
	c.Parallel("retrieving hosts", len(c.Nodes), 0, func(i int) ([]byte, error) {
		for j := 0; j < 20 && ips[i] == ""; j++ {
			var err error
			ips[i], err = c.GetInternalIP(c.Nodes[i])
			if err != nil {
				return nil, errors.Wrapf(err, "pgurls")
			}
			time.Sleep(time.Second)
		}
		if ips[i] == "" {
			return nil, fmt.Errorf("retrieved empty IP address")
		}
		return nil, nil
	})
	for _, i := range c.Nodes {
		ips = append(ips, c.host(i))
	}
	var knownHostsData []byte
	c.Parallel("scanning hosts", 1, 0, func(i int) ([]byte, error) {
		sess, err := c.newSession(c.Nodes[i])
		if err != nil {
			return nil, err
		}
		defer sess.Close()

		// ssh-keyscan may return fewer than the desired number of entries if the
		// remote nodes are not responding yet, so we loop until we have a scan that
		// found host keys for all of the IPs. Merge the newly scanned keys with the
		// existing list to make this process idempotent.
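		// The scan runs on the first node only; its merged output is captured in
		// knownHostsData and then distributed to every node's known_hosts below.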
		cmd := `
set -e
tmp="$(tempfile -d ~/.ssh -p 'roachprod' )"
on_exit() {
  rm -f "${tmp}"
}
trap on_exit EXIT
for i in {1..20}; do
  ssh-keyscan -T 60 -t rsa ` + strings.Join(ips, " ") + ` > "${tmp}"
  if [[ "$(wc < ${tmp} -l)" -eq "` + fmt.Sprint(len(ips)) + `" ]]; then
    [[ -f .ssh/known_hosts ]] && cat .ssh/known_hosts >> "${tmp}"
    sort -u < "${tmp}"
    exit 0
  fi
  sleep 1
done
exit 1
`
		var stdout bytes.Buffer
		var stderr bytes.Buffer
		sess.SetStdout(&stdout)
		sess.SetStderr(&stderr)
		if err := sess.Run(cmd); err != nil {
			return nil, errors.Wrapf(err, "%s: stderr:\n%s", cmd, stderr.String())
		}
		knownHostsData = stdout.Bytes()
		return nil, nil
	})
	c.Parallel("distributing known_hosts", len(c.Nodes), 0, func(i int) ([]byte, error) {
		sess, err := c.newSession(c.Nodes[i])
		if err != nil {
			return nil, err
		}
		defer sess.Close()

		sess.SetStdin(bytes.NewReader(knownHostsData))
		const cmd = `
known_hosts_data="$(cat)"
set -e
tmp="$(tempfile -p 'roachprod' -m 0644 )"
on_exit() {
  rm -f "${tmp}"
}
trap on_exit EXIT
echo "${known_hosts_data}" > "${tmp}"
cat "${tmp}" >> ~/.ssh/known_hosts
# If our bootstrapping user is not the shared user install all of the
# relevant ssh files from the bootstrapping user into the shared user's
# .ssh directory.
if [[ "$(whoami)" != "` + config.SharedUser + `" ]]; then
  # Ensure that the shared user has a .ssh directory
  sudo -u ` + config.SharedUser +
			` bash -c "mkdir -p ~` + config.SharedUser + `/.ssh"
  # This somewhat absurd incantation ensures that we properly shell quote
  # filenames so that they both aren't expanded and work even if the filenames
  # include spaces.
  sudo find ~/.ssh -type f -execdir bash -c 'install \
    --owner ` + config.SharedUser + ` \
    --group ` + config.SharedUser + ` \
    --mode $(stat -c "%a" '"'"'{}'"'"') \
    '"'"'{}'"'"' ~` + config.SharedUser + `/.ssh' \;
fi
`
		if out, err := sess.CombinedOutput(cmd); err != nil {
			return nil, errors.Wrapf(err, "%s: output:\n%s", cmd, out)
		}
		return nil, nil
	})
	if len(c.AuthorizedKeys) > 0 {
		// When clusters are created using cloud APIs they only have a subset of
		// desired keys installed on a subset of users. This code distributes
		// additional authorized_keys to both the current user (your username on
		// gce and the shared user on aws) as well as to the shared user on both
		// platforms.
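		//
		// The script below merges the keys with sort -u, so re-running SetupSSH
		// never duplicates authorized_keys entries.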
		c.Parallel("adding additional authorized keys", len(c.Nodes), 0, func(i int) ([]byte, error) {
			sess, err := c.newSession(c.Nodes[i])
			if err != nil {
				return nil, err
			}
			defer sess.Close()

			sess.SetStdin(bytes.NewReader(c.AuthorizedKeys))
			const cmd = `
keys_data="$(cat)"
set -e
tmp1="$(tempfile -d ~/.ssh -p 'roachprod' )"
tmp2="$(tempfile -d ~/.ssh -p 'roachprod' )"
on_exit() {
  rm -f "${tmp1}" "${tmp2}"
}
trap on_exit EXIT
if [[ -f ~/.ssh/authorized_keys ]]; then
  cat ~/.ssh/authorized_keys > "${tmp1}"
fi
echo "${keys_data}" >> "${tmp1}"
sort -u < "${tmp1}" > "${tmp2}"
install --mode 0600 "${tmp2}" ~/.ssh/authorized_keys
if [[ "$(whoami)" != "` + config.SharedUser + `" ]]; then
  sudo install --mode 0600 \
    --owner ` + config.SharedUser + `\
    --group ` + config.SharedUser + `\
    "${tmp2}" ~` + config.SharedUser + `/.ssh/authorized_keys
fi
`
			if out, err := sess.CombinedOutput(cmd); err != nil {
				return nil, errors.Wrapf(err, "~ %s\n%s", cmd, out)
			}
			return nil, nil
		})
	}

	return nil
}

// DistributeCerts will generate and distribute certificates to all of the
// nodes.
func (c *SyncedCluster) DistributeCerts() {
	dir := ""
	if c.IsLocal() {
		dir = `${HOME}/local/1`
	}

	// Check to see if the certs have already been initialized.
	var existsErr error
	display := fmt.Sprintf("%s: checking certs", c.Name)
	c.Parallel(display, 1, 0, func(i int) ([]byte, error) {
		sess, err := c.newSession(1)
		if err != nil {
			return nil, err
		}
		defer sess.Close()
		_, existsErr = sess.CombinedOutput(`test -e ` + filepath.Join(dir, `certs.tar`))
		return nil, nil
	})

	if existsErr == nil {
		return
	}

	// Gather the internal IP addresses for every node in the cluster; even if
	// an address won't be used to join the cluster itself, we still add it to
	// the node cert.
	var msg string
	display = fmt.Sprintf("%s: initializing certs", c.Name)
	nodes := allNodes(len(c.VMs))
	var ips []string
	if !c.IsLocal() {
		ips = make([]string, len(nodes))
		c.Parallel("", len(nodes), 0, func(i int) ([]byte, error) {
			var err error
			ips[i], err = c.GetInternalIP(nodes[i])
			return nil, errors.Wrapf(err, "IPs")
		})
	}

	// Generate the ca, client and node certificates on the first node.
	c.Parallel(display, 1, 0, func(i int) ([]byte, error) {
		sess, err := c.newSession(1)
		if err != nil {
			return nil, err
		}
		defer sess.Close()

		var nodeNames []string
		if c.IsLocal() {
			// For local clusters, we only need to add one of the VM IP addresses.
			nodeNames = append(nodeNames, "$(hostname)", c.VMs[0])
		} else {
			// Add both the local and external IP addresses, as well as the
			// hostnames to the node certificate.
			nodeNames = append(nodeNames, ips...)
			nodeNames = append(nodeNames, c.VMs...)
			for i := range c.VMs {
				nodeNames = append(nodeNames, fmt.Sprintf("%s-%04d", c.Name, i+1))
				// On AWS nodes internally have a DNS name in the form ip-<ip address>
				// where dots have been replaced with dashes.
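				// (e.g. the internal address 10.12.34.56 becomes ip-10-12-34-56).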
				// See https://docs.aws.amazon.com/vpc/latest/userguide/vpc-dns.html#vpc-dns-hostnames
				if strings.Contains(c.Localities[i], "cloud=aws") {
					nodeNames = append(nodeNames, "ip-"+strings.ReplaceAll(ips[i], ".", "-"))
				}
			}
		}

		var cmd string
		if c.IsLocal() {
			cmd = `cd ${HOME}/local/1 ; `
		}
		cmd += fmt.Sprintf(`
rm -fr certs
mkdir -p certs
%[1]s cert create-ca --certs-dir=certs --ca-key=certs/ca.key
%[1]s cert create-client root --certs-dir=certs --ca-key=certs/ca.key
%[1]s cert create-node localhost %[2]s --certs-dir=certs --ca-key=certs/ca.key
tar cvf certs.tar certs
`, cockroachNodeBinary(c, 1), strings.Join(nodeNames, " "))
		if out, err := sess.CombinedOutput(cmd); err != nil {
			msg = fmt.Sprintf("%s: %v", out, err)
		}
		return nil, nil
	})

	if msg != "" {
		fmt.Fprintln(os.Stderr, msg)
		os.Exit(1)
	}

	var tmpfileName string
	if c.IsLocal() {
		tmpfileName = os.ExpandEnv(filepath.Join(dir, "certs.tar"))
	} else {
		// Retrieve the certs.tar that was created on the first node.
		tmpfile, err := ioutil.TempFile("", "certs")
		if err != nil {
			fmt.Fprintln(os.Stderr, err)
			os.Exit(1)
		}
		_ = tmpfile.Close()
		defer func() {
			_ = os.Remove(tmpfile.Name()) // clean up
		}()

		if err := func() error {
			return c.scp(fmt.Sprintf("%s@%s:certs.tar", c.user(1), c.host(1)), tmpfile.Name())
		}(); err != nil {
			fmt.Fprintln(os.Stderr, err)
			os.Exit(1)
		}

		tmpfileName = tmpfile.Name()
	}

	// Read the certs.tar file we just downloaded. We'll be piping it to the
	// other nodes in the cluster.
	certsTar, err := ioutil.ReadFile(tmpfileName)
	if err != nil {
		fmt.Fprintln(os.Stderr, err)
		os.Exit(1)
	}

	// Skip the first node which is where we generated the certs.
	display = c.Name + ": distributing certs"
	nodes = nodes[1:]
	c.Parallel(display, len(nodes), 0, func(i int) ([]byte, error) {
		sess, err := c.newSession(nodes[i])
		if err != nil {
			return nil, err
		}
		defer sess.Close()

		sess.SetStdin(bytes.NewReader(certsTar))
		var cmd string
		if c.IsLocal() {
			cmd = fmt.Sprintf(`cd ${HOME}/local/%d ; `, nodes[i])
		}
		cmd += `tar xf -`
		if out, err := sess.CombinedOutput(cmd); err != nil {
			return nil, errors.Wrapf(err, "~ %s\n%s", cmd, out)
		}
		return nil, nil
	})
}

const progressDone = "=======================================>"
const progressTodo = "----------------------------------------"

func formatProgress(p float64) string {
	i := int(math.Ceil(float64(len(progressDone)) * (1 - p)))
	if i > len(progressDone) {
		i = len(progressDone)
	}
	if i < 0 {
		i = 0
	}
	return fmt.Sprintf("[%s%s] %.0f%%", progressDone[i:], progressTodo[:i], 100*p)
}

// Put TODO(peter): document
func (c *SyncedCluster) Put(src, dest string) {
	// NB: This value was determined with a few experiments. Higher values were
	// not tested.
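	//
	// In treedist mode, each node that finishes receiving the file becomes a
	// source for up to treeDistFanout further copies, forming a distribution
	// tree rooted at the local machine.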
	const treeDistFanout = 10

	var detail string
	if !c.IsLocal() {
		if c.UseTreeDist {
			detail = " (dist)"
		} else {
			detail = " (scp)"
		}
	}
	fmt.Printf("%s: putting%s %s %s\n", c.Name, detail, src, dest)

	type result struct {
		index int
		err   error
	}

	results := make(chan result, len(c.Nodes))
	lines := make([]string, len(c.Nodes))
	var linesMu syncutil.Mutex
	var wg sync.WaitGroup
	wg.Add(len(c.Nodes))

	// Each destination for the copy needs a source to copy from. We create a
	// channel that has capacity for each destination. If we try to add a source
	// and the channel is full we can simply drop that source as we know we won't
	// need to use it.
	sources := make(chan int, len(c.Nodes))
	pushSource := func(i int) {
		select {
		case sources <- i:
		default:
		}
	}

	if c.UseTreeDist {
		// In treedist mode, only add the local source initially.
		pushSource(-1)
	} else {
		// In non-treedist mode, add the local source N times (once for each
		// destination).
		for range c.Nodes {
			pushSource(-1)
		}
	}

	mkpath := func(i int, dest string) (string, error) {
		if i == -1 {
			return src, nil
		}
		// Expand the destination to allow, for example, putting directly
		// into {store-dir}.
		e := expander{
			node: c.Nodes[i],
		}
		dest, err := e.expand(c, dest)
		if err != nil {
			return "", err
		}
		return fmt.Sprintf("%s@%s:%s", c.user(c.Nodes[i]), c.host(c.Nodes[i]), dest), nil
	}

	for i := range c.Nodes {
		go func(i int, dest string) {
			defer wg.Done()

			if c.IsLocal() {
				// Expand the destination to allow, for example, putting directly
				// into {store-dir}.
				e := expander{
					node: c.Nodes[i],
				}
				var err error
				dest, err = e.expand(c, dest)
				if err != nil {
					results <- result{i, err}
					return
				}
				if _, err := os.Stat(src); err != nil {
					results <- result{i, err}
					return
				}
				from, err := filepath.Abs(src)
				if err != nil {
					results <- result{i, err}
					return
				}
				// TODO(jlinder): this does not take into account things like
				//   roachprod put local:1 /some/file.txt /some/dir
				// and will replace 'dir' with the contents of file.txt, instead
				// of creating /some/dir/file.txt.
				var to string
				if filepath.IsAbs(dest) {
					to = dest
				} else {
					to = fmt.Sprintf(os.ExpandEnv("${HOME}/local/%d/%s"), c.Nodes[i], dest)
				}
				// Remove the destination if it exists, ignoring errors which we'll
				// handle via the os.Symlink() call.
				_ = os.Remove(to)
				results <- result{i, os.Symlink(from, to)}
				return
			}

			// Determine the source to copy from.
			//
			// TODO(peter): Take the cluster topology into account. We should
			// preferentially use a source in the same region and only perform a
			// single copy between regions. We have the region information and
			// achieving this approach is likely a generalization of the current
			// code.
			srcIndex := <-sources
			from, err := mkpath(srcIndex, dest)
			if err != nil {
				results <- result{i, err}
				return
			}
			// TODO(peter): For remote-to-remote copies, should the destination use
			// the internal IP address? The external address works, but it might be
			// slower.
			to, err := mkpath(i, dest)
			if err != nil {
				results <- result{i, err}
				return
			}

			err = c.scp(from, to)
			results <- result{i, err}

			if err != nil {
				// The copy failed. Re-add the original source.
				pushSource(srcIndex)
			} else {
				// The copy succeeded. Re-add the original source if it is remote.
				if srcIndex != -1 {
					pushSource(srcIndex)
				}
				// Add fanout number of new sources for the destination.
				for j := 0; j < treeDistFanout; j++ {
					pushSource(i)
				}
			}
		}(i, dest)
	}

	go func() {
		wg.Wait()
		close(results)
	}()

	var writer ui.Writer
	var ticker *time.Ticker
	if !c.Quiet {
		ticker = time.NewTicker(100 * time.Millisecond)
	} else {
		ticker = time.NewTicker(1000 * time.Millisecond)
	}
	defer ticker.Stop()
	haveErr := false

	var spinner = []string{"|", "/", "-", "\\"}
	spinnerIdx := 0

	for done := false; !done; {
		select {
		case <-ticker.C:
			if c.Quiet {
				fmt.Printf(".")
			}
		case r, ok := <-results:
			done = !ok
			if ok {
				linesMu.Lock()
				if r.err != nil {
					haveErr = true
					lines[r.index] = r.err.Error()
				} else {
					lines[r.index] = "done"
				}
				linesMu.Unlock()
			}
		}
		if !c.Quiet {
			linesMu.Lock()
			for i := range lines {
				fmt.Fprintf(&writer, "  %2d: ", c.Nodes[i])
				if lines[i] != "" {
					fmt.Fprintf(&writer, "%s", lines[i])
				} else {
					fmt.Fprintf(&writer, "%s", spinner[spinnerIdx%len(spinner)])
				}
				fmt.Fprintf(&writer, "\n")
			}
			linesMu.Unlock()
			_ = writer.Flush(os.Stdout)
			spinnerIdx++
		}
	}

	if c.Quiet {
		fmt.Printf("\n")
		linesMu.Lock()
		for i := range lines {
			fmt.Printf("  %2d: %s\n", c.Nodes[i], lines[i])
		}
		linesMu.Unlock()
	}

	if haveErr {
		log.Fatalf("put %s failed", src)
	}
}

// Logs will sync the logs from c to dest with each node's logs under dest in
// directories per node and stream the merged logs to out.
// For example, if dest is "tpcc-test.logs" then the logs for each node will be
// stored like:
//
//  tpcc-test.logs/1.logs/...
//  tpcc-test.logs/2.logs/...
//  ...
//
// Log file syncing uses rsync which attempts to be efficient when deciding
// which files to update. The logs are merged by calling
// `cockroach debug merge-logs <dest>/*/*` with the optional flag for filter.
// The syncing and merging happens in a loop which pauses <interval> between
// iterations and takes some care with the from/to flags in merge-logs to make
// new logs appear to be streamed. If <from> is zero streaming begins from now.
// If to is non-zero, when the stream of logs passes to, the function returns.
// <user> allows retrieval of logs from a roachprod cluster being run by another
// user and assumes that the current user used to create c has the ability to
// sudo into <user>.
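//
// Logs returns when the signal-driven context is canceled, when the stream
// reaches <to>, or on the first rsync/merge error.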
func (c *SyncedCluster) Logs(
	src, dest, user, filter, programFilter string,
	interval time.Duration,
	from, to time.Time,
	out io.Writer,
) error {
	rsyncNodeLogs := func(ctx context.Context, idx int) error {
		base := fmt.Sprintf("%d.logs", c.Nodes[idx-1])
		local := filepath.Join(dest, base) + "/"
		sshUser := c.user(c.Nodes[idx-1])
		rsyncArgs := []string{"-az", "--size-only"}
		var remote string
		if c.IsLocal() {
			// This here is a bit of a hack to guess that the parent of the log dir is
			// the "home" for the local node and that the srcBase is relative to that.
			localHome := filepath.Dir(c.Impl.LogDir(c, idx))
			remote = filepath.Join(localHome, src) + "/"
		} else {
			logDir := src
			if !filepath.IsAbs(logDir) && user != "" && user != sshUser {
				logDir = "~" + user + "/" + logDir
			}
			remote = fmt.Sprintf("%s@%s:%s/", c.user(c.Nodes[idx-1]),
				c.host(c.Nodes[idx-1]), logDir)
			// Use control master to mitigate SSH connection setup cost.
			rsyncArgs = append(rsyncArgs, "--rsh", "ssh "+
				"-o StrictHostKeyChecking=no "+
				"-o ControlMaster=auto "+
				"-o ControlPath=~/.ssh/%r@%h:%p "+
				"-o UserKnownHostsFile=/dev/null "+
				"-o ControlPersist=2m")
			// Use rsync-path flag to sudo into user if different from sshUser.
			if user != "" && user != sshUser {
				rsyncArgs = append(rsyncArgs, "--rsync-path",
					fmt.Sprintf("sudo -u %s rsync", user))
			}
		}
		rsyncArgs = append(rsyncArgs, remote, local)
		cmd := exec.CommandContext(ctx, "rsync", rsyncArgs...)
		var stderrBuf bytes.Buffer
		cmd.Stdout = os.Stdout
		cmd.Stderr = &stderrBuf
		if err := cmd.Run(); err != nil {
			if ctx.Err() != nil {
				return nil
			}
			return errors.Errorf("failed to rsync from %v to %v: %v\n%s",
				src, dest, err, stderrBuf.String())
		}
		return nil
	}
	rsyncLogs := func(ctx context.Context) error {
		g, gctx := errgroup.WithContext(ctx)
		for i := range c.Nodes {
			idx := c.Nodes[i]
			g.Go(func() error {
				return rsyncNodeLogs(gctx, idx)
			})
		}
		return g.Wait()
	}
	mergeLogs := func(ctx context.Context, prev, t time.Time) error {
		cmd := exec.CommandContext(ctx, "cockroach", "debug", "merge-logs",
			dest+"/*/*",
			"--from", prev.Format(time.RFC3339),
			"--to", t.Format(time.RFC3339))
		if filter != "" {
			cmd.Args = append(cmd.Args, "--filter", filter)
		}
		if programFilter != "" {
			cmd.Args = append(cmd.Args, "--program-filter", programFilter)
		}
		// For local clusters capture the cluster ID from the sync path because the
		// host information is useless.
		if c.IsLocal() {
			cmd.Args = append(cmd.Args,
				"--file-pattern", "^(?:.*/)?(?P<id>[0-9]+).*/"+clog.FileNamePattern+"$",
				"--prefix", "${id}> ")
		}
		cmd.Stdout = out
		var errBuf bytes.Buffer
		cmd.Stderr = &errBuf
		if err := cmd.Run(); err != nil && ctx.Err() == nil {
			return fmt.Errorf("failed to run cockroach debug merge-logs:%v\n%v",
				err, errBuf.String())
		}
		return nil
	}

	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()
	if err := os.MkdirAll(dest, 0755); err != nil {
		return errors.Errorf("failed to create destination directory: %v", err)
	}
	// Cancel context upon signaling.
	ch := make(chan os.Signal, 1)
	signal.Notify(ch, syscall.SIGINT, syscall.SIGTERM, syscall.SIGQUIT)
	defer func() { signal.Stop(ch); close(ch) }()
	go func() { <-ch; cancel() }()
	// TODO(ajwerner): consider SIGHUP-ing cockroach before the rsync to avoid the delays
	prev := from
	if prev.IsZero() {
		prev = timeutil.Now().Add(-2 * time.Second).Truncate(time.Microsecond)
	}
	for to.IsZero() || prev.Before(to) {
		// Subtract ~1 second to deal with the flush delay in util/log.
		t := timeutil.Now().Add(-1100 * time.Millisecond).Truncate(time.Microsecond)
		if err := rsyncLogs(ctx); err != nil {
			return errors.Errorf("failed to sync logs: %v", err)
		}
		if !to.IsZero() && t.After(to) {
			t = to
		}
		if err := mergeLogs(ctx, prev, t); err != nil {
			return err
		}
		prev = t
		if !to.IsZero() && !prev.Before(to) {
			return nil
		}
		select {
		case <-time.After(interval):
		case <-ctx.Done():
			return nil
		}
	}
	return nil
}

// Get TODO(peter): document
func (c *SyncedCluster) Get(src, dest string) {
	// TODO(peter): Only get 10 nodes at a time. When a node completes, output a
	// line indicating that.
	var detail string
	if !c.IsLocal() {
		detail = " (scp)"
	}
	fmt.Printf("%s: getting%s %s %s\n", c.Name, detail, src, dest)

	type result struct {
		index int
		err   error
	}

	var writer ui.Writer
	results := make(chan result, len(c.Nodes))
	lines := make([]string, len(c.Nodes))
	var linesMu syncutil.Mutex

	var wg sync.WaitGroup
	for i := range c.Nodes {
		wg.Add(1)
		go func(i int) {
			defer wg.Done()

			src := src
			dest := dest
			if len(c.Nodes) > 1 {
				base := fmt.Sprintf("%d.%s", c.Nodes[i], filepath.Base(dest))
				dest = filepath.Join(filepath.Dir(dest), base)
			}

			progress := func(p float64) {
				linesMu.Lock()
				defer linesMu.Unlock()
				lines[i] = formatProgress(p)
			}

			if c.IsLocal() {
				if !filepath.IsAbs(src) {
					src = filepath.Join(fmt.Sprintf(os.ExpandEnv("${HOME}/local/%d"), c.Nodes[i]), src)
				}

				var copy func(src, dest string, info os.FileInfo) error
				copy = func(src, dest string, info os.FileInfo) error {
					// Make sure the destination file is world readable.
					// See:
					// https://github.com/cockroachdb/cockroach/issues/44843
					mode := info.Mode() | 0444
					if info.IsDir() {
						if err := os.MkdirAll(dest, mode); err != nil {
							return err
						}

						infos, err := ioutil.ReadDir(src)
						if err != nil {
							return err
						}

						for _, info := range infos {
							if err := copy(
								filepath.Join(src, info.Name()),
								filepath.Join(dest, info.Name()),
								info,
							); err != nil {
								return err
							}
						}
						return nil
					}

					if !mode.IsRegular() {
						return nil
					}

					out, err := os.Create(dest)
					if err != nil {
						return err
					}
					defer out.Close()

					if err := os.Chmod(out.Name(), mode); err != nil {
						return err
					}

					in, err := os.Open(src)
					if err != nil {
						return err
					}
					defer in.Close()

					p := &ssh.ProgressWriter{
						Writer:   out,
						Done:     0,
						Total:    info.Size(),
						Progress: progress,
					}
					_, err = io.Copy(p, in)
					return err
				}

				info, err := os.Stat(src)
				if err != nil {
					results <- result{i, err}
					return
				}
				err = copy(src, dest, info)
				results <- result{i, err}
				return
			}

			err := c.scp(fmt.Sprintf("%s@%s:%s", c.user(c.Nodes[0]), c.host(c.Nodes[i]), src), dest)
			if err == nil {
				// Make sure all created files and directories are world readable.
				// The CRDB process intentionally sets a 0007 umask (resulting in
				// non-world-readable files). This creates annoyances during CI
				// that we circumvent wholesale by adding o+r back here.
				// See:
				//
				// https://github.com/cockroachdb/cockroach/issues/44843
				chmod := func(path string, info os.FileInfo, err error) error {
					if err != nil {
						return err
					}
					const oRead = 0004
					if mode := info.Mode(); mode&oRead == 0 {
						if err := os.Chmod(path, mode|oRead); err != nil {
							return err
						}
					}
					return nil
				}
				err = filepath.Walk(dest, chmod)
			}

			results <- result{i, err}
		}(i)
	}

	go func() {
		wg.Wait()
		close(results)
	}()

	var ticker *time.Ticker
	if !c.Quiet {
		ticker = time.NewTicker(100 * time.Millisecond)
	} else {
		ticker = time.NewTicker(1000 * time.Millisecond)
	}
	defer ticker.Stop()
	haveErr := false

	var spinner = []string{"|", "/", "-", "\\"}
	spinnerIdx := 0

	for done := false; !done; {
		select {
		case <-ticker.C:
			if c.Quiet {
				fmt.Printf(".")
			}
		case r, ok := <-results:
			done = !ok
			if ok {
				linesMu.Lock()
				if r.err != nil {
					haveErr = true
					lines[r.index] = r.err.Error()
				} else {
					lines[r.index] = "done"
				}
				linesMu.Unlock()
			}
		}
		if !c.Quiet {
			linesMu.Lock()
			for i := range lines {
				fmt.Fprintf(&writer, "  %2d: ", c.Nodes[i])
				if lines[i] != "" {
					fmt.Fprintf(&writer, "%s", lines[i])
				} else {
					fmt.Fprintf(&writer, "%s", spinner[spinnerIdx%len(spinner)])
				}
				fmt.Fprintf(&writer, "\n")
			}
			linesMu.Unlock()
			_ = writer.Flush(os.Stdout)
			spinnerIdx++
		}
	}

	if c.Quiet {
		fmt.Printf("\n")
		linesMu.Lock()
		for i := range lines {
			fmt.Printf("  %2d: %s\n", c.Nodes[i], lines[i])
		}
		linesMu.Unlock()
	}

	if haveErr {
		log.Fatalf("get %s failed", src)
	}
}

func (c *SyncedCluster) pgurls(nodes []int) map[int]string {
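	// Resolve each node's internal IP and build a connection URL keyed by node
	// index.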
	hosts := c.pghosts(nodes)
	m := make(map[int]string, len(hosts))
	for node, host := range hosts {
		m[node] = c.Impl.NodeURL(c, host, c.Impl.NodePort(c, node))
	}
	return m
}

func (c *SyncedCluster) pghosts(nodes []int) map[int]string {
	ips := make([]string, len(nodes))
	c.Parallel("", len(nodes), 0, func(i int) ([]byte, error) {
		var err error
		ips[i], err = c.GetInternalIP(nodes[i])
		return nil, errors.Wrapf(err, "pghosts")
	})

	m := make(map[int]string, len(ips))
	for i, ip := range ips {
		m[nodes[i]] = ip
	}
	return m
}

// SSH TODO(peter): document
func (c *SyncedCluster) SSH(sshArgs, args []string) error {
	if len(c.Nodes) != 1 && len(args) == 0 {
		// If trying to ssh to more than 1 node and the ssh session is interactive,
		// try sshing with an iTerm2 split screen configuration.
		sshed, err := maybeSplitScreenSSHITerm2(c)
		if sshed {
			return err
		}
	}

	// Perform template expansion on the arguments.
	e := expander{
		node: c.Nodes[0],
	}
	var expandedArgs []string
	for _, arg := range args {
		expandedArg, err := e.expand(c, arg)
		if err != nil {
			return err
		}
		expandedArgs = append(expandedArgs, strings.Split(expandedArg, " ")...)
	}

	var allArgs []string
	if c.IsLocal() {
		allArgs = []string{
			"/bin/bash", "-c",
		}
		cmd := fmt.Sprintf("cd ${HOME}/local/%d ; ", c.Nodes[0])
		if len(args) == 0 /* interactive */ {
			cmd += "/bin/bash "
		}
		if len(args) > 0 {
			cmd += fmt.Sprintf("export ROACHPROD=%d%s ; ", c.Nodes[0], c.Tag)
			cmd += strings.Join(expandedArgs, " ")
		}
		allArgs = append(allArgs, cmd)
	} else {
		allArgs = []string{
			"ssh",
			fmt.Sprintf("%s@%s", c.user(c.Nodes[0]), c.host(c.Nodes[0])),
			"-o", "UserKnownHostsFile=/dev/null",
			"-o", "StrictHostKeyChecking=no",
		}
		allArgs = append(allArgs, sshAuthArgs()...)
		allArgs = append(allArgs, sshArgs...)
		if len(args) > 0 {
			allArgs = append(allArgs, fmt.Sprintf("export ROACHPROD=%d%s ;", c.Nodes[0], c.Tag))
		}
		allArgs = append(allArgs, expandedArgs...)
	}

	sshPath, err := exec.LookPath(allArgs[0])
	if err != nil {
		return err
	}
	return syscall.Exec(sshPath, allArgs, os.Environ())
}

func (c *SyncedCluster) scp(src, dest string) error {
	args := []string{
		"scp", "-r", "-C",
		"-o", "StrictHostKeyChecking=no",
	}
	args = append(args, sshAuthArgs()...)
	args = append(args, src, dest)
	cmd := exec.Command(args[0], args[1:]...)
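	// Capture combined stdout/stderr so a failed copy can be reported together
	// with the exact scp invocation.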
	out, err := cmd.CombinedOutput()
	if err != nil {
		return errors.Wrapf(err, "~ %s\n%s", strings.Join(args, " "), out)
	}
	return nil
}

// Parallel TODO(peter): document
func (c *SyncedCluster) Parallel(
	display string, count, concurrency int, fn func(i int) ([]byte, error),
) {
	if concurrency == 0 || concurrency > count {
		concurrency = count
	}
	if c.MaxConcurrency > 0 && concurrency > c.MaxConcurrency {
		concurrency = c.MaxConcurrency
	}
	type result struct {
		index int
		out   []byte
		err   error
	}

	results := make(chan result, count)
	var wg sync.WaitGroup
	wg.Add(count)

	var index int
	startNext := func() {
		go func(i int) {
			defer wg.Done()
			out, err := fn(i)
			results <- result{i, out, err}
		}(index)
		index++
	}

	for index < concurrency {
		startNext()
	}

	go func() {
		wg.Wait()
		close(results)
	}()

	var writer ui.Writer
	out := io.Writer(os.Stdout)
	if display == "" {
		out = ioutil.Discard
	}

	var ticker *time.Ticker
	if !c.Quiet {
		ticker = time.NewTicker(100 * time.Millisecond)
	} else {
		ticker = time.NewTicker(1000 * time.Millisecond)
		fmt.Fprintf(out, "%s", display)
	}

	defer ticker.Stop()
	complete := make([]bool, count)
	var failed []result

	var spinner = []string{"|", "/", "-", "\\"}
	spinnerIdx := 0

	for done := false; !done; {
		select {
		case <-ticker.C:
			if c.Quiet {
				fmt.Fprintf(out, ".")
			}
		case r, ok := <-results:
			if r.err != nil {
				failed = append(failed, r)
			}
			done = !ok
			if ok {
				complete[r.index] = true
			}
			if index < count {
				startNext()
			}
		}

		if !c.Quiet {
			fmt.Fprint(&writer, display)
			var n int
			for i := range complete {
				if complete[i] {
					n++
				}
			}
			fmt.Fprintf(&writer, " %d/%d", n, len(complete))
			if !done {
				fmt.Fprintf(&writer, " %s", spinner[spinnerIdx%len(spinner)])
			}
			fmt.Fprintf(&writer, "\n")
			_ = writer.Flush(out)
			spinnerIdx++
		}
	}

	if c.Quiet {
		fmt.Fprintf(out, "\n")
	}

	if len(failed) > 0 {
		sort.Slice(failed, func(i, j int) bool { return failed[i].index < failed[j].index })
		for _, f := range failed {
			fmt.Fprintf(os.Stderr, "%d: %+v: %s\n", f.index, f.err, f.out)
		}
		log.Fatal("command failed")
	}
}

func (c *SyncedCluster) escapedTag() string {
	return strings.Replace(c.Tag, "/", "\\/", -1)
}