github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/cli/quit.go (about)

     1  // Copyright 2020 The Cockroach Authors.
     2  //
     3  // Use of this software is governed by the Business Source License
     4  // included in the file licenses/BSL.txt.
     5  //
     6  // As of the Change Date specified in that file, in accordance with
     7  // the Business Source License, use of this software will be governed
     8  // by the Apache License, Version 2.0, included in the file
     9  // licenses/APL.txt.
    10  
    11  package cli
    12  
    13  import (
    14  	"context"
    15  	"fmt"
    16  	"io"
    17  	"time"
    18  
    19  	"github.com/cockroachdb/cockroach/pkg/server"
    20  	"github.com/cockroachdb/cockroach/pkg/server/serverpb"
    21  	"github.com/cockroachdb/cockroach/pkg/util/contextutil"
    22  	"github.com/cockroachdb/cockroach/pkg/util/grpcutil"
    23  	"github.com/cockroachdb/cockroach/pkg/util/log"
    24  	"github.com/cockroachdb/errors"
    25  	"github.com/spf13/cobra"
    26  )
    27  
    28  // quitCmd command shuts down the node server.
    29  var quitCmd = &cobra.Command{
    30  	Use:   "quit",
    31  	Short: "drain and shut down a node\n",
    32  	Long: `
    33  Shut down the server. The first stage is drain, where the server
    34  stops accepting client connections, then stops extant
    35  connections, and finally pushes range leases onto other nodes,
    36  subject to various timeout parameters configurable via
    37  cluster settings. After the first stage completes,
    38  the server process is shut down.
    39  
    40  See also 'cockroach node drain' to drain a server
    41  without stopping the server process.
    42  `,
    43  	Args: cobra.NoArgs,
    44  	RunE: MaybeDecorateGRPCError(runQuit),
    45  }
    46  
    47  // runQuit accesses the quit shutdown path.
    48  func runQuit(cmd *cobra.Command, args []string) (err error) {
    49  	ctx, cancel := context.WithCancel(context.Background())
    50  	defer cancel()
    51  
    52  	// At the end, we'll report "ok" if there was no error.
    53  	defer func() {
    54  		if err == nil {
    55  			fmt.Println("ok")
    56  		}
    57  	}()
    58  
    59  	// Establish a RPC connection.
    60  	c, finish, err := getAdminClient(ctx, serverCfg)
    61  	if err != nil {
    62  		return err
    63  	}
    64  	defer finish()
    65  
    66  	return drainAndShutdown(ctx, c)
    67  }
    68  
    69  // drainAndShutdown attempts to drain the server and then shut it
    70  // down.
    71  func drainAndShutdown(ctx context.Context, c serverpb.AdminClient) (err error) {
    72  	hardError, remainingWork, err := doDrain(ctx, c)
    73  	if hardError {
    74  		return err
    75  	}
    76  
    77  	if remainingWork {
    78  		log.Warningf(ctx, "graceful shutdown may not have completed successfully; check the node's logs for details.")
    79  	}
    80  
    81  	if err != nil {
    82  		log.Warningf(ctx, "drain did not complete successfully; hard shutdown may cause disruption")
    83  	}
    84  	// We have already performed the drain above, so now go straight to
    85  	// shutdown. We try twice just in case there is a transient error.
    86  	hardErr, err := doShutdown(ctx, c)
    87  	if err != nil && !hardErr {
    88  		log.Warningf(ctx, "hard shutdown attempt failed, retrying: %v", err)
    89  		_, err = doShutdown(ctx, c)
    90  	}
    91  	return errors.Wrap(err, "hard shutdown failed")
    92  }
    93  
    94  // doDrain calls a graceful drain.
    95  //
    96  // If the function returns hardError true, then the caller should not
    97  // proceed with an alternate strategy (it's likely the server has gone
    98  // away).
    99  func doDrain(
   100  	ctx context.Context, c serverpb.AdminClient,
   101  ) (hardError, remainingWork bool, err error) {
   102  	// The next step is to drain. The timeout is configurable
   103  	// via --drain-wait.
   104  	if quitCtx.drainWait == 0 {
   105  		return doDrainNoTimeout(ctx, c)
   106  	}
   107  
   108  	err = contextutil.RunWithTimeout(ctx, "drain", quitCtx.drainWait, func(ctx context.Context) (err error) {
   109  		hardError, remainingWork, err = doDrainNoTimeout(ctx, c)
   110  		return err
   111  	})
   112  	if errors.HasType(err, (*contextutil.TimeoutError)(nil)) || grpcutil.IsTimeout(err) {
   113  		log.Infof(ctx, "drain timed out: %v", err)
   114  		err = errors.New("drain timeout")
   115  	}
   116  	return
   117  }
   118  
   119  func doDrainNoTimeout(
   120  	ctx context.Context, c serverpb.AdminClient,
   121  ) (hardError, remainingWork bool, err error) {
   122  	defer func() {
   123  		if server.IsWaitingForInit(err) {
   124  			log.Infof(ctx, "%v", err)
   125  			err = errors.New("node cannot be drained before it has been initialized")
   126  		}
   127  	}()
   128  
   129  	remainingWork = true
   130  	for {
   131  		// Tell the user we're starting to drain. This enables the user to
   132  		// mentally prepare for something to take some time, as opposed to
   133  		// wondering why nothing is happening.
   134  		fmt.Fprintf(stderr, "node is draining... ") // notice no final newline.
   135  
   136  		// Send a drain request with the drain bit set and the shutdown bit
   137  		// unset.
   138  		stream, err := c.Drain(ctx, &serverpb.DrainRequest{
   139  			DeprecatedProbeIndicator: server.DeprecatedDrainParameter,
   140  			DoDrain:                  true,
   141  		})
   142  		if err != nil {
   143  			fmt.Fprintf(stderr, "\n") // finish the line started above.
   144  			return !grpcutil.IsTimeout(err), remainingWork, errors.Wrap(err, "error sending drain request")
   145  		}
   146  		for {
   147  			resp, err := stream.Recv()
   148  			if err == io.EOF {
   149  				// Done.
   150  				break
   151  			}
   152  			if err != nil {
   153  				// Unexpected error.
   154  				fmt.Fprintf(stderr, "\n") // finish the line started above.
   155  				log.Infof(ctx, "graceful shutdown failed: %v", err)
   156  				return false, remainingWork, err
   157  			}
   158  
   159  			if resp.IsDraining {
   160  				// We want to assert that the node is quitting, and tell the
   161  				// story about how much work was performed in logs for
   162  				// debugging.
   163  				finalString := ""
   164  				if resp.DrainRemainingIndicator == 0 {
   165  					finalString = " (complete)"
   166  				}
   167  				// We use stderr so that 'cockroach quit''s stdout remains a
   168  				// simple 'ok' in case of success (for compatibility with
   169  				// scripts).
   170  				fmt.Fprintf(stderr, "remaining: %d%s\n",
   171  					resp.DrainRemainingIndicator, finalString)
   172  				remainingWork = resp.DrainRemainingIndicator > 0
   173  			} else {
   174  				// Either the server has decided it wanted to stop quitting; or
   175  				// we're running a pre-20.1 node which doesn't populate IsDraining.
   176  				// In either case, we need to stop sending drain requests.
   177  				remainingWork = false
   178  				fmt.Fprintf(stderr, "done\n")
   179  			}
   180  
   181  			if resp.DrainRemainingDescription != "" {
   182  				// Only show this information in the log; we'd use this for debugging.
   183  				// (This can be revealed e.g. via --logtostderr.)
   184  				log.Infof(ctx, "drain details: %s\n", resp.DrainRemainingDescription)
   185  			}
   186  
   187  			// Iterate until end of stream, which indicates the drain is
   188  			// complete.
   189  		}
   190  		if !remainingWork {
   191  			break
   192  		}
   193  		// Avoid a busy wait with high CPU/network usage if the server
   194  		// replies with an incomplete drain too quickly.
   195  		time.Sleep(200 * time.Millisecond)
   196  	}
   197  	return false, remainingWork, nil
   198  }
   199  
   200  // doShutdown attempts to trigger a server shutdown *without*
   201  // draining. Use doDrain() prior to perform a drain, or
   202  // drainAndShutdown() to combine both.
   203  func doShutdown(ctx context.Context, c serverpb.AdminClient) (hardError bool, err error) {
   204  	defer func() {
   205  		if server.IsWaitingForInit(err) {
   206  			log.Infof(ctx, "encountered error: %v", err)
   207  			err = errors.New("node cannot be shut down before it has been initialized")
   208  			err = errors.WithHint(err, "You can still stop the process using a service manager or a signal.")
   209  			hardError = true
   210  		}
   211  		if grpcutil.IsClosedConnection(err) {
   212  			// This most likely means that we shut down successfully. Note
   213  			// that sometimes the connection can be shut down even before a
   214  			// DrainResponse gets sent back to us, so we don't require a
   215  			// response on the stream (see #14184).
   216  			err = nil
   217  		}
   218  	}()
   219  
   220  	// We use a shorter timeout because a shutdown request has nothing
   221  	// else to do than shut down the node immediately.
   222  	err = contextutil.RunWithTimeout(ctx, "hard shutdown", 10*time.Second, func(ctx context.Context) error {
   223  		// Send a drain request with the drain bit unset (no drain).
   224  		// and the shutdown bit set.
   225  		stream, err := c.Drain(ctx, &serverpb.DrainRequest{Shutdown: true})
   226  		if err != nil {
   227  			return errors.Wrap(err, "error sending shutdown request")
   228  		}
   229  		for {
   230  			_, err := stream.Recv()
   231  			if err == io.EOF {
   232  				return nil
   233  			}
   234  			if err != nil {
   235  				return err
   236  			}
   237  		}
   238  	})
   239  	if !errors.HasType(err, (*contextutil.TimeoutError)(nil)) {
   240  		hardError = true
   241  	}
   242  	return hardError, err
   243  }
   244  
   245  // getAdminClient returns an AdminClient and a closure that must be invoked
   246  // to free associated resources.
   247  func getAdminClient(ctx context.Context, cfg server.Config) (serverpb.AdminClient, func(), error) {
   248  	conn, _, finish, err := getClientGRPCConn(ctx, cfg)
   249  	if err != nil {
   250  		return nil, nil, errors.Wrap(err, "Failed to connect to the node")
   251  	}
   252  	return serverpb.NewAdminClient(conn), finish, nil
   253  }