github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/server/drain.go

// Copyright 2020 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

package server

import (
	"context"
	"fmt"
	"os"
	"reflect"
	"strings"
	"time"

	"github.com/cockroachdb/cockroach/pkg/server/serverpb"
	"github.com/cockroachdb/cockroach/pkg/settings"
	"github.com/cockroachdb/cockroach/pkg/util/log"
	"github.com/cockroachdb/cockroach/pkg/util/syncutil"
	"github.com/cockroachdb/errors"
	"google.golang.org/grpc/codes"
	"google.golang.org/grpc/status"
)

var (
	// DeprecatedDrainParameter is the special value that must be
	// passed in DrainRequest.DeprecatedProbeIndicator to signal that the
	// drain request is not a probe.
	// This variable is also used in the v20.1 "quit" client
	// to provide a valid input to the request sent to
	// v19.1 nodes.
	//
	// TODO(knz): Remove this in v20.2 and whenever the "quit" command
	// is not meant to work with 19.x servers any more, whichever comes
	// later.
	DeprecatedDrainParameter = []int32{0, 1}

	queryWait = settings.RegisterPublicDurationSetting(
		"server.shutdown.query_wait",
		"the server will wait for at least this amount of time for active queries to finish",
		10*time.Second,
	)

	drainWait = settings.RegisterPublicDurationSetting(
		"server.shutdown.drain_wait",
		"the amount of time a server waits in an unready state before proceeding with the rest "+
			"of the shutdown process",
		0*time.Second,
	)
)
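
// approxClientDrainBudget is a hedged sketch and is not part of the original
// file: it illustrates how the two public cluster settings above feed the
// client-drain phase in drainClients below. The server first sleeps for
// drain_wait while failing readiness probes, then allows up to query_wait for
// SQL sessions and up to query_wait again for distributed flows. The helper
// name is an assumption and nothing in the package calls it.
func approxClientDrainBudget(sv *settings.Values) time.Duration {
	// Rough upper bound on the time drainClients spends before lease draining starts.
	return drainWait.Get(sv) + 2*queryWait.Get(sv)
}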

// Drain puts the node into the specified drain mode(s) and optionally
// instructs the process to terminate.
// This method is part of the serverpb.AdminServer interface.
func (s *adminServer) Drain(req *serverpb.DrainRequest, stream serverpb.Admin_DrainServer) error {
	ctx := stream.Context()
	ctx = s.server.AnnotateCtx(ctx)

	doDrain := req.DoDrain
	if len(req.DeprecatedProbeIndicator) > 0 {
		// Pre-20.1 behavior.
		// TODO(knz): Remove this condition in 20.2.
		doDrain = true
		if !reflect.DeepEqual(req.DeprecatedProbeIndicator, DeprecatedDrainParameter) {
			return status.Errorf(codes.InvalidArgument, "Invalid drain request parameter.")
		}
	}

	log.Infof(ctx, "drain request received with doDrain = %v, shutdown = %v", doDrain, req.Shutdown)

	res := serverpb.DrainResponse{}
	if doDrain {
		remaining, info, err := s.server.Drain(ctx)
		if err != nil {
			log.Errorf(ctx, "drain failed: %v", err)
			return err
		}
		res.DrainRemainingIndicator = remaining
		res.DrainRemainingDescription = info
	}
	if s.server.isDraining() {
		res.DeprecatedDrainStatus = DeprecatedDrainParameter
		res.IsDraining = true
	}

	if err := stream.Send(&res); err != nil {
		return err
	}

	if !req.Shutdown {
		if doDrain {
			// The condition "if doDrain" is because we don't need an info
			// message for just a probe.
			log.Infof(ctx, "drain request completed without server shutdown")
		}
		return nil
	}

	go func() {
		// TODO(tbg): why don't we stop the stopper first? Stopping the stopper
		// first seems more reasonable since grpc.Stop closes the listener right
		// away (and who knows whether gRPC-goroutines are tied up in some
		// stopper task somewhere).
		s.server.grpc.Stop()
		s.server.stopper.Stop(ctx)
	}()

	select {
	case <-s.server.stopper.IsStopped():
		return nil
	case <-ctx.Done():
		return ctx.Err()
	case <-time.After(10 * time.Second):
		// This is a hack to work around the problem in
		// https://github.com/cockroachdb/cockroach/issues/37425#issuecomment-494336131
		//
		// There appear to be deadlock scenarios in which we don't manage to
		// fully stop the grpc server (which implies closing the listener, i.e.
		// seeming dead to the outside world) or don't manage to shut down the
		// stopper (the evidence in #37425 is inconclusive which one it is).
		//
		// Other problems in this area are known, such as
		// https://github.com/cockroachdb/cockroach/pull/31692
		//
		// The signal-based shutdown path uses a similar time-based escape hatch.
		// Until we spend (potentially lots of time to) understand and fix this
		// issue, this will serve us well.
		os.Exit(1)
		return errors.New("unreachable")
	}
}
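
// exampleDrainAndShutdown is a hedged sketch and is not part of the original
// file: it illustrates how a client might drive the streaming Drain RPC served
// above, assuming an already constructed serverpb.AdminClient. The function
// name and the choice to both drain and shut down are assumptions made for
// illustration only.
func exampleDrainAndShutdown(ctx context.Context, c serverpb.AdminClient) error {
	// Ask the node to drain and then terminate its process.
	stream, err := c.Drain(ctx, &serverpb.DrainRequest{DoDrain: true, Shutdown: true})
	if err != nil {
		return err
	}
	// The server sends one response describing the remaining load before it
	// starts shutting down.
	resp, err := stream.Recv()
	if err != nil {
		return err
	}
	log.Infof(ctx, "drain remaining: %d, details: %s, draining: %v",
		resp.DrainRemainingIndicator, resp.DrainRemainingDescription, resp.IsDraining)
	return nil
}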

// Drain idempotently activates the draining mode.
// Note: new code should not be taught to use this method
// directly. Use the Drain() RPC instead with a suitably crafted
// DrainRequest.
//
// On failure, the system may be in a partially drained
// state; the client should either continue calling Drain() or shut
// down the server.
//
// Internally, a reporter callback is invoked for each packet of load shed
// away from the server during the drain; the aggregated counts are returned
// through remaining and info.
//
// TODO(knz): This method is currently exported for use by the
// shutdown code in cli/start.go; however, this is a mis-design. The
// start code should use the Drain() RPC like quit does.
func (s *Server) Drain(ctx context.Context) (remaining uint64, info string, err error) {
	reports := make(map[string]int)
	var mu syncutil.Mutex
	reporter := func(howMany int, what string) {
		if howMany > 0 {
			mu.Lock()
			reports[what] += howMany
			mu.Unlock()
		}
	}
	defer func() {
		// Detail the counts based on the collected reports.
		var descBuf strings.Builder
		comma := ""
		for what, howMany := range reports {
			remaining += uint64(howMany)
			fmt.Fprintf(&descBuf, "%s%s: %d", comma, what, howMany)
			comma = ", "
		}
		info = descBuf.String()
		log.Infof(ctx, "drain remaining: %d", remaining)
		if info != "" {
			log.Infof(ctx, "drain details: %s", info)
		}
	}()

	if err := s.doDrain(ctx, reporter); err != nil {
		return 0, "", err
	}

	return
}
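
// exampleReportShedLoad is a hedged sketch and is not part of the original
// file: it shows the calling convention expected of the subsystems that
// receive the reporter callback built in Drain above. Each call adds a count
// under a category name, and the totals surface in the remaining/info return
// values. The function name and the "example sessions" category are
// assumptions made for illustration only.
func exampleReportShedLoad(reporter func(int, string), closedSessions int) {
	if reporter != nil {
		// Only positive counts are recorded by the reporter built in Drain.
		reporter(closedSessions, "example sessions")
	}
}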

func (s *Server) doDrain(ctx context.Context, reporter func(int, string)) error {
	// First drain all clients and SQL leases.
	if err := s.drainClients(ctx, reporter); err != nil {
		return err
	}
	// Finally, mark the node as draining in liveness and drain the
	// range leases.
	return s.drainNode(ctx, reporter)
}

// isDraining returns true if either clients are being drained
// or one of the stores on the node is not accepting replicas.
func (s *Server) isDraining() bool {
	return s.sqlServer.pgServer.IsDraining() || s.node.IsDraining()
}

// drainClients starts draining the SQL layer.
func (s *Server) drainClients(ctx context.Context, reporter func(int, string)) error {
	// Mark the server as draining in a way that probes to
	// /health?ready=1 will notice.
	s.grpc.setMode(modeDraining)
	// Wait for drainWait. This will fail load balancer checks and
	// delay draining so that client traffic can move off this node.
	time.Sleep(drainWait.Get(&s.st.SV))

	// Disable incoming SQL clients up to the queryWait timeout.
	drainMaxWait := queryWait.Get(&s.st.SV)
	if err := s.sqlServer.pgServer.Drain(drainMaxWait, reporter); err != nil {
		return err
	}
	// Stop ongoing SQL execution up to the queryWait timeout.
	s.sqlServer.distSQLServer.Drain(ctx, drainMaxWait, reporter)

	// Drain the SQL leases. This must be done after the pgServer has
	// given sessions a chance to finish ongoing work.
	s.sqlServer.leaseMgr.SetDraining(true /* drain */, reporter)

	// Done. At this point the SQL leases have been drained above.
	return nil
}
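
// exampleIsNodeDraining is a hedged sketch and is not part of the original
// file: it shows the probe-only use of the Drain RPC (DoDrain=false and no
// deprecated probe indicator), which reports whether the node is draining
// without changing its state, complementing the /health?ready=1 readiness
// behavior set up in drainClients above. The function name is an assumption.
func exampleIsNodeDraining(ctx context.Context, c serverpb.AdminClient) (bool, error) {
	stream, err := c.Drain(ctx, &serverpb.DrainRequest{DoDrain: false})
	if err != nil {
		return false, err
	}
	resp, err := stream.Recv()
	if err != nil {
		return false, err
	}
	return resp.IsDraining, nil
}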

// drainNode initiates the draining mode for the node, which
// starts draining range leases.
func (s *Server) drainNode(ctx context.Context, reporter func(int, string)) error {
	s.nodeLiveness.SetDraining(ctx, true /* drain */, reporter)
	return s.node.SetDraining(true /* drain */, reporter)
}
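
// exampleGracefulShutdown is a hedged sketch and is not part of the original
// file: it illustrates how process-level shutdown code (such as the
// cli/start.go path mentioned in the TODO on Server.Drain above) might combine
// Drain with stopping the server's stopper. The method name and the exact
// sequencing are assumptions made for illustration, not the actual shutdown
// implementation.
func (s *Server) exampleGracefulShutdown(ctx context.Context) error {
	remaining, info, err := s.Drain(ctx)
	if err != nil {
		return err
	}
	log.Infof(ctx, "drain done; remaining: %d (%s); stopping the server", remaining, info)
	s.stopper.Stop(ctx)
	return nil
}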