github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/server/drain.go

// Copyright 2020 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

package server

import (
	"context"
	"fmt"
	"os"
	"reflect"
	"strings"
	"time"

	"github.com/cockroachdb/cockroach/pkg/server/serverpb"
	"github.com/cockroachdb/cockroach/pkg/settings"
	"github.com/cockroachdb/cockroach/pkg/util/log"
	"github.com/cockroachdb/cockroach/pkg/util/syncutil"
	"github.com/cockroachdb/errors"
	"google.golang.org/grpc/codes"
	"google.golang.org/grpc/status"
)

var (
	// DeprecatedDrainParameter is the special value that must be
	// passed in DrainRequest.DeprecatedProbeIndicator to signal that the
	// drain request is not a probe.
	// This variable is also used in the v20.1 "quit" client
	// to provide a valid input to the request sent to
	// v19.1 nodes.
	//
	// TODO(knz): Remove this in v20.2 and whenever the "quit" command
	// is not meant to work with 19.x servers any more, whichever comes
	// later.
	DeprecatedDrainParameter = []int32{0, 1}

	queryWait = settings.RegisterPublicDurationSetting(
		"server.shutdown.query_wait",
		"the server will wait for at least this amount of time for active queries to finish",
		10*time.Second,
	)

	drainWait = settings.RegisterPublicDurationSetting(
		"server.shutdown.drain_wait",
		"the amount of time a server waits in an unready state before proceeding with the rest "+
			"of the shutdown process",
		0*time.Second,
	)
)

// Drain puts the node into the specified drain mode(s) and optionally
// instructs the process to terminate.
// This method is part of the serverpb.AdminServer interface.
func (s *adminServer) Drain(req *serverpb.DrainRequest, stream serverpb.Admin_DrainServer) error {
	ctx := stream.Context()
	ctx = s.server.AnnotateCtx(ctx)

	doDrain := req.DoDrain
	if len(req.DeprecatedProbeIndicator) > 0 {
		// Pre-20.1 behavior.
		// TODO(knz): Remove this condition in 20.2.
		doDrain = true
		if !reflect.DeepEqual(req.DeprecatedProbeIndicator, DeprecatedDrainParameter) {
			return status.Errorf(codes.InvalidArgument, "Invalid drain request parameter.")
		}
	}

	log.Infof(ctx, "drain request received with doDrain = %v, shutdown = %v", doDrain, req.Shutdown)

	res := serverpb.DrainResponse{}
	if doDrain {
		remaining, info, err := s.server.Drain(ctx)
		if err != nil {
			log.Errorf(ctx, "drain failed: %v", err)
			return err
		}
		res.DrainRemainingIndicator = remaining
		res.DrainRemainingDescription = info
	}
	if s.server.isDraining() {
		res.DeprecatedDrainStatus = DeprecatedDrainParameter
		res.IsDraining = true
	}

	if err := stream.Send(&res); err != nil {
		return err
	}

	if !req.Shutdown {
		if doDrain {
			// The condition "if doDrain" is because we don't need an info
			// message for just a probe.
			log.Infof(ctx, "drain request completed without server shutdown")
		}
		return nil
	}

	go func() {
		// TODO(tbg): why don't we stop the stopper first? Stopping the stopper
		// first seems more reasonable since grpc.Stop closes the listener right
		// away (and who knows whether gRPC-goroutines are tied up in some
		// stopper task somewhere).
		s.server.grpc.Stop()
		s.server.stopper.Stop(ctx)
	}()

	select {
	case <-s.server.stopper.IsStopped():
		return nil
	case <-ctx.Done():
		return ctx.Err()
	case <-time.After(10 * time.Second):
		// This is a hack to work around the problem in
		// https://github.com/cockroachdb/cockroach/issues/37425#issuecomment-494336131
		//
		// There appear to be deadlock scenarios in which we don't manage to
		// fully stop the grpc server (which implies closing the listener, i.e.
		// seeming dead to the outside world) or don't manage to shut down the
		// stopper (the evidence in #37425 is inconclusive which one it is).
		//
		// Other problems in this area are known, such as
		// https://github.com/cockroachdb/cockroach/pull/31692
		//
		// The signal-based shutdown path uses a similar time-based escape hatch.
		// Until we spend (potentially lots of time to) understand and fix this
		// issue, this will serve us well.
		os.Exit(1)
		return errors.New("unreachable")
	}
}

// Drain idempotently activates the draining mode.
// Note: new code should not be taught to use this method
// directly. Use the Drain() RPC instead with a suitably crafted
// DrainRequest.
//
// On failure, the system may be in a partially drained
// state; the client should either continue calling Drain() or shut
// down the server.
//
// The reporter function, if non-nil, is called for each
// packet of load shed away from the server during the drain.
//
// TODO(knz): This method is currently exported for use by the
// shutdown code in cli/start.go; however, this is a mis-design. The
// start code should use the Drain() RPC like quit does.
func (s *Server) Drain(ctx context.Context) (remaining uint64, info string, err error) {
	reports := make(map[string]int)
	var mu syncutil.Mutex
	reporter := func(howMany int, what string) {
		if howMany > 0 {
			mu.Lock()
			reports[what] += howMany
			mu.Unlock()
		}
	}
	defer func() {
		// Detail the counts based on the collected reports.
		var descBuf strings.Builder
		comma := ""
		for what, howMany := range reports {
			remaining += uint64(howMany)
			fmt.Fprintf(&descBuf, "%s%s: %d", comma, what, howMany)
			comma = ", "
		}
		info = descBuf.String()
		log.Infof(ctx, "drain remaining: %d", remaining)
		if info != "" {
			log.Infof(ctx, "drain details: %s", info)
		}
	}()

	if err := s.doDrain(ctx, reporter); err != nil {
		return 0, "", err
	}

	return
}

func (s *Server) doDrain(ctx context.Context, reporter func(int, string)) error {
	// First drain all clients and SQL leases.
	if err := s.drainClients(ctx, reporter); err != nil {
		return err
	}
	// Finally, mark the node as draining in liveness and drain the
	// range leases.
	return s.drainNode(ctx, reporter)
}

// isDraining returns true if either clients are being drained
// or one of the stores on the node is not accepting replicas.
func (s *Server) isDraining() bool {
	return s.sqlServer.pgServer.IsDraining() || s.node.IsDraining()
}

// drainClients starts draining the SQL layer.
func (s *Server) drainClients(ctx context.Context, reporter func(int, string)) error {
	// Mark the server as draining in a way that probes to
	// /health?ready=1 will notice.
	s.grpc.setMode(modeDraining)
	// Wait for drainWait. This will fail load balancer checks and
	// delay draining so that client traffic can move off this node.
	time.Sleep(drainWait.Get(&s.st.SV))

	// Disable incoming SQL clients up to the queryWait timeout.
	drainMaxWait := queryWait.Get(&s.st.SV)
	if err := s.sqlServer.pgServer.Drain(drainMaxWait, reporter); err != nil {
		return err
	}
	// Stop ongoing SQL execution up to the queryWait timeout.
	s.sqlServer.distSQLServer.Drain(ctx, drainMaxWait, reporter)

	// Drain the SQL leases. This must be done after the pgServer has
	// given sessions a chance to finish ongoing work.
	s.sqlServer.leaseMgr.SetDraining(true /* drain */, reporter)

	// Done. The SQL leases have already been drained above.
	return nil
}

// drainNode initiates the draining mode for the node, which
// starts draining range leases.
func (s *Server) drainNode(ctx context.Context, reporter func(int, string)) error {
	s.nodeLiveness.SetDraining(ctx, true /* drain */, reporter)
	return s.node.SetDraining(true /* drain */, reporter)
}
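
For context, the sketch below shows how a caller (for example the CLI's quit/drain path referenced in the comments above) might invoke this Drain RPC from the client side. It is a minimal sketch, not CockroachDB's actual CLI code: it assumes the standard gRPC-generated serverpb.AdminClient bindings (serverpb.NewAdminClient and a server-streaming Drain method) and an already-established *grpc.ClientConn; connection setup, TLS, and retry handling are omitted.

// Package drainexample is a hypothetical, illustrative client of the Drain
// RPC implemented in drain.go above; it is not part of the cockroach tree.
package drainexample

import (
	"context"
	"fmt"
	"io"

	"github.com/cockroachdb/cockroach/pkg/server/serverpb"
	"google.golang.org/grpc"
)

// drainNode asks the server to drain (DoDrain=true) without terminating the
// process (Shutdown=false), then prints the remaining-work report streamed
// back in the DrainResponse.
func drainNode(ctx context.Context, conn *grpc.ClientConn) error {
	admin := serverpb.NewAdminClient(conn)
	stream, err := admin.Drain(ctx, &serverpb.DrainRequest{
		DoDrain:  true,
		Shutdown: false,
	})
	if err != nil {
		return err
	}
	for {
		res, err := stream.Recv()
		if err == io.EOF {
			// The server closes the stream once its response has been sent.
			return nil
		}
		if err != nil {
			return err
		}
		fmt.Printf("draining=%v remaining=%d details=%q\n",
			res.IsDraining, res.DrainRemainingIndicator, res.DrainRemainingDescription)
	}
}

A caller that also wants the process to exit would set Shutdown: true and then expect the connection to drop, mirroring the goroutine and the 10-second escape hatch in the handler above.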