github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/cli/quit.go (about) 1 // Copyright 2020 The Cockroach Authors. 2 // 3 // Use of this software is governed by the Business Source License 4 // included in the file licenses/BSL.txt. 5 // 6 // As of the Change Date specified in that file, in accordance with 7 // the Business Source License, use of this software will be governed 8 // by the Apache License, Version 2.0, included in the file 9 // licenses/APL.txt. 10 11 package cli 12 13 import ( 14 "context" 15 "fmt" 16 "io" 17 "time" 18 19 "github.com/cockroachdb/cockroach/pkg/server" 20 "github.com/cockroachdb/cockroach/pkg/server/serverpb" 21 "github.com/cockroachdb/cockroach/pkg/util/contextutil" 22 "github.com/cockroachdb/cockroach/pkg/util/grpcutil" 23 "github.com/cockroachdb/cockroach/pkg/util/log" 24 "github.com/cockroachdb/errors" 25 "github.com/spf13/cobra" 26 ) 27 28 // quitCmd command shuts down the node server. 29 var quitCmd = &cobra.Command{ 30 Use: "quit", 31 Short: "drain and shut down a node\n", 32 Long: ` 33 Shut down the server. The first stage is drain, where the server 34 stops accepting client connections, then stops extant 35 connections, and finally pushes range leases onto other nodes, 36 subject to various timeout parameters configurable via 37 cluster settings. After the first stage completes, 38 the server process is shut down. 39 40 See also 'cockroach node drain' to drain a server 41 without stopping the server process. 42 `, 43 Args: cobra.NoArgs, 44 RunE: MaybeDecorateGRPCError(runQuit), 45 } 46 47 // runQuit accesses the quit shutdown path. 48 func runQuit(cmd *cobra.Command, args []string) (err error) { 49 ctx, cancel := context.WithCancel(context.Background()) 50 defer cancel() 51 52 // At the end, we'll report "ok" if there was no error. 53 defer func() { 54 if err == nil { 55 fmt.Println("ok") 56 } 57 }() 58 59 // Establish a RPC connection. 60 c, finish, err := getAdminClient(ctx, serverCfg) 61 if err != nil { 62 return err 63 } 64 defer finish() 65 66 return drainAndShutdown(ctx, c) 67 } 68 69 // drainAndShutdown attempts to drain the server and then shut it 70 // down. 71 func drainAndShutdown(ctx context.Context, c serverpb.AdminClient) (err error) { 72 hardError, remainingWork, err := doDrain(ctx, c) 73 if hardError { 74 return err 75 } 76 77 if remainingWork { 78 log.Warningf(ctx, "graceful shutdown may not have completed successfully; check the node's logs for details.") 79 } 80 81 if err != nil { 82 log.Warningf(ctx, "drain did not complete successfully; hard shutdown may cause disruption") 83 } 84 // We have already performed the drain above, so now go straight to 85 // shutdown. We try twice just in case there is a transient error. 86 hardErr, err := doShutdown(ctx, c) 87 if err != nil && !hardErr { 88 log.Warningf(ctx, "hard shutdown attempt failed, retrying: %v", err) 89 _, err = doShutdown(ctx, c) 90 } 91 return errors.Wrap(err, "hard shutdown failed") 92 } 93 94 // doDrain calls a graceful drain. 95 // 96 // If the function returns hardError true, then the caller should not 97 // proceed with an alternate strategy (it's likely the server has gone 98 // away). 99 func doDrain( 100 ctx context.Context, c serverpb.AdminClient, 101 ) (hardError, remainingWork bool, err error) { 102 // The next step is to drain. The timeout is configurable 103 // via --drain-wait. 104 if quitCtx.drainWait == 0 { 105 return doDrainNoTimeout(ctx, c) 106 } 107 108 err = contextutil.RunWithTimeout(ctx, "drain", quitCtx.drainWait, func(ctx context.Context) (err error) { 109 hardError, remainingWork, err = doDrainNoTimeout(ctx, c) 110 return err 111 }) 112 if errors.HasType(err, (*contextutil.TimeoutError)(nil)) || grpcutil.IsTimeout(err) { 113 log.Infof(ctx, "drain timed out: %v", err) 114 err = errors.New("drain timeout") 115 } 116 return 117 } 118 119 func doDrainNoTimeout( 120 ctx context.Context, c serverpb.AdminClient, 121 ) (hardError, remainingWork bool, err error) { 122 defer func() { 123 if server.IsWaitingForInit(err) { 124 log.Infof(ctx, "%v", err) 125 err = errors.New("node cannot be drained before it has been initialized") 126 } 127 }() 128 129 remainingWork = true 130 for { 131 // Tell the user we're starting to drain. This enables the user to 132 // mentally prepare for something to take some time, as opposed to 133 // wondering why nothing is happening. 134 fmt.Fprintf(stderr, "node is draining... ") // notice no final newline. 135 136 // Send a drain request with the drain bit set and the shutdown bit 137 // unset. 138 stream, err := c.Drain(ctx, &serverpb.DrainRequest{ 139 DeprecatedProbeIndicator: server.DeprecatedDrainParameter, 140 DoDrain: true, 141 }) 142 if err != nil { 143 fmt.Fprintf(stderr, "\n") // finish the line started above. 144 return !grpcutil.IsTimeout(err), remainingWork, errors.Wrap(err, "error sending drain request") 145 } 146 for { 147 resp, err := stream.Recv() 148 if err == io.EOF { 149 // Done. 150 break 151 } 152 if err != nil { 153 // Unexpected error. 154 fmt.Fprintf(stderr, "\n") // finish the line started above. 155 log.Infof(ctx, "graceful shutdown failed: %v", err) 156 return false, remainingWork, err 157 } 158 159 if resp.IsDraining { 160 // We want to assert that the node is quitting, and tell the 161 // story about how much work was performed in logs for 162 // debugging. 163 finalString := "" 164 if resp.DrainRemainingIndicator == 0 { 165 finalString = " (complete)" 166 } 167 // We use stderr so that 'cockroach quit''s stdout remains a 168 // simple 'ok' in case of success (for compatibility with 169 // scripts). 170 fmt.Fprintf(stderr, "remaining: %d%s\n", 171 resp.DrainRemainingIndicator, finalString) 172 remainingWork = resp.DrainRemainingIndicator > 0 173 } else { 174 // Either the server has decided it wanted to stop quitting; or 175 // we're running a pre-20.1 node which doesn't populate IsDraining. 176 // In either case, we need to stop sending drain requests. 177 remainingWork = false 178 fmt.Fprintf(stderr, "done\n") 179 } 180 181 if resp.DrainRemainingDescription != "" { 182 // Only show this information in the log; we'd use this for debugging. 183 // (This can be revealed e.g. via --logtostderr.) 184 log.Infof(ctx, "drain details: %s\n", resp.DrainRemainingDescription) 185 } 186 187 // Iterate until end of stream, which indicates the drain is 188 // complete. 189 } 190 if !remainingWork { 191 break 192 } 193 // Avoid a busy wait with high CPU/network usage if the server 194 // replies with an incomplete drain too quickly. 195 time.Sleep(200 * time.Millisecond) 196 } 197 return false, remainingWork, nil 198 } 199 200 // doShutdown attempts to trigger a server shutdown *without* 201 // draining. Use doDrain() prior to perform a drain, or 202 // drainAndShutdown() to combine both. 203 func doShutdown(ctx context.Context, c serverpb.AdminClient) (hardError bool, err error) { 204 defer func() { 205 if server.IsWaitingForInit(err) { 206 log.Infof(ctx, "encountered error: %v", err) 207 err = errors.New("node cannot be shut down before it has been initialized") 208 err = errors.WithHint(err, "You can still stop the process using a service manager or a signal.") 209 hardError = true 210 } 211 if grpcutil.IsClosedConnection(err) { 212 // This most likely means that we shut down successfully. Note 213 // that sometimes the connection can be shut down even before a 214 // DrainResponse gets sent back to us, so we don't require a 215 // response on the stream (see #14184). 216 err = nil 217 } 218 }() 219 220 // We use a shorter timeout because a shutdown request has nothing 221 // else to do than shut down the node immediately. 222 err = contextutil.RunWithTimeout(ctx, "hard shutdown", 10*time.Second, func(ctx context.Context) error { 223 // Send a drain request with the drain bit unset (no drain). 224 // and the shutdown bit set. 225 stream, err := c.Drain(ctx, &serverpb.DrainRequest{Shutdown: true}) 226 if err != nil { 227 return errors.Wrap(err, "error sending shutdown request") 228 } 229 for { 230 _, err := stream.Recv() 231 if err == io.EOF { 232 return nil 233 } 234 if err != nil { 235 return err 236 } 237 } 238 }) 239 if !errors.HasType(err, (*contextutil.TimeoutError)(nil)) { 240 hardError = true 241 } 242 return hardError, err 243 } 244 245 // getAdminClient returns an AdminClient and a closure that must be invoked 246 // to free associated resources. 247 func getAdminClient(ctx context.Context, cfg server.Config) (serverpb.AdminClient, func(), error) { 248 conn, _, finish, err := getClientGRPCConn(ctx, cfg) 249 if err != nil { 250 return nil, nil, errors.Wrap(err, "Failed to connect to the node") 251 } 252 return serverpb.NewAdminClient(conn), finish, nil 253 }