github.com/projectcontour/contour@v1.28.2/cmd/contour/shutdownmanager.go (about) 1 // Copyright Project Contour Authors 2 // Licensed under the Apache License, Version 2.0 (the "License"); 3 // you may not use this file except in compliance with the License. 4 // You may obtain a copy of the License at 5 // 6 // http://www.apache.org/licenses/LICENSE-2.0 7 // 8 // Unless required by applicable law or agreed to in writing, software 9 // distributed under the License is distributed on an "AS IS" BASIS, 10 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 // See the License for the specific language governing permissions and 12 // limitations under the License. 13 14 package main 15 16 import ( 17 "context" 18 "fmt" 19 "io" 20 "log" 21 "net" 22 "net/http" 23 "os" 24 "time" 25 26 "github.com/alecthomas/kingpin/v2" 27 "github.com/prometheus/common/expfmt" 28 "github.com/sirupsen/logrus" 29 "k8s.io/apimachinery/pkg/util/wait" 30 "k8s.io/client-go/util/retry" 31 ) 32 33 const ( 34 prometheusURL = "http://unix/stats/prometheus" 35 healthcheckFailURL = "http://unix/healthcheck/fail" 36 prometheusStat = "envoy_http_downstream_cx_active" 37 ) 38 39 // shutdownReadyFile is the default file path used in the /shutdown endpoint. 40 const shutdownReadyFile = "/admin/ok" 41 42 // shutdownReadyCheckInterval is the default polling interval for the file used in the /shutdown endpoint. 43 const shutdownReadyCheckInterval = time.Second * 1 44 45 type shutdownmanagerContext struct { 46 // httpServePort defines what port the shutdown-manager listens on 47 httpServePort int 48 // shutdownReadyFile is the default file path used in the /shutdown endpoint 49 shutdownReadyFile string 50 // shutdownReadyCheckInterval is the polling interval for the file used in the /shutdown endpoint 51 shutdownReadyCheckInterval time.Duration 52 53 logrus.FieldLogger 54 } 55 56 type shutdownContext struct { 57 // checkInterval defines time delay between polling Envoy for open connections 58 checkInterval time.Duration 59 60 // checkDelay defines time to wait before polling Envoy for open connections 61 checkDelay time.Duration 62 63 // drainDelay defines time to wait before draining Envoy connections 64 drainDelay time.Duration 65 66 // minOpenConnections defines the minimum amount of connections 67 // that can be open when polling for active connections in Envoy 68 minOpenConnections int 69 70 // Deprecated: adminPort defines the port for the Envoy admin webpage, being configurable through --admin-port flag 71 adminPort int 72 73 // adminAddress defines the address for the Envoy admin webpage, being configurable through --admin-address flag 74 adminAddress string 75 76 // shutdownReadyFile defines the name of the file that is used to signal that shutdown is completed. 77 shutdownReadyFile string 78 79 logrus.FieldLogger 80 } 81 82 func newShutdownManagerContext() *shutdownmanagerContext { 83 // Set defaults for parameters which are then overridden via flags, ENV, or ConfigFile 84 return &shutdownmanagerContext{ 85 httpServePort: 8090, 86 shutdownReadyFile: shutdownReadyFile, 87 shutdownReadyCheckInterval: shutdownReadyCheckInterval, 88 } 89 } 90 91 func newShutdownContext() *shutdownContext { 92 return &shutdownContext{ 93 checkInterval: 5 * time.Second, 94 checkDelay: 0, 95 drainDelay: 0, 96 minOpenConnections: 0, 97 } 98 } 99 100 // healthzHandler handles the /healthz endpoint which is used for the shutdown-manager's liveness probe. 101 func (s *shutdownmanagerContext) healthzHandler(w http.ResponseWriter, _ *http.Request) { 102 if _, err := w.Write([]byte(http.StatusText(http.StatusOK))); err != nil { 103 s.WithField("context", "healthzHandler").Error(err) 104 } 105 } 106 107 // shutdownReadyHandler handles the /shutdown endpoint which is used by Envoy to determine if it can terminate. 108 // Once enough connections have drained based upon configuration, a file will be written in 109 // the shutdown manager's file system. Any HTTP request to /shutdown will use the existence of this 110 // file to understand if it is safe to terminate. The file-based approach is used since the process in which 111 // the kubelet calls the shutdown command is different than the HTTP request from Envoy to /shutdown 112 func (s *shutdownmanagerContext) shutdownReadyHandler(w http.ResponseWriter, r *http.Request) { 113 l := s.WithField("context", "shutdownReadyHandler") 114 ctx := r.Context() 115 for { 116 _, err := os.Stat(s.shutdownReadyFile) 117 switch { 118 case os.IsNotExist(err): 119 l.Infof("file %s does not exist; checking again in %v", s.shutdownReadyFile, 120 s.shutdownReadyCheckInterval) 121 case err == nil: 122 l.Infof("detected file %s; sending HTTP response", s.shutdownReadyFile) 123 if _, err := w.Write([]byte(http.StatusText(http.StatusOK))); err != nil { 124 l.Error(err) 125 } 126 return 127 default: 128 l.Errorf("error checking for file: %v", err) 129 } 130 131 select { 132 case <-time.After(s.shutdownReadyCheckInterval): 133 case <-ctx.Done(): 134 l.Infof("client request cancelled") 135 return 136 } 137 } 138 } 139 140 // shutdownHandler is called from a pod preStop hook, where it will block pod shutdown 141 // until envoy is able to drain connections to below the min-open threshold. 142 func (s *shutdownContext) shutdownHandler() { 143 s.WithField("context", "shutdownHandler").Infof("waiting %s before draining connections", s.drainDelay) 144 time.Sleep(s.drainDelay) 145 146 // Send shutdown signal to Envoy to start draining connections 147 s.Infof("failing envoy healthchecks") 148 149 // Retry any failures to shutdownEnvoy(s.adminAddress) in a Backoff time window 150 // doing 4 total attempts, multiplying the Duration by the Factor 151 // for each iteration. 152 err := retry.OnError(wait.Backoff{ 153 Steps: 4, 154 Duration: 200 * time.Millisecond, 155 Factor: 5.0, 156 Jitter: 0.1, 157 }, func(err error) bool { 158 // Always retry any error. 159 return true 160 }, func() error { 161 s.Infof("attempting to shutdown") 162 return shutdownEnvoy(s.adminAddress) 163 }) 164 if err != nil { 165 // May be conflict if max retries were hit, or may be something unrelated 166 // like permissions or a network error 167 s.WithField("context", "shutdownHandler").Errorf("error sending envoy healthcheck fail after 4 attempts: %v", err) 168 } 169 170 s.WithField("context", "shutdownHandler").Infof("waiting %s before polling for draining connections", s.checkDelay) 171 time.Sleep(s.checkDelay) 172 173 for { 174 openConnections, err := getOpenConnections(s.adminAddress) 175 if err != nil { 176 s.Error(err) 177 } else { 178 if openConnections <= s.minOpenConnections { 179 s.WithField("context", "shutdownHandler"). 180 WithField("open_connections", openConnections). 181 WithField("min_connections", s.minOpenConnections). 182 Info("min number of open connections found, shutting down") 183 file, err := os.Create(s.shutdownReadyFile) 184 if err != nil { 185 s.Error(err) 186 } 187 defer file.Close() 188 return 189 } 190 s.WithField("context", "shutdownHandler"). 191 WithField("open_connections", openConnections). 192 WithField("min_connections", s.minOpenConnections). 193 Info("polled open connections") 194 } 195 time.Sleep(s.checkInterval) 196 } 197 } 198 199 // shutdownEnvoy sends a POST request to /healthcheck/fail to tell Envoy to start draining connections 200 func shutdownEnvoy(adminAddress string) error { 201 httpClient := http.Client{ 202 Transport: &http.Transport{ 203 DialContext: func(_ context.Context, _, _ string) (net.Conn, error) { 204 return net.Dial("unix", adminAddress) 205 }, 206 }, 207 } 208 /* #nosec */ 209 resp, err := httpClient.Post(healthcheckFailURL, "", nil) 210 if err != nil { 211 return fmt.Errorf("creating healthcheck fail POST request failed: %s", err) 212 } 213 214 defer resp.Body.Close() 215 if resp.StatusCode != http.StatusOK { 216 return fmt.Errorf("POST for %q returned HTTP status %s", healthcheckFailURL, resp.Status) 217 } 218 return nil 219 } 220 221 // getOpenConnections parses a http request to a prometheus endpoint returning the sum of values found 222 func getOpenConnections(adminAddress string) (int, error) { 223 httpClient := http.Client{ 224 Transport: &http.Transport{ 225 DialContext: func(_ context.Context, _, _ string) (net.Conn, error) { 226 return net.Dial("unix", adminAddress) 227 }, 228 }, 229 } 230 231 // Make request to Envoy Prometheus endpoint 232 /* #nosec */ 233 resp, err := httpClient.Get(prometheusURL) 234 if err != nil { 235 return -1, fmt.Errorf("creating metrics GET request failed: %s", err) 236 } 237 defer resp.Body.Close() 238 if resp.StatusCode != http.StatusOK { 239 return -1, fmt.Errorf("GET for %q returned HTTP status %s", prometheusURL, resp.Status) 240 } 241 242 // Parse Prometheus listener stats for open connections 243 return parseOpenConnections(resp.Body) 244 } 245 246 // parseOpenConnections returns the sum of open connections from a Prometheus HTTP request 247 func parseOpenConnections(stats io.Reader) (int, error) { 248 var parser expfmt.TextParser 249 openConnections := 0 250 251 if stats == nil { 252 return -1, fmt.Errorf("stats input was nil") 253 } 254 255 // Parse Prometheus http response 256 metricFamilies, err := parser.TextToMetricFamilies(stats) 257 if err != nil { 258 return -1, fmt.Errorf("parsing Prometheus text format failed: %v", err) 259 } 260 261 // Validate stat exists in output 262 if _, ok := metricFamilies[prometheusStat]; !ok { 263 return -1, fmt.Errorf("error finding Prometheus stat %q in the request result", prometheusStat) 264 } 265 266 // Look up open connections value 267 for _, metrics := range metricFamilies[prometheusStat].Metric { 268 for _, labels := range metrics.Label { 269 switch labels.GetValue() { 270 // don't count connections to these listeners. 271 case "admin", "envoy-admin", "stats", "health", "stats-health": 272 default: 273 openConnections += int(metrics.Gauge.GetValue()) 274 } 275 } 276 } 277 return openConnections, nil 278 } 279 280 func doShutdownManager(config *shutdownmanagerContext) { 281 config.Info("started envoy shutdown manager") 282 283 http.HandleFunc("/healthz", config.healthzHandler) 284 http.HandleFunc("/shutdown", config.shutdownReadyHandler) 285 286 // Fails gosec G114: Use of net/http serve function that has no support for setting timeouts 287 // nolint:gosec 288 if err := http.ListenAndServe(fmt.Sprintf(":%d", config.httpServePort), nil); err != http.ErrServerClosed { 289 log.Fatal(err) 290 } 291 config.Info("stopped") 292 } 293 294 // registerShutdownManager registers the envoy shutdown-manager sub-command and flags 295 func registerShutdownManager(cmd *kingpin.CmdClause, log logrus.FieldLogger) (*kingpin.CmdClause, *shutdownmanagerContext) { 296 ctx := newShutdownManagerContext() 297 ctx.FieldLogger = log.WithField("context", "shutdown-manager") 298 299 shutdownmgr := cmd.Command("shutdown-manager", "Start envoy shutdown-manager.") 300 shutdownmgr.Flag("ready-file", "File to poll while waiting shutdown to be completed.").Default(shutdownReadyFile).StringVar(&ctx.shutdownReadyFile) 301 shutdownmgr.Flag("serve-port", "Port to serve the http server on.").IntVar(&ctx.httpServePort) 302 303 return shutdownmgr, ctx 304 } 305 306 // registerShutdown registers the envoy shutdown sub-command and flags 307 func registerShutdown(cmd *kingpin.CmdClause, log logrus.FieldLogger) (*kingpin.CmdClause, *shutdownContext) { 308 ctx := newShutdownContext() 309 ctx.FieldLogger = log.WithField("context", "shutdown") 310 311 shutdown := cmd.Command("shutdown", "Initiate an shutdown sequence which configures Envoy to begin draining connections.") 312 shutdown.Flag("admin-address", "Envoy admin interface address.").Default("/admin/admin.sock").StringVar(&ctx.adminAddress) 313 shutdown.Flag("admin-port", "DEPRECATED: Envoy admin interface port.").IntVar(&ctx.adminPort) 314 shutdown.Flag("check-delay", "Time to wait before polling Envoy for open connections.").Default("0s").DurationVar(&ctx.checkDelay) 315 shutdown.Flag("check-interval", "Time to poll Envoy for open connections.").DurationVar(&ctx.checkInterval) 316 shutdown.Flag("drain-delay", "Time to wait before draining Envoy connections.").Default("0s").DurationVar(&ctx.drainDelay) 317 shutdown.Flag("min-open-connections", "Min number of open connections when polling Envoy.").IntVar(&ctx.minOpenConnections) 318 shutdown.Flag("ready-file", "File to write when shutdown is completed.").Default(shutdownReadyFile).StringVar(&ctx.shutdownReadyFile) 319 320 return shutdown, ctx 321 }