github.com/projectcontour/contour@v1.28.2/cmd/contour/shutdownmanager.go (about)

     1  // Copyright Project Contour Authors
     2  // Licensed under the Apache License, Version 2.0 (the "License");
     3  // you may not use this file except in compliance with the License.
     4  // You may obtain a copy of the License at
     5  //
     6  //     http://www.apache.org/licenses/LICENSE-2.0
     7  //
     8  // Unless required by applicable law or agreed to in writing, software
     9  // distributed under the License is distributed on an "AS IS" BASIS,
    10  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    11  // See the License for the specific language governing permissions and
    12  // limitations under the License.
    13  
    14  package main
    15  
    16  import (
    17  	"context"
    18  	"fmt"
    19  	"io"
    20  	"log"
    21  	"net"
    22  	"net/http"
    23  	"os"
    24  	"time"
    25  
    26  	"github.com/alecthomas/kingpin/v2"
    27  	"github.com/prometheus/common/expfmt"
    28  	"github.com/sirupsen/logrus"
    29  	"k8s.io/apimachinery/pkg/util/wait"
    30  	"k8s.io/client-go/util/retry"
    31  )
    32  
    33  const (
    34  	prometheusURL      = "http://unix/stats/prometheus"
    35  	healthcheckFailURL = "http://unix/healthcheck/fail"
    36  	prometheusStat     = "envoy_http_downstream_cx_active"
    37  )
    38  
    39  // shutdownReadyFile is the default file path used in the /shutdown endpoint.
    40  const shutdownReadyFile = "/admin/ok"
    41  
    42  // shutdownReadyCheckInterval is the default polling interval for the file used in the /shutdown endpoint.
    43  const shutdownReadyCheckInterval = time.Second * 1
    44  
    45  type shutdownmanagerContext struct {
    46  	// httpServePort defines what port the shutdown-manager listens on
    47  	httpServePort int
    48  	// shutdownReadyFile is the default file path used in the /shutdown endpoint
    49  	shutdownReadyFile string
    50  	// shutdownReadyCheckInterval is the polling interval for the file used in the /shutdown endpoint
    51  	shutdownReadyCheckInterval time.Duration
    52  
    53  	logrus.FieldLogger
    54  }
    55  
    56  type shutdownContext struct {
    57  	// checkInterval defines time delay between polling Envoy for open connections
    58  	checkInterval time.Duration
    59  
    60  	// checkDelay defines time to wait before polling Envoy for open connections
    61  	checkDelay time.Duration
    62  
    63  	// drainDelay defines time to wait before draining Envoy connections
    64  	drainDelay time.Duration
    65  
    66  	// minOpenConnections defines the minimum amount of connections
    67  	// that can be open when polling for active connections in Envoy
    68  	minOpenConnections int
    69  
    70  	// Deprecated: adminPort defines the port for the Envoy admin webpage, being configurable through --admin-port flag
    71  	adminPort int
    72  
    73  	// adminAddress defines the address for the Envoy admin webpage, being configurable through --admin-address flag
    74  	adminAddress string
    75  
    76  	// shutdownReadyFile defines the name of the file that is used to signal that shutdown is completed.
    77  	shutdownReadyFile string
    78  
    79  	logrus.FieldLogger
    80  }
    81  
    82  func newShutdownManagerContext() *shutdownmanagerContext {
    83  	// Set defaults for parameters which are then overridden via flags, ENV, or ConfigFile
    84  	return &shutdownmanagerContext{
    85  		httpServePort:              8090,
    86  		shutdownReadyFile:          shutdownReadyFile,
    87  		shutdownReadyCheckInterval: shutdownReadyCheckInterval,
    88  	}
    89  }
    90  
    91  func newShutdownContext() *shutdownContext {
    92  	return &shutdownContext{
    93  		checkInterval:      5 * time.Second,
    94  		checkDelay:         0,
    95  		drainDelay:         0,
    96  		minOpenConnections: 0,
    97  	}
    98  }
    99  
   100  // healthzHandler handles the /healthz endpoint which is used for the shutdown-manager's liveness probe.
   101  func (s *shutdownmanagerContext) healthzHandler(w http.ResponseWriter, _ *http.Request) {
   102  	if _, err := w.Write([]byte(http.StatusText(http.StatusOK))); err != nil {
   103  		s.WithField("context", "healthzHandler").Error(err)
   104  	}
   105  }
   106  
   107  // shutdownReadyHandler handles the /shutdown endpoint which is used by Envoy to determine if it can terminate.
   108  // Once enough connections have drained based upon configuration, a file will be written in
   109  // the shutdown manager's file system. Any HTTP request to /shutdown will use the existence of this
   110  // file to understand if it is safe to terminate. The file-based approach is used since the process in which
   111  // the kubelet calls the shutdown command is different than the HTTP request from Envoy to /shutdown
   112  func (s *shutdownmanagerContext) shutdownReadyHandler(w http.ResponseWriter, r *http.Request) {
   113  	l := s.WithField("context", "shutdownReadyHandler")
   114  	ctx := r.Context()
   115  	for {
   116  		_, err := os.Stat(s.shutdownReadyFile)
   117  		switch {
   118  		case os.IsNotExist(err):
   119  			l.Infof("file %s does not exist; checking again in %v", s.shutdownReadyFile,
   120  				s.shutdownReadyCheckInterval)
   121  		case err == nil:
   122  			l.Infof("detected file %s; sending HTTP response", s.shutdownReadyFile)
   123  			if _, err := w.Write([]byte(http.StatusText(http.StatusOK))); err != nil {
   124  				l.Error(err)
   125  			}
   126  			return
   127  		default:
   128  			l.Errorf("error checking for file: %v", err)
   129  		}
   130  
   131  		select {
   132  		case <-time.After(s.shutdownReadyCheckInterval):
   133  		case <-ctx.Done():
   134  			l.Infof("client request cancelled")
   135  			return
   136  		}
   137  	}
   138  }
   139  
   140  // shutdownHandler is called from a pod preStop hook, where it will block pod shutdown
   141  // until envoy is able to drain connections to below the min-open threshold.
   142  func (s *shutdownContext) shutdownHandler() {
   143  	s.WithField("context", "shutdownHandler").Infof("waiting %s before draining connections", s.drainDelay)
   144  	time.Sleep(s.drainDelay)
   145  
   146  	// Send shutdown signal to Envoy to start draining connections
   147  	s.Infof("failing envoy healthchecks")
   148  
   149  	// Retry any failures to shutdownEnvoy(s.adminAddress) in a Backoff time window
   150  	// doing 4 total attempts, multiplying the Duration by the Factor
   151  	// for each iteration.
   152  	err := retry.OnError(wait.Backoff{
   153  		Steps:    4,
   154  		Duration: 200 * time.Millisecond,
   155  		Factor:   5.0,
   156  		Jitter:   0.1,
   157  	}, func(err error) bool {
   158  		// Always retry any error.
   159  		return true
   160  	}, func() error {
   161  		s.Infof("attempting to shutdown")
   162  		return shutdownEnvoy(s.adminAddress)
   163  	})
   164  	if err != nil {
   165  		// May be conflict if max retries were hit, or may be something unrelated
   166  		// like permissions or a network error
   167  		s.WithField("context", "shutdownHandler").Errorf("error sending envoy healthcheck fail after 4 attempts: %v", err)
   168  	}
   169  
   170  	s.WithField("context", "shutdownHandler").Infof("waiting %s before polling for draining connections", s.checkDelay)
   171  	time.Sleep(s.checkDelay)
   172  
   173  	for {
   174  		openConnections, err := getOpenConnections(s.adminAddress)
   175  		if err != nil {
   176  			s.Error(err)
   177  		} else {
   178  			if openConnections <= s.minOpenConnections {
   179  				s.WithField("context", "shutdownHandler").
   180  					WithField("open_connections", openConnections).
   181  					WithField("min_connections", s.minOpenConnections).
   182  					Info("min number of open connections found, shutting down")
   183  				file, err := os.Create(s.shutdownReadyFile)
   184  				if err != nil {
   185  					s.Error(err)
   186  				}
   187  				defer file.Close()
   188  				return
   189  			}
   190  			s.WithField("context", "shutdownHandler").
   191  				WithField("open_connections", openConnections).
   192  				WithField("min_connections", s.minOpenConnections).
   193  				Info("polled open connections")
   194  		}
   195  		time.Sleep(s.checkInterval)
   196  	}
   197  }
   198  
   199  // shutdownEnvoy sends a POST request to /healthcheck/fail to tell Envoy to start draining connections
   200  func shutdownEnvoy(adminAddress string) error {
   201  	httpClient := http.Client{
   202  		Transport: &http.Transport{
   203  			DialContext: func(_ context.Context, _, _ string) (net.Conn, error) {
   204  				return net.Dial("unix", adminAddress)
   205  			},
   206  		},
   207  	}
   208  	/* #nosec */
   209  	resp, err := httpClient.Post(healthcheckFailURL, "", nil)
   210  	if err != nil {
   211  		return fmt.Errorf("creating healthcheck fail POST request failed: %s", err)
   212  	}
   213  
   214  	defer resp.Body.Close()
   215  	if resp.StatusCode != http.StatusOK {
   216  		return fmt.Errorf("POST for %q returned HTTP status %s", healthcheckFailURL, resp.Status)
   217  	}
   218  	return nil
   219  }
   220  
   221  // getOpenConnections parses a http request to a prometheus endpoint returning the sum of values found
   222  func getOpenConnections(adminAddress string) (int, error) {
   223  	httpClient := http.Client{
   224  		Transport: &http.Transport{
   225  			DialContext: func(_ context.Context, _, _ string) (net.Conn, error) {
   226  				return net.Dial("unix", adminAddress)
   227  			},
   228  		},
   229  	}
   230  
   231  	// Make request to Envoy Prometheus endpoint
   232  	/* #nosec */
   233  	resp, err := httpClient.Get(prometheusURL)
   234  	if err != nil {
   235  		return -1, fmt.Errorf("creating metrics GET request failed: %s", err)
   236  	}
   237  	defer resp.Body.Close()
   238  	if resp.StatusCode != http.StatusOK {
   239  		return -1, fmt.Errorf("GET for %q returned HTTP status %s", prometheusURL, resp.Status)
   240  	}
   241  
   242  	// Parse Prometheus listener stats for open connections
   243  	return parseOpenConnections(resp.Body)
   244  }
   245  
   246  // parseOpenConnections returns the sum of open connections from a Prometheus HTTP request
   247  func parseOpenConnections(stats io.Reader) (int, error) {
   248  	var parser expfmt.TextParser
   249  	openConnections := 0
   250  
   251  	if stats == nil {
   252  		return -1, fmt.Errorf("stats input was nil")
   253  	}
   254  
   255  	// Parse Prometheus http response
   256  	metricFamilies, err := parser.TextToMetricFamilies(stats)
   257  	if err != nil {
   258  		return -1, fmt.Errorf("parsing Prometheus text format failed: %v", err)
   259  	}
   260  
   261  	// Validate stat exists in output
   262  	if _, ok := metricFamilies[prometheusStat]; !ok {
   263  		return -1, fmt.Errorf("error finding Prometheus stat %q in the request result", prometheusStat)
   264  	}
   265  
   266  	// Look up open connections value
   267  	for _, metrics := range metricFamilies[prometheusStat].Metric {
   268  		for _, labels := range metrics.Label {
   269  			switch labels.GetValue() {
   270  			// don't count connections to these listeners.
   271  			case "admin", "envoy-admin", "stats", "health", "stats-health":
   272  			default:
   273  				openConnections += int(metrics.Gauge.GetValue())
   274  			}
   275  		}
   276  	}
   277  	return openConnections, nil
   278  }
   279  
   280  func doShutdownManager(config *shutdownmanagerContext) {
   281  	config.Info("started envoy shutdown manager")
   282  
   283  	http.HandleFunc("/healthz", config.healthzHandler)
   284  	http.HandleFunc("/shutdown", config.shutdownReadyHandler)
   285  
   286  	// Fails gosec G114: Use of net/http serve function that has no support for setting timeouts
   287  	// nolint:gosec
   288  	if err := http.ListenAndServe(fmt.Sprintf(":%d", config.httpServePort), nil); err != http.ErrServerClosed {
   289  		log.Fatal(err)
   290  	}
   291  	config.Info("stopped")
   292  }
   293  
   294  // registerShutdownManager registers the envoy shutdown-manager sub-command and flags
   295  func registerShutdownManager(cmd *kingpin.CmdClause, log logrus.FieldLogger) (*kingpin.CmdClause, *shutdownmanagerContext) {
   296  	ctx := newShutdownManagerContext()
   297  	ctx.FieldLogger = log.WithField("context", "shutdown-manager")
   298  
   299  	shutdownmgr := cmd.Command("shutdown-manager", "Start envoy shutdown-manager.")
   300  	shutdownmgr.Flag("ready-file", "File to poll while waiting shutdown to be completed.").Default(shutdownReadyFile).StringVar(&ctx.shutdownReadyFile)
   301  	shutdownmgr.Flag("serve-port", "Port to serve the http server on.").IntVar(&ctx.httpServePort)
   302  
   303  	return shutdownmgr, ctx
   304  }
   305  
   306  // registerShutdown registers the envoy shutdown sub-command and flags
   307  func registerShutdown(cmd *kingpin.CmdClause, log logrus.FieldLogger) (*kingpin.CmdClause, *shutdownContext) {
   308  	ctx := newShutdownContext()
   309  	ctx.FieldLogger = log.WithField("context", "shutdown")
   310  
   311  	shutdown := cmd.Command("shutdown", "Initiate an shutdown sequence which configures Envoy to begin draining connections.")
   312  	shutdown.Flag("admin-address", "Envoy admin interface address.").Default("/admin/admin.sock").StringVar(&ctx.adminAddress)
   313  	shutdown.Flag("admin-port", "DEPRECATED: Envoy admin interface port.").IntVar(&ctx.adminPort)
   314  	shutdown.Flag("check-delay", "Time to wait before polling Envoy for open connections.").Default("0s").DurationVar(&ctx.checkDelay)
   315  	shutdown.Flag("check-interval", "Time to poll Envoy for open connections.").DurationVar(&ctx.checkInterval)
   316  	shutdown.Flag("drain-delay", "Time to wait before draining Envoy connections.").Default("0s").DurationVar(&ctx.drainDelay)
   317  	shutdown.Flag("min-open-connections", "Min number of open connections when polling Envoy.").IntVar(&ctx.minOpenConnections)
   318  	shutdown.Flag("ready-file", "File to write when shutdown is completed.").Default(shutdownReadyFile).StringVar(&ctx.shutdownReadyFile)
   319  
   320  	return shutdown, ctx
   321  }