istio.io/istio@v0.0.0-20240520182934-d79c90f27776/pilot/cmd/pilot-agent/status/server.go (about)

     1  // Copyright Istio Authors
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package status
    16  
    17  import (
    18  	"context"
    19  	"crypto/tls"
    20  	"encoding/json"
    21  	"errors"
    22  	"fmt"
    23  	"io"
    24  	"mime"
    25  	"net"
    26  	"net/http"
    27  	"net/http/pprof"
    28  	"os"
    29  	"regexp"
    30  	"strconv"
    31  	"strings"
    32  	"sync"
    33  	"syscall"
    34  	"time"
    35  
    36  	"github.com/prometheus/client_golang/prometheus"
    37  	"github.com/prometheus/client_golang/prometheus/collectors"
    38  	"github.com/prometheus/common/expfmt"
    39  	"golang.org/x/net/http2"
    40  	"google.golang.org/grpc"
    41  	"google.golang.org/grpc/codes"
    42  	"google.golang.org/grpc/credentials/insecure"
    43  	grpcHealth "google.golang.org/grpc/health/grpc_health_v1"
    44  	grpcStatus "google.golang.org/grpc/status"
    45  	"k8s.io/apimachinery/pkg/util/intstr"
    46  	k8sUtilIo "k8s.io/utils/io"
    47  
    48  	"istio.io/istio/pilot/cmd/pilot-agent/metrics"
    49  	"istio.io/istio/pilot/cmd/pilot-agent/status/grpcready"
    50  	"istio.io/istio/pilot/cmd/pilot-agent/status/ready"
    51  	"istio.io/istio/pkg/config"
    52  	dnsProto "istio.io/istio/pkg/dns/proto"
    53  	"istio.io/istio/pkg/env"
    54  	commonFeatures "istio.io/istio/pkg/features"
    55  	"istio.io/istio/pkg/kube/apimirror"
    56  	"istio.io/istio/pkg/log"
    57  	"istio.io/istio/pkg/model"
    58  	"istio.io/istio/pkg/monitoring"
    59  	"istio.io/istio/pkg/network"
    60  	"istio.io/istio/pkg/slices"
    61  	istioNetUtil "istio.io/istio/pkg/util/net"
    62  )
    63  
const (
	// readyPath is for the pilot agent readiness itself.
	readyPath = "/healthz/ready"
	// quitPath is to notify the pilot agent to quit; drainPath asks it to start draining.
	// Both are handled only for POST requests coming from localhost (see handleQuit and handleDrain).
	quitPath  = "/quitquitquit"
	drainPath = "/drain"
	// KubeAppProberEnvName is the name of the environment variable used to pass the app prober config to pilot agent.
	// Its value is a json encoded string carrying app probe information from the injector (istioctl or webhook).
	// For example, ISTIO_KUBE_APP_PROBERS='{"/app-health/httpbin/livez":{"httpGet":{"path": "/hello", "port": 8080}}}'
	// indicates that the httpbin container liveness prober port is 8080 and the probing path is /hello.
	// This environment variable should never be set manually.
	KubeAppProberEnvName = "ISTIO_KUBE_APP_PROBERS"

	// localHostIPv4 and localHostIPv6 are the loopback addresses NewServer hands to the Envoy
	// readiness probe as its LocalHostAddr.
	// maxRespBodyLength (10 KiB; the expression evaluates as (10*1)<<10) caps how much of the
	// application's HTTP probe response body is relayed back to the caller.
	localHostIPv4     = "127.0.0.1"
	localHostIPv6     = "::1"
	maxRespBodyLength = 10 * 1 << 10
)
    81  
// UpstreamLocalAddressIPv4 and UpstreamLocalAddressIPv6 are the candidate local (source) addresses
// for connections the agent opens to the application when probing it. NewServer picks one of them
// and the probe dialers use it as their LocalAddr.
var (
	UpstreamLocalAddressIPv4 = &net.TCPAddr{IP: net.ParseIP("127.0.0.6")}
	UpstreamLocalAddressIPv6 = &net.TCPAddr{IP: net.ParseIP("::6")}
)

// PrometheusScrapingConfig is the ISTIO_PROMETHEUS_ANNOTATIONS environment variable. When it is set,
// NewServer decodes its value as a JSON encoded PrometheusScrapeConfiguration (sidecar proxies only).
var PrometheusScrapingConfig = env.Register("ISTIO_PROMETHEUS_ANNOTATIONS", "", "")
    88  
var (
	// appProberPattern is the shape every key of KubeAppProbers must have:
	// /app-health/<single path segment>/(livez|readyz|startupz). It is enforced by validateAppKubeProber.
	appProberPattern = regexp.MustCompile(`^/app-health/[^/]+/(livez|readyz|startupz)$`)

	// EnableHTTP2Probing is read once, at package initialization.
	EnableHTTP2Probing = env.Register("ISTIO_ENABLE_HTTP2_PROBING", true,
		"If enabled, HTTP2 probes will be enabled for HTTPS probes, following Kubernetes").Get()

	// LegacyLocalhostProbeDestination is kept as a variable (not resolved with Get here); NewServer
	// evaluates it each time a server is built.
	LegacyLocalhostProbeDestination = env.Register("REWRITE_PROBE_LEGACY_LOCALHOST_DESTINATION", false,
		"If enabled, readiness probes will be sent to 'localhost'. Otherwise, they will be sent to the Pod's IP, matching Kubernetes' behavior.")

	// ProbeKeepaliveConnections is read once, at package initialization. NewServer disables keep-alives
	// on the app probe transports unless this is true.
	ProbeKeepaliveConnections = env.Register("ENABLE_PROBE_KEEPALIVE_CONNECTIONS", false,
		"If enabled, readiness probes will keep the connection from pilot-agent to the application alive. "+
			"This mirrors older Istio versions' behaviors, but not kubelet's.").Get()
)
   102  
// KubeAppProbers holds the information about a Kubernetes pod prober.
// It's a map from the prober URL path to the Kubernetes Prober config.
// For example, "/app-health/hello-world/livez" entry contains liveness prober config for
// container "hello-world".
// The same path is also the key under which NewServer caches the HTTP client of an httpGet prober.
type KubeAppProbers map[string]*Prober
   108  
// Prober represents a single container prober.
// Exactly one of HTTPGet, TCPSocket and GRPC must be set (checked by validateAppKubeProber).
// TimeoutSeconds is used, in seconds, as the timeout of the HTTP client or TCP dialer that
// executes the probe.
type Prober struct {
	HTTPGet        *apimirror.HTTPGetAction   `json:"httpGet,omitempty"`
	TCPSocket      *apimirror.TCPSocketAction `json:"tcpSocket,omitempty"`
	GRPC           *apimirror.GRPCAction      `json:"grpc,omitempty"`
	TimeoutSeconds int32                      `json:"timeoutSeconds,omitempty"`
}
   116  
// Options for the status server.
//
// How NewServer uses the fields:
//   - PodIP: default destination of application probes, and (when IPv6 is false) the address that
//     decides between the IPv4 and IPv6 loopback/upstream addresses.
//   - KubeAppProbers: JSON decoded into a KubeAppProbers map; empty means no app probes are served.
//   - NodeType: the Prometheus scraping configuration is only honored for model.SidecarProxy.
//   - StatusPort: port the server listens on; 0 makes Run record whatever port the listener got.
//   - AdminPort, Context: passed to the Envoy readiness probe.
//   - IPv6: forces the IPv6 loopback and upstream local address.
//   - Probes: extra readiness probes appended after the built-in ones.
//   - EnvoyPrometheusPort: localhost port scraped for Envoy metrics.
//   - FetchDNS: stored on the server as fetchDNS.
//   - NoEnvoy: skips the Envoy readiness probe and the Envoy metrics scrape.
//   - GRPCBootstrap: when non-empty, adds a grpcready probe built from it.
//   - EnableProfiling: registers the /debug/pprof/ handlers.
//   - Shutdown: invoked by the quit handler; TriggerDrain: invoked by the drain handler.
type Options struct {
	// IP of the pod. Note: this is only applicable for Kubernetes pods and should only be used for
	// the prober.
	PodIP string
	// KubeAppProbers is a json with Kubernetes application prober config encoded.
	KubeAppProbers      string
	NodeType            model.NodeType
	StatusPort          uint16
	AdminPort           uint16
	IPv6                bool
	Probes              []ready.Prober
	EnvoyPrometheusPort int
	Context             context.Context
	FetchDNS            func() *dnsProto.NameTable
	NoEnvoy             bool
	GRPCBootstrap       string
	EnableProfiling     bool
	// PrometheusRegistry to use. Just for testing.
	PrometheusRegistry prometheus.Gatherer
	Shutdown           context.CancelFunc
	TriggerDrain       func()
}
   140  
// Server provides an endpoint for handling status probes.
//
// Field notes:
//   - ready: readiness probes evaluated in order by isReady.
//   - prometheus: application scrape configuration; nil when application metrics are not merged.
//   - mutex: taken by handleReadyProbe around lastProbeSuccessful and by Run when it records the
//     port allocated for a zero statusPort.
//   - appProbersDestination: host that application probes are sent to.
//   - appKubeProbers / appProbeClient: rewritten application probes and, for httpGet probes, the
//     cached HTTP client, both keyed by the prober URL path.
//   - lastProbeSuccessful: result of the previous readiness probe, used to log the transition to ready once.
//   - envoyStatsPort: localhost port scraped for Envoy metrics.
//   - upstreamLocalAddress: local address used by the probe dialers.
//   - http: client used to scrape Envoy and application metrics.
//   - registry: source of the agent's own metrics.
//   - shutdown / drain: callbacks run by the quit and drain handlers.
type Server struct {
	ready                 []ready.Prober
	prometheus            *PrometheusScrapeConfiguration
	mutex                 sync.RWMutex
	appProbersDestination string
	appKubeProbers        KubeAppProbers
	appProbeClient        map[string]*http.Client
	statusPort            uint16
	lastProbeSuccessful   bool
	envoyStatsPort        int
	fetchDNS              func() *dnsProto.NameTable
	upstreamLocalAddress  *net.TCPAddr
	config                Options
	http                  *http.Client
	enableProfiling       bool
	registry              prometheus.Gatherer
	shutdown              context.CancelFunc
	drain                 func()
}
   161  
   162  func initializeMonitoring() (prometheus.Gatherer, error) {
   163  	registry := prometheus.NewRegistry()
   164  	wrapped := prometheus.WrapRegistererWithPrefix("istio_agent_", registry)
   165  	wrapped.MustRegister(collectors.NewProcessCollector(collectors.ProcessCollectorOpts{}))
   166  	wrapped.MustRegister(collectors.NewGoCollector())
   167  
   168  	_, err := monitoring.RegisterPrometheusExporter(wrapped, registry)
   169  	if err != nil {
   170  		return nil, fmt.Errorf("could not setup exporter: %v", err)
   171  	}
   172  	return registry, nil
   173  }
   174  
   175  // NewServer creates a new status server.
   176  func NewServer(config Options) (*Server, error) {
   177  	localhost := localHostIPv4
   178  	upstreamLocalAddress := UpstreamLocalAddressIPv4
   179  	if config.IPv6 {
   180  		localhost = localHostIPv6
   181  		upstreamLocalAddress = UpstreamLocalAddressIPv6
   182  	} else {
   183  		// if not ipv6-only, it can be ipv4-only or dual-stack
   184  		// let InstanceIP decide the localhost
   185  		netIP := net.ParseIP(config.PodIP)
   186  		if netIP.To4() == nil && netIP.To16() != nil && !netIP.IsLinkLocalUnicast() {
   187  			localhost = localHostIPv6
   188  			upstreamLocalAddress = UpstreamLocalAddressIPv6
   189  		}
   190  	}
   191  	probes := make([]ready.Prober, 0)
   192  	if !config.NoEnvoy {
   193  		probes = append(probes, &ready.Probe{
   194  			LocalHostAddr: localhost,
   195  			AdminPort:     config.AdminPort,
   196  			Context:       config.Context,
   197  			NoEnvoy:       config.NoEnvoy,
   198  		})
   199  	}
   200  
   201  	if config.GRPCBootstrap != "" {
   202  		probes = append(probes, grpcready.NewProbe(config.GRPCBootstrap))
   203  	}
   204  
   205  	probes = append(probes, config.Probes...)
   206  	registry := config.PrometheusRegistry
   207  	if registry == nil {
   208  		var err error
   209  		registry, err = initializeMonitoring()
   210  		if err != nil {
   211  			return nil, err
   212  		}
   213  	}
   214  	s := &Server{
   215  		statusPort:            config.StatusPort,
   216  		ready:                 probes,
   217  		http:                  &http.Client{},
   218  		appProbersDestination: config.PodIP,
   219  		envoyStatsPort:        config.EnvoyPrometheusPort,
   220  		fetchDNS:              config.FetchDNS,
   221  		upstreamLocalAddress:  upstreamLocalAddress,
   222  		config:                config,
   223  		enableProfiling:       config.EnableProfiling,
   224  		registry:              registry,
   225  		shutdown: func() {
   226  			config.Shutdown()
   227  		},
   228  		drain: config.TriggerDrain,
   229  	}
   230  	if LegacyLocalhostProbeDestination.Get() {
   231  		s.appProbersDestination = "localhost"
   232  	}
   233  
   234  	// Enable prometheus server if its configured and a sidecar
   235  	// Because port 15020 is exposed in the gateway Services, we cannot safely serve this endpoint
   236  	// If we need to do this in the future, we should use envoy to do routing or have another port to make this internal
   237  	// only. For now, its not needed for gateway, as we can just get Envoy stats directly, but if we
   238  	// want to expose istio-agent metrics we may want to revisit this.
   239  	if cfg, f := PrometheusScrapingConfig.Lookup(); config.NodeType == model.SidecarProxy && f {
   240  		var prom PrometheusScrapeConfiguration
   241  		if err := json.Unmarshal([]byte(cfg), &prom); err != nil {
   242  			return nil, fmt.Errorf("failed to unmarshal %s: %v", PrometheusScrapingConfig.Name, err)
   243  		}
   244  		log.Infof("Prometheus scraping configuration: %v", prom)
   245  		if prom.Scrape != "false" {
   246  			s.prometheus = &prom
   247  			if s.prometheus.Path == "" {
   248  				s.prometheus.Path = "/metrics"
   249  			}
   250  			if s.prometheus.Port == "" {
   251  				s.prometheus.Port = "80"
   252  			}
   253  			if s.prometheus.Port == strconv.Itoa(int(config.StatusPort)) {
   254  				return nil, fmt.Errorf("invalid prometheus scrape configuration: "+
   255  					"application port is the same as agent port, which may lead to a recursive loop. "+
   256  					"Ensure pod does not have prometheus.io/port=%d label, or that injection is not happening multiple times", config.StatusPort)
   257  			}
   258  		}
   259  	}
   260  
   261  	if config.KubeAppProbers == "" {
   262  		return s, nil
   263  	}
   264  	if err := json.Unmarshal([]byte(config.KubeAppProbers), &s.appKubeProbers); err != nil {
   265  		return nil, fmt.Errorf("failed to decode app prober err = %v, json string = %v", err, config.KubeAppProbers)
   266  	}
   267  
   268  	s.appProbeClient = make(map[string]*http.Client, len(s.appKubeProbers))
   269  	// Validate the map key matching the regex pattern.
   270  	for path, prober := range s.appKubeProbers {
   271  		err := validateAppKubeProber(path, prober)
   272  		if err != nil {
   273  			return nil, err
   274  		}
   275  		if prober.HTTPGet != nil {
   276  			d := ProbeDialer()
   277  			d.LocalAddr = s.upstreamLocalAddress
   278  			// nolint: gosec
   279  			// This is matching Kubernetes. It is a reasonable usage of this, as it is just a health check over localhost.
   280  			transport, err := setTransportDefaults(&http.Transport{
   281  				TLSClientConfig: &tls.Config{InsecureSkipVerify: true},
   282  				DialContext:     d.DialContext,
   283  				// https://github.com/kubernetes/kubernetes/blob/0153febd9f0098d4b8d0d484927710eaf899ef40/pkg/probe/http/http.go#L55
   284  				// Match Kubernetes logic. This also ensures idle timeouts do not trigger probe failures
   285  				DisableKeepAlives: !ProbeKeepaliveConnections,
   286  			})
   287  			if err != nil {
   288  				return nil, err
   289  			}
   290  			// Construct a http client and cache it in order to reuse the connection.
   291  			s.appProbeClient[path] = &http.Client{
   292  				Timeout: time.Duration(prober.TimeoutSeconds) * time.Second,
   293  				// We skip the verification since kubelet skips the verification for HTTPS prober as well
   294  				// https://kubernetes.io/docs/tasks/configure-pod-container/configure-liveness-readiness-probes/#configure-probes
   295  				Transport:     transport,
   296  				CheckRedirect: redirectChecker(),
   297  			}
   298  		}
   299  	}
   300  
   301  	return s, nil
   302  }
   303  
   304  // Copies logic from https://github.com/kubernetes/kubernetes/blob/b152001f459/pkg/probe/http/http.go#L129-L130
   305  func isRedirect(code int) bool {
   306  	return code >= http.StatusMultipleChoices && code < http.StatusBadRequest
   307  }
   308  
   309  // Using the same redirect logic that kubelet does: https://github.com/kubernetes/kubernetes/blob/b152001f459/pkg/probe/http/http.go#L141
   310  // This means that:
   311  // * If we exceed 10 redirects, the probe fails
   312  // * If we redirect somewhere external, the probe succeeds (https://github.com/kubernetes/kubernetes/blob/b152001f459/pkg/probe/http/http.go#L130)
   313  // * If we redirect to the same address, the probe will follow the redirect
   314  func redirectChecker() func(*http.Request, []*http.Request) error {
   315  	return func(req *http.Request, via []*http.Request) error {
   316  		if req.URL.Hostname() != via[0].URL.Hostname() {
   317  			return http.ErrUseLastResponse
   318  		}
   319  		// Default behavior: stop after 10 redirects.
   320  		if len(via) >= 10 {
   321  			return errors.New("stopped after 10 redirects")
   322  		}
   323  		return nil
   324  	}
   325  }
   326  
   327  func validateAppKubeProber(path string, prober *Prober) error {
   328  	if !appProberPattern.MatchString(path) {
   329  		return fmt.Errorf(`invalid path, must be in form of regex pattern %v`, appProberPattern)
   330  	}
   331  	count := 0
   332  	if prober.HTTPGet != nil {
   333  		count++
   334  	}
   335  	if prober.TCPSocket != nil {
   336  		count++
   337  	}
   338  	if prober.GRPC != nil {
   339  		count++
   340  	}
   341  	if count != 1 {
   342  		return fmt.Errorf(`invalid prober type, must be one of type httpGet, tcpSocket or gRPC`)
   343  	}
   344  	if prober.HTTPGet != nil && prober.HTTPGet.Port.Type != intstr.Int {
   345  		return fmt.Errorf("invalid prober config for %v, the port must be int type", path)
   346  	}
   347  	if prober.TCPSocket != nil && prober.TCPSocket.Port.Type != intstr.Int {
   348  		return fmt.Errorf("invalid prober config for %v, the port must be int type", path)
   349  	}
   350  	return nil
   351  }
   352  
   353  // FormatProberURL returns a set of HTTP URLs that pilot agent will serve to take over Kubernetes
   354  // app probers.
   355  func FormatProberURL(container string) (string, string, string) {
   356  	return fmt.Sprintf("/app-health/%v/readyz", container),
   357  		fmt.Sprintf("/app-health/%v/livez", container),
   358  		fmt.Sprintf("/app-health/%v/startupz", container)
   359  }
   360  
   361  // Run opens a the status port and begins accepting probes.
   362  func (s *Server) Run(ctx context.Context) {
   363  	log.Infof("Opening status port %d", s.statusPort)
   364  
   365  	mux := http.NewServeMux()
   366  
   367  	// Add the handler for ready probes.
   368  	mux.HandleFunc(readyPath, s.handleReadyProbe)
   369  	// Default path for prom
   370  	mux.HandleFunc(`/metrics`, s.handleStats)
   371  	// Envoy uses something else - and original agent used the same.
   372  	// Keep for backward compat with configs.
   373  	mux.HandleFunc(`/stats/prometheus`, s.handleStats)
   374  	mux.HandleFunc(quitPath, s.handleQuit)
   375  	mux.HandleFunc(drainPath, s.handleDrain)
   376  	mux.HandleFunc("/app-health/", s.handleAppProbe)
   377  
   378  	if s.enableProfiling {
   379  		// Add the handler for pprof.
   380  		mux.HandleFunc("/debug/pprof/", s.handlePprofIndex)
   381  		mux.HandleFunc("/debug/pprof/cmdline", s.handlePprofCmdline)
   382  		mux.HandleFunc("/debug/pprof/profile", s.handlePprofProfile)
   383  		mux.HandleFunc("/debug/pprof/symbol", s.handlePprofSymbol)
   384  		mux.HandleFunc("/debug/pprof/trace", s.handlePprofTrace)
   385  	}
   386  	mux.HandleFunc("/debug/ndsz", s.handleNdsz)
   387  
   388  	l, err := net.Listen("tcp", fmt.Sprintf(":%d", s.statusPort))
   389  	if err != nil {
   390  		log.Errorf("Error listening on status port: %v", err.Error())
   391  		return
   392  	}
   393  	// for testing.
   394  	if s.statusPort == 0 {
   395  		_, hostPort, _ := net.SplitHostPort(l.Addr().String())
   396  		allocatedPort, _ := strconv.Atoi(hostPort)
   397  		s.mutex.Lock()
   398  		s.statusPort = uint16(allocatedPort)
   399  		s.mutex.Unlock()
   400  	}
   401  	defer l.Close()
   402  
   403  	go func() {
   404  		if err := http.Serve(l, mux); err != nil {
   405  			if network.IsUnexpectedListenerError(err) {
   406  				log.Error(err)
   407  			}
   408  			select {
   409  			case <-ctx.Done():
   410  				// We are shutting down already, don't trigger SIGTERM
   411  				return
   412  			default:
   413  				// If the server errors then pilot-agent can never pass readiness or liveness probes
   414  				// Therefore, trigger graceful termination by sending SIGTERM to the binary pid
   415  				notifyExit()
   416  			}
   417  		}
   418  	}()
   419  
   420  	// Wait for the agent to be shut down.
   421  	<-ctx.Done()
   422  	log.Info("Status server has successfully terminated")
   423  }
   424  
   425  func (s *Server) handlePprofIndex(w http.ResponseWriter, r *http.Request) {
   426  	if !istioNetUtil.IsRequestFromLocalhost(r) {
   427  		http.Error(w, "Only requests from localhost are allowed", http.StatusForbidden)
   428  		return
   429  	}
   430  
   431  	pprof.Index(w, r)
   432  }
   433  
   434  func (s *Server) handlePprofCmdline(w http.ResponseWriter, r *http.Request) {
   435  	if !istioNetUtil.IsRequestFromLocalhost(r) {
   436  		http.Error(w, "Only requests from localhost are allowed", http.StatusForbidden)
   437  		return
   438  	}
   439  
   440  	pprof.Cmdline(w, r)
   441  }
   442  
   443  func (s *Server) handlePprofSymbol(w http.ResponseWriter, r *http.Request) {
   444  	if !istioNetUtil.IsRequestFromLocalhost(r) {
   445  		http.Error(w, "Only requests from localhost are allowed", http.StatusForbidden)
   446  		return
   447  	}
   448  
   449  	pprof.Symbol(w, r)
   450  }
   451  
   452  func (s *Server) handlePprofProfile(w http.ResponseWriter, r *http.Request) {
   453  	if !istioNetUtil.IsRequestFromLocalhost(r) {
   454  		http.Error(w, "Only requests from localhost are allowed", http.StatusForbidden)
   455  		return
   456  	}
   457  
   458  	pprof.Profile(w, r)
   459  }
   460  
   461  func (s *Server) handlePprofTrace(w http.ResponseWriter, r *http.Request) {
   462  	if !istioNetUtil.IsRequestFromLocalhost(r) {
   463  		http.Error(w, "Only requests from localhost are allowed", http.StatusForbidden)
   464  		return
   465  	}
   466  
   467  	pprof.Trace(w, r)
   468  }
   469  
   470  func (s *Server) handleReadyProbe(w http.ResponseWriter, _ *http.Request) {
   471  	err := s.isReady()
   472  	s.mutex.Lock()
   473  	if err != nil {
   474  		w.WriteHeader(http.StatusServiceUnavailable)
   475  
   476  		log.Warnf("Envoy proxy is NOT ready: %s", err.Error())
   477  		s.lastProbeSuccessful = false
   478  	} else {
   479  		w.WriteHeader(http.StatusOK)
   480  
   481  		if !s.lastProbeSuccessful {
   482  			log.Info("Envoy proxy is ready")
   483  		}
   484  		s.lastProbeSuccessful = true
   485  	}
   486  	s.mutex.Unlock()
   487  }
   488  
   489  func (s *Server) isReady() error {
   490  	for _, p := range s.ready {
   491  		if err := p.Check(); err != nil {
   492  			return err
   493  		}
   494  	}
   495  	return nil
   496  }
   497  
// PrometheusScrapeConfiguration describes how to scrape the application's own metrics.
// NewServer decodes it from the JSON value of PrometheusScrapingConfig. All fields are strings:
// Scrape disables application scraping when it is exactly "false", Path defaults to "/metrics"
// and Port defaults to "80" when empty.
type PrometheusScrapeConfiguration struct {
	Scrape string `json:"scrape"`
	Path   string `json:"path"`
	Port   string `json:"port"`
}
   503  
   504  // handleStats handles prometheus stats scraping. This will scrape envoy metrics, and, if configured,
   505  // the application metrics and merge them together.
   506  // The merge here is a simple string concatenation. This works for almost all cases, assuming the application
   507  // is not exposing the same metrics as Envoy.
   508  // This merging works for both FmtText and FmtOpenMetrics and will use the format of the application metrics
   509  // Note that we do not return any errors here. If we do, we will drop metrics. For example, the app may be having issues,
   510  // but we still want Envoy metrics. Instead, errors are tracked in the failed scrape metrics/logs.
   511  func (s *Server) handleStats(w http.ResponseWriter, r *http.Request) {
   512  	if commonFeatures.MetricsLocalhostAccessOnly && !istioNetUtil.IsRequestFromLocalhost(r) {
   513  		http.Error(w, "Only requests from localhost are allowed", http.StatusForbidden)
   514  		return
   515  	}
   516  	metrics.ScrapeTotals.Increment()
   517  	var err error
   518  	var envoy, application io.ReadCloser
   519  	var envoyCancel, appCancel context.CancelFunc
   520  	defer func() {
   521  		if envoy != nil {
   522  			err = envoy.Close()
   523  			if err != nil {
   524  				log.Infof("envoy connection is not closed: %v", err)
   525  			}
   526  		}
   527  		if application != nil {
   528  			err = application.Close()
   529  			if err != nil {
   530  				log.Infof("app connection is not closed: %v", err)
   531  			}
   532  		}
   533  		if envoyCancel != nil {
   534  			envoyCancel()
   535  		}
   536  		if appCancel != nil {
   537  			appCancel()
   538  		}
   539  	}()
   540  
   541  	// Gather all the metrics we will merge
   542  	if !s.config.NoEnvoy {
   543  		if envoy, envoyCancel, _, err = s.scrape(fmt.Sprintf("http://localhost:%d/stats/prometheus", s.envoyStatsPort), r.Header); err != nil {
   544  			log.Errorf("failed scraping envoy metrics: %v", err)
   545  			metrics.EnvoyScrapeErrors.Increment()
   546  		}
   547  	}
   548  
   549  	// Scrape app metrics if defined and capture their format
   550  	var format expfmt.Format
   551  	if s.prometheus != nil {
   552  		var contentType string
   553  		url := fmt.Sprintf("http://localhost:%s%s", s.prometheus.Port, s.prometheus.Path)
   554  		if application, appCancel, contentType, err = s.scrape(url, r.Header); err != nil {
   555  			log.Errorf("failed scraping application metrics: %v", err)
   556  			metrics.AppScrapeErrors.Increment()
   557  		}
   558  		format = negotiateMetricsFormat(contentType)
   559  	} else {
   560  		// Without app metrics format use a default
   561  		format = FmtText
   562  	}
   563  
   564  	w.Header().Set("Content-Type", string(format))
   565  
   566  	// Write out the metrics
   567  	if err = scrapeAndWriteAgentMetrics(s.registry, io.Writer(w)); err != nil {
   568  		log.Errorf("failed scraping and writing agent metrics: %v", err)
   569  		metrics.AgentScrapeErrors.Increment()
   570  	}
   571  
   572  	if envoy != nil {
   573  		_, err = io.Copy(w, envoy)
   574  		if err != nil {
   575  			log.Errorf("failed to scraping and writing envoy metrics: %v", err)
   576  			metrics.EnvoyScrapeErrors.Increment()
   577  		}
   578  	}
   579  
   580  	// App metrics must go last because if they are FmtOpenMetrics,
   581  	// they will have a trailing "# EOF" which terminates the full exposition
   582  	if application != nil {
   583  		_, err = io.Copy(w, application)
   584  		if err != nil {
   585  			log.Errorf("failed to scraping and writing application metrics: %v", err)
   586  			metrics.AppScrapeErrors.Increment()
   587  		}
   588  	}
   589  }
   590  
// Content-Type values the status server can advertise for the merged metrics response.
// negotiateMetricsFormat picks one of them and handleStats writes it to the "Content-Type" header.
const (
	// nolint: revive, stylecheck
	FmtOpenMetrics_0_0_1 = expfmt.OpenMetricsType + `; version=` + expfmt.OpenMetricsVersion_0_0_1 + `; charset=utf-8`
	// nolint: revive, stylecheck
	FmtOpenMetrics_1_0_0 = expfmt.OpenMetricsType + `; version=` + expfmt.OpenMetricsVersion_1_0_0 + `; charset=utf-8`
	FmtText              = `text/plain; version=` + expfmt.TextVersion + `; charset=utf-8`
)
   598  
   599  func negotiateMetricsFormat(contentType string) expfmt.Format {
   600  	mediaType, params, err := mime.ParseMediaType(contentType)
   601  	if err == nil && mediaType == expfmt.OpenMetricsType {
   602  		switch params["version"] {
   603  		case expfmt.OpenMetricsVersion_1_0_0:
   604  			return FmtOpenMetrics_1_0_0
   605  		case expfmt.OpenMetricsVersion_0_0_1, "":
   606  			return FmtOpenMetrics_0_0_1
   607  		}
   608  	}
   609  	return FmtText
   610  }
   611  
   612  func scrapeAndWriteAgentMetrics(registry prometheus.Gatherer, w io.Writer) error {
   613  	mfs, err := registry.Gather()
   614  	enc := expfmt.NewEncoder(w, FmtText)
   615  	if err != nil {
   616  		return err
   617  	}
   618  	for _, mf := range mfs {
   619  		if err := enc.Encode(mf); err != nil {
   620  			return err
   621  		}
   622  	}
   623  	return nil
   624  }
   625  
   626  func applyHeaders(into http.Header, from http.Header, keys ...string) {
   627  	for _, key := range keys {
   628  		val := from.Get(key)
   629  		if val != "" {
   630  			into.Set(key, val)
   631  		}
   632  	}
   633  }
   634  
   635  // getHeaderTimeout parse a string like (1.234) representing number of seconds
   636  func getHeaderTimeout(timeout string) (time.Duration, error) {
   637  	timeoutSeconds, err := strconv.ParseFloat(timeout, 64)
   638  	if err != nil {
   639  		return 0 * time.Second, err
   640  	}
   641  
   642  	return time.Duration(timeoutSeconds * 1e9), nil
   643  }
   644  
   645  // scrape will send a request to the provided url to scrape metrics from
   646  // This will attempt to mimic some of Prometheus functionality by passing some of the headers through
   647  // such as accept, timeout, and user agent
   648  // Returns the scraped metrics reader as well as the response's "Content-Type" header to determine the metrics format
   649  func (s *Server) scrape(url string, header http.Header) (io.ReadCloser, context.CancelFunc, string, error) {
   650  	var cancel context.CancelFunc
   651  	ctx := context.Background()
   652  	if timeoutString := header.Get("X-Prometheus-Scrape-Timeout-Seconds"); timeoutString != "" {
   653  		timeout, err := getHeaderTimeout(timeoutString)
   654  		if err != nil {
   655  			log.Warnf("Failed to parse timeout header %v: %v", timeoutString, err)
   656  		} else {
   657  			ctx, cancel = context.WithTimeout(ctx, timeout)
   658  		}
   659  	}
   660  	req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
   661  	if err != nil {
   662  		return nil, cancel, "", err
   663  	}
   664  	applyHeaders(req.Header, header, "Accept",
   665  		"User-Agent",
   666  		"X-Prometheus-Scrape-Timeout-Seconds",
   667  	)
   668  
   669  	resp, err := s.http.Do(req)
   670  	if err != nil {
   671  		return nil, cancel, "", fmt.Errorf("error scraping %s: %v", url, err)
   672  	}
   673  	if resp.StatusCode != http.StatusOK {
   674  		resp.Body.Close()
   675  		return nil, cancel, "", fmt.Errorf("error scraping %s, status code: %v", url, resp.StatusCode)
   676  	}
   677  	format := resp.Header.Get("Content-Type")
   678  	return resp.Body, cancel, format, nil
   679  }
   680  
   681  func (s *Server) handleQuit(w http.ResponseWriter, r *http.Request) {
   682  	if !istioNetUtil.IsRequestFromLocalhost(r) {
   683  		http.Error(w, "Only requests from localhost are allowed", http.StatusForbidden)
   684  		return
   685  	}
   686  	if r.Method != http.MethodPost {
   687  		http.Error(w, "Method Not Allowed", http.StatusMethodNotAllowed)
   688  		return
   689  	}
   690  	w.WriteHeader(http.StatusOK)
   691  	_, _ = w.Write([]byte("OK"))
   692  	log.Infof("handling %s, notifying pilot-agent to exit", quitPath)
   693  	s.shutdown()
   694  }
   695  
   696  func (s *Server) handleDrain(w http.ResponseWriter, r *http.Request) {
   697  	if !istioNetUtil.IsRequestFromLocalhost(r) {
   698  		http.Error(w, "Only requests from localhost are allowed", http.StatusForbidden)
   699  		return
   700  	}
   701  	if r.Method != http.MethodPost {
   702  		http.Error(w, "Method Not Allowed", http.StatusMethodNotAllowed)
   703  		return
   704  	}
   705  	w.WriteHeader(http.StatusOK)
   706  	_, _ = w.Write([]byte("OK"))
   707  	log.Infof("handling %s, starting drain", drainPath)
   708  	s.drain()
   709  }
   710  
   711  func (s *Server) handleAppProbe(w http.ResponseWriter, req *http.Request) {
   712  	// Validate the request first.
   713  	path := req.URL.Path
   714  	if !strings.HasPrefix(path, "/") {
   715  		path = "/" + req.URL.Path
   716  	}
   717  	prober, exists := s.appKubeProbers[path]
   718  	if !exists {
   719  		log.Errorf("Prober does not exists url %v", path)
   720  		w.WriteHeader(http.StatusBadRequest)
   721  		_, _ = w.Write([]byte(fmt.Sprintf("app prober config does not exists for %v", path)))
   722  		return
   723  	}
   724  
   725  	switch {
   726  	case prober.HTTPGet != nil:
   727  		s.handleAppProbeHTTPGet(w, req, prober, path)
   728  	case prober.TCPSocket != nil:
   729  		s.handleAppProbeTCPSocket(w, prober)
   730  	case prober.GRPC != nil:
   731  		s.handleAppProbeGRPC(w, req, prober)
   732  	}
   733  }
   734  
   735  func (s *Server) handleAppProbeHTTPGet(w http.ResponseWriter, req *http.Request, prober *Prober, path string) {
   736  	proberPath := prober.HTTPGet.Path
   737  	if !strings.HasPrefix(proberPath, "/") {
   738  		proberPath = "/" + proberPath
   739  	}
   740  	var url string
   741  
   742  	hostPort := net.JoinHostPort(s.appProbersDestination, strconv.Itoa(prober.HTTPGet.Port.IntValue()))
   743  	if prober.HTTPGet.Scheme == apimirror.URISchemeHTTPS {
   744  		url = fmt.Sprintf("https://%s%s", hostPort, proberPath)
   745  	} else {
   746  		url = fmt.Sprintf("http://%s%s", hostPort, proberPath)
   747  	}
   748  	appReq, err := http.NewRequest(http.MethodGet, url, nil)
   749  	if err != nil {
   750  		log.Errorf("Failed to create request to probe app %v, original url %v", err, path)
   751  		w.WriteHeader(http.StatusInternalServerError)
   752  		return
   753  	}
   754  
   755  	appReq.Host = req.Host
   756  	if host, port, err := net.SplitHostPort(req.Host); err == nil {
   757  		port, _ := strconv.Atoi(port)
   758  		// the port is same as the status port, then we need to replace the port in the host with the real one
   759  		if port == int(s.statusPort) {
   760  			realPort := strconv.Itoa(prober.HTTPGet.Port.IntValue())
   761  			appReq.Host = net.JoinHostPort(host, realPort)
   762  		}
   763  	}
   764  	// Forward incoming headers to the application.
   765  	for name, values := range req.Header {
   766  		appReq.Header[name] = slices.Clone(values)
   767  		if len(values) > 0 && (name == "Host") {
   768  			// Probe has specific host header override; honor it
   769  			appReq.Host = values[0]
   770  		}
   771  	}
   772  
   773  	// get the http client must exist because
   774  	httpClient := s.appProbeClient[path]
   775  
   776  	// Send the request.
   777  	response, err := httpClient.Do(appReq)
   778  	if err != nil {
   779  		log.Errorf("Request to probe app failed: %v, original URL path = %v\napp URL path = %v", err, path, proberPath)
   780  		w.WriteHeader(http.StatusInternalServerError)
   781  		return
   782  	}
   783  	defer func() {
   784  		// Drain and close the body to let the Transport reuse the connection
   785  		_, _ = io.Copy(io.Discard, response.Body)
   786  		_ = response.Body.Close()
   787  	}()
   788  
   789  	if isRedirect(response.StatusCode) { // Redirect
   790  		// In other cases, we return the original status code. For redirects, it is illegal to
   791  		// not have Location header, so we need to switch to just 200.
   792  		w.WriteHeader(http.StatusOK)
   793  		return
   794  	}
   795  	// We only write the status code to the response.
   796  	w.WriteHeader(response.StatusCode)
   797  	// Return the body from probe as well
   798  	b, _ := k8sUtilIo.ReadAtMost(response.Body, maxRespBodyLength)
   799  	_, _ = w.Write(b)
   800  }
   801  
   802  func (s *Server) handleAppProbeTCPSocket(w http.ResponseWriter, prober *Prober) {
   803  	timeout := time.Duration(prober.TimeoutSeconds) * time.Second
   804  
   805  	d := ProbeDialer()
   806  	d.LocalAddr = s.upstreamLocalAddress
   807  	d.Timeout = timeout
   808  
   809  	conn, err := d.Dial("tcp", net.JoinHostPort(s.appProbersDestination, strconv.Itoa(prober.TCPSocket.Port.IntValue())))
   810  	if err != nil {
   811  		w.WriteHeader(http.StatusInternalServerError)
   812  	} else {
   813  		w.WriteHeader(http.StatusOK)
   814  		err = conn.Close()
   815  		if err != nil {
   816  			log.Infof("tcp connection is not closed: %v", err)
   817  		}
   818  	}
   819  }
   820  
   821  func (s *Server) handleAppProbeGRPC(w http.ResponseWriter, req *http.Request, prober *Prober) {
   822  	timeout := time.Duration(prober.TimeoutSeconds) * time.Second
   823  	// the DialOptions are referenced from https://github.com/kubernetes/kubernetes/blob/v1.23.1/pkg/probe/grpc/grpc.go#L55-L59
   824  	opts := []grpc.DialOption{
   825  		grpc.WithBlock(),
   826  		grpc.WithTransportCredentials(insecure.NewCredentials()), // credentials are currently not supported
   827  		grpc.WithContextDialer(func(ctx context.Context, addr string) (net.Conn, error) {
   828  			d := ProbeDialer()
   829  			d.LocalAddr = s.upstreamLocalAddress
   830  			d.Timeout = timeout
   831  			return d.DialContext(ctx, "tcp", addr)
   832  		}),
   833  	}
   834  	if userAgent := req.Header["User-Agent"]; len(userAgent) > 0 {
   835  		// simulate kubelet
   836  		// please refer to:
   837  		// https://github.com/kubernetes/kubernetes/blob/v1.23.1/pkg/probe/grpc/grpc.go#L56
   838  		// https://github.com/kubernetes/kubernetes/blob/v1.23.1/pkg/probe/http/http.go#L103
   839  		opts = append(opts, grpc.WithUserAgent(userAgent[0]))
   840  	}
   841  
   842  	ctx, cancel := context.WithTimeout(context.Background(), timeout)
   843  	defer cancel()
   844  
   845  	addr := net.JoinHostPort(s.appProbersDestination, strconv.Itoa(int(prober.GRPC.Port)))
   846  	conn, err := grpc.DialContext(ctx, addr, opts...)
   847  	if err != nil {
   848  		log.Errorf("Failed to create grpc connection to probe app: %v", err)
   849  		w.WriteHeader(http.StatusInternalServerError)
   850  		return
   851  	}
   852  	defer conn.Close()
   853  
   854  	var svc string
   855  	if prober.GRPC.Service != nil {
   856  		svc = *prober.GRPC.Service
   857  	}
   858  	grpcClient := grpcHealth.NewHealthClient(conn)
   859  	resp, err := grpcClient.Check(ctx, &grpcHealth.HealthCheckRequest{
   860  		Service: svc,
   861  	})
   862  	// the error handling is referenced from https://github.com/kubernetes/kubernetes/blob/v1.23.1/pkg/probe/grpc/grpc.go#L88-L106
   863  	if err != nil {
   864  		status, ok := grpcStatus.FromError(err)
   865  		if ok {
   866  			switch status.Code() {
   867  			case codes.Unimplemented:
   868  				log.Errorf("server does not implement the grpc health protocol (grpc.health.v1.Health): %v", err)
   869  			case codes.DeadlineExceeded:
   870  				log.Errorf("grpc request not finished within timeout: %v", err)
   871  			default:
   872  				log.Errorf("grpc probe failed: %v", err)
   873  			}
   874  		} else {
   875  			log.Errorf("grpc probe failed: %v", err)
   876  		}
   877  		w.WriteHeader(http.StatusInternalServerError)
   878  		return
   879  	}
   880  
   881  	if resp.GetStatus() == grpcHealth.HealthCheckResponse_SERVING {
   882  		w.WriteHeader(http.StatusOK)
   883  		return
   884  	}
   885  	w.WriteHeader(http.StatusInternalServerError)
   886  }
   887  
   888  func (s *Server) handleNdsz(w http.ResponseWriter, r *http.Request) {
   889  	if !istioNetUtil.IsRequestFromLocalhost(r) {
   890  		http.Error(w, "Only requests from localhost are allowed", http.StatusForbidden)
   891  		return
   892  	}
   893  	nametable := s.fetchDNS()
   894  	if nametable == nil {
   895  		// See https://golang.org/doc/faq#nil_error for why writeJSONProto cannot handle this
   896  		w.WriteHeader(http.StatusNotFound)
   897  		_, _ = w.Write([]byte(`{}`))
   898  		return
   899  	}
   900  	writeJSONProto(w, nametable)
   901  }
   902  
   903  // writeJSONProto writes a protobuf to a json payload, handling content type, marshaling, and errors
   904  func writeJSONProto(w http.ResponseWriter, obj any) {
   905  	w.Header().Set("Content-Type", "application/json")
   906  	b, err := config.ToJSON(obj)
   907  	if err != nil {
   908  		w.WriteHeader(http.StatusInternalServerError)
   909  		_, _ = w.Write([]byte(err.Error()))
   910  		return
   911  	}
   912  	_, err = w.Write(b)
   913  	if err != nil {
   914  		w.WriteHeader(http.StatusInternalServerError)
   915  	}
   916  }
   917  
   918  // notifyExit sends SIGTERM to itself
   919  func notifyExit() {
   920  	p, err := os.FindProcess(os.Getpid())
   921  	if err != nil {
   922  		log.Error(err)
   923  	}
   924  	if err := p.Signal(syscall.SIGTERM); err != nil {
   925  		log.Errorf("failed to send SIGTERM to self: %v", err)
   926  	}
   927  }
   928  
// defaultTransport is http.DefaultTransport viewed as a *http.Transport.
// setTransportDefaults reads its TLSHandshakeTimeout and IdleConnTimeout as
// fallback values. The unchecked type assertion panics at package
// initialisation if http.DefaultTransport is not a *http.Transport.
var defaultTransport = http.DefaultTransport.(*http.Transport)
   930  
   931  // SetTransportDefaults mirrors Kubernetes probe settings
   932  // https://github.com/kubernetes/kubernetes/blob/0153febd9f0098d4b8d0d484927710eaf899ef40/pkg/probe/http/http.go#L52
   933  func setTransportDefaults(t *http.Transport) (*http.Transport, error) {
   934  	if !EnableHTTP2Probing {
   935  		return t, nil
   936  	}
   937  	if t.TLSHandshakeTimeout == 0 {
   938  		t.TLSHandshakeTimeout = defaultTransport.TLSHandshakeTimeout
   939  	}
   940  	if t.IdleConnTimeout == 0 {
   941  		t.IdleConnTimeout = defaultTransport.IdleConnTimeout
   942  	}
   943  	t2, err := http2.ConfigureTransports(t)
   944  	if err != nil {
   945  		return nil, err
   946  	}
   947  	t2.ReadIdleTimeout = time.Duration(30) * time.Second
   948  	t2.PingTimeout = time.Duration(15) * time.Second
   949  	return t, nil
   950  }