istio.io/istio@v0.0.0-20240520182934-d79c90f27776/pilot/cmd/pilot-agent/status/server.go (about) 1 // Copyright Istio Authors 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package status 16 17 import ( 18 "context" 19 "crypto/tls" 20 "encoding/json" 21 "errors" 22 "fmt" 23 "io" 24 "mime" 25 "net" 26 "net/http" 27 "net/http/pprof" 28 "os" 29 "regexp" 30 "strconv" 31 "strings" 32 "sync" 33 "syscall" 34 "time" 35 36 "github.com/prometheus/client_golang/prometheus" 37 "github.com/prometheus/client_golang/prometheus/collectors" 38 "github.com/prometheus/common/expfmt" 39 "golang.org/x/net/http2" 40 "google.golang.org/grpc" 41 "google.golang.org/grpc/codes" 42 "google.golang.org/grpc/credentials/insecure" 43 grpcHealth "google.golang.org/grpc/health/grpc_health_v1" 44 grpcStatus "google.golang.org/grpc/status" 45 "k8s.io/apimachinery/pkg/util/intstr" 46 k8sUtilIo "k8s.io/utils/io" 47 48 "istio.io/istio/pilot/cmd/pilot-agent/metrics" 49 "istio.io/istio/pilot/cmd/pilot-agent/status/grpcready" 50 "istio.io/istio/pilot/cmd/pilot-agent/status/ready" 51 "istio.io/istio/pkg/config" 52 dnsProto "istio.io/istio/pkg/dns/proto" 53 "istio.io/istio/pkg/env" 54 commonFeatures "istio.io/istio/pkg/features" 55 "istio.io/istio/pkg/kube/apimirror" 56 "istio.io/istio/pkg/log" 57 "istio.io/istio/pkg/model" 58 "istio.io/istio/pkg/monitoring" 59 "istio.io/istio/pkg/network" 60 "istio.io/istio/pkg/slices" 61 istioNetUtil "istio.io/istio/pkg/util/net" 62 ) 63 64 
const (
	// readyPath is for the pilot agent readiness itself.
	readyPath = "/healthz/ready"
	// quitPath is to notify the pilot agent to quit.
	quitPath = "/quitquitquit"
	// drainPath is the endpoint that triggers the drain callback of the status server.
	drainPath = "/drain"
	// KubeAppProberEnvName is the name of the environment variable used to pass the app prober
	// config to pilot agent.
	// The value is a json encoded string carrying app probe information from the injector (istioctl or webhook).
	// For example, ISTIO_KUBE_APP_PROBERS='{"/app-health/httpbin/livez":{"httpGet":{"path": "/hello", "port": 8080}}}'
	// indicates that httpbin container liveness prober port is 8080 and probing path is /hello.
	// This environment variable should never be set manually.
	KubeAppProberEnvName = "ISTIO_KUBE_APP_PROBERS"

	// Loopback addresses used to reach Envoy's admin endpoint for readiness checks.
	localHostIPv4 = "127.0.0.1"
	localHostIPv6 = "::1"
	// maxRespBodyLength caps how many bytes of an application's HTTP probe response body
	// are relayed back to the caller (10 KiB).
	maxRespBodyLength = 10 * 1 << 10
)

// UpstreamLocalAddressIPv4 and UpstreamLocalAddressIPv6 are the local (source) addresses
// the status server binds to when dialing the application for rewritten probes.
var (
	UpstreamLocalAddressIPv4 = &net.TCPAddr{IP: net.ParseIP("127.0.0.6")}
	UpstreamLocalAddressIPv6 = &net.TCPAddr{IP: net.ParseIP("::6")}
)

// PrometheusScrapingConfig holds the JSON encoded PrometheusScrapeConfiguration of the application.
// When it is set and the proxy is a sidecar, application metrics are merged into the agent's
// metrics endpoint.
var PrometheusScrapingConfig = env.Register("ISTIO_PROMETHEUS_ANNOTATIONS", "", "")

var (
	// appProberPattern is the only URL path shape accepted as a key of KubeAppProbers.
	appProberPattern = regexp.MustCompile(`^/app-health/[^/]+/(livez|readyz|startupz)$`)

	// EnableHTTP2Probing controls whether probe transports are configured for HTTP/2.
	EnableHTTP2Probing = env.Register("ISTIO_ENABLE_HTTP2_PROBING", true,
		"If enabled, HTTP2 probes will be enabled for HTTPS probes, following Kubernetes").Get()

	// LegacyLocalhostProbeDestination makes rewritten probes target "localhost" instead of the pod IP.
	LegacyLocalhostProbeDestination = env.Register("REWRITE_PROBE_LEGACY_LOCALHOST_DESTINATION", false,
		"If enabled, readiness probes will be sent to 'localhost'. Otherwise, they will be sent to the Pod's IP, matching Kubernetes' behavior.")

	// ProbeKeepaliveConnections controls whether HTTP probe connections to the application are kept alive.
	ProbeKeepaliveConnections = env.Register("ENABLE_PROBE_KEEPALIVE_CONNECTIONS", false,
		"If enabled, readiness probes will keep the connection from pilot-agent to the application alive. "+
			"This mirrors older Istio versions' behaviors, but not kubelet's.").Get()
)

// KubeAppProbers holds the information about a Kubernetes pod prober.
// It's a map from the prober URL path to the Kubernetes Prober config.
// For example, "/app-health/hello-world/livez" entry contains liveness prober config for
// container "hello-world".
type KubeAppProbers map[string]*Prober

// Prober represents a single container prober.
// Exactly one of HTTPGet, TCPSocket or GRPC is expected to be set.
type Prober struct {
	HTTPGet   *apimirror.HTTPGetAction   `json:"httpGet,omitempty"`
	TCPSocket *apimirror.TCPSocketAction `json:"tcpSocket,omitempty"`
	GRPC      *apimirror.GRPCAction      `json:"grpc,omitempty"`
	// TimeoutSeconds bounds a single probe attempt against the application.
	TimeoutSeconds int32 `json:"timeoutSeconds,omitempty"`
}

// Options for the status server.
type Options struct {
	// Ip of the pod. Note: this is only applicable for Kubernetes pods and should only be used for
	// the prober.
	PodIP string
	// KubeAppProbers is a json with Kubernetes application prober config encoded.
	KubeAppProbers string
	// NodeType is the proxy type; application metrics merging is only enabled for sidecars.
	NodeType model.NodeType
	// StatusPort is the port the status server listens on.
	StatusPort uint16
	// AdminPort is the Envoy admin port handed to the Envoy readiness probe.
	AdminPort uint16
	// IPv6 selects the IPv6 loopback and upstream local address.
	IPv6 bool
	// Probes are additional readiness probers evaluated by the ready endpoint.
	Probes []ready.Prober
	// EnvoyPrometheusPort is the local port Envoy stats are scraped from.
	EnvoyPrometheusPort int
	// Context is passed to the Envoy readiness probe.
	Context context.Context
	// FetchDNS returns the DNS name table served by the /debug/ndsz endpoint.
	FetchDNS func() *dnsProto.NameTable
	// NoEnvoy disables the Envoy readiness probe and the Envoy metrics scrape.
	NoEnvoy bool
	// GRPCBootstrap, when non-empty, adds a gRPC readiness probe built from this value.
	GRPCBootstrap string
	// EnableProfiling registers the pprof debug handlers.
	EnableProfiling bool
	// PrometheusRegistry to use. Just for testing.
	PrometheusRegistry prometheus.Gatherer
	// Shutdown is invoked when the quit endpoint is called.
	Shutdown context.CancelFunc
	// TriggerDrain is invoked when the drain endpoint is called.
	TriggerDrain func()
}

// Server provides an endpoint for handling status probes.
142 type Server struct { 143 ready []ready.Prober 144 prometheus *PrometheusScrapeConfiguration 145 mutex sync.RWMutex 146 appProbersDestination string 147 appKubeProbers KubeAppProbers 148 appProbeClient map[string]*http.Client 149 statusPort uint16 150 lastProbeSuccessful bool 151 envoyStatsPort int 152 fetchDNS func() *dnsProto.NameTable 153 upstreamLocalAddress *net.TCPAddr 154 config Options 155 http *http.Client 156 enableProfiling bool 157 registry prometheus.Gatherer 158 shutdown context.CancelFunc 159 drain func() 160 } 161 162 func initializeMonitoring() (prometheus.Gatherer, error) { 163 registry := prometheus.NewRegistry() 164 wrapped := prometheus.WrapRegistererWithPrefix("istio_agent_", registry) 165 wrapped.MustRegister(collectors.NewProcessCollector(collectors.ProcessCollectorOpts{})) 166 wrapped.MustRegister(collectors.NewGoCollector()) 167 168 _, err := monitoring.RegisterPrometheusExporter(wrapped, registry) 169 if err != nil { 170 return nil, fmt.Errorf("could not setup exporter: %v", err) 171 } 172 return registry, nil 173 } 174 175 // NewServer creates a new status server. 
176 func NewServer(config Options) (*Server, error) { 177 localhost := localHostIPv4 178 upstreamLocalAddress := UpstreamLocalAddressIPv4 179 if config.IPv6 { 180 localhost = localHostIPv6 181 upstreamLocalAddress = UpstreamLocalAddressIPv6 182 } else { 183 // if not ipv6-only, it can be ipv4-only or dual-stack 184 // let InstanceIP decide the localhost 185 netIP := net.ParseIP(config.PodIP) 186 if netIP.To4() == nil && netIP.To16() != nil && !netIP.IsLinkLocalUnicast() { 187 localhost = localHostIPv6 188 upstreamLocalAddress = UpstreamLocalAddressIPv6 189 } 190 } 191 probes := make([]ready.Prober, 0) 192 if !config.NoEnvoy { 193 probes = append(probes, &ready.Probe{ 194 LocalHostAddr: localhost, 195 AdminPort: config.AdminPort, 196 Context: config.Context, 197 NoEnvoy: config.NoEnvoy, 198 }) 199 } 200 201 if config.GRPCBootstrap != "" { 202 probes = append(probes, grpcready.NewProbe(config.GRPCBootstrap)) 203 } 204 205 probes = append(probes, config.Probes...) 206 registry := config.PrometheusRegistry 207 if registry == nil { 208 var err error 209 registry, err = initializeMonitoring() 210 if err != nil { 211 return nil, err 212 } 213 } 214 s := &Server{ 215 statusPort: config.StatusPort, 216 ready: probes, 217 http: &http.Client{}, 218 appProbersDestination: config.PodIP, 219 envoyStatsPort: config.EnvoyPrometheusPort, 220 fetchDNS: config.FetchDNS, 221 upstreamLocalAddress: upstreamLocalAddress, 222 config: config, 223 enableProfiling: config.EnableProfiling, 224 registry: registry, 225 shutdown: func() { 226 config.Shutdown() 227 }, 228 drain: config.TriggerDrain, 229 } 230 if LegacyLocalhostProbeDestination.Get() { 231 s.appProbersDestination = "localhost" 232 } 233 234 // Enable prometheus server if its configured and a sidecar 235 // Because port 15020 is exposed in the gateway Services, we cannot safely serve this endpoint 236 // If we need to do this in the future, we should use envoy to do routing or have another port to make this internal 237 // only. 
For now, its not needed for gateway, as we can just get Envoy stats directly, but if we 238 // want to expose istio-agent metrics we may want to revisit this. 239 if cfg, f := PrometheusScrapingConfig.Lookup(); config.NodeType == model.SidecarProxy && f { 240 var prom PrometheusScrapeConfiguration 241 if err := json.Unmarshal([]byte(cfg), &prom); err != nil { 242 return nil, fmt.Errorf("failed to unmarshal %s: %v", PrometheusScrapingConfig.Name, err) 243 } 244 log.Infof("Prometheus scraping configuration: %v", prom) 245 if prom.Scrape != "false" { 246 s.prometheus = &prom 247 if s.prometheus.Path == "" { 248 s.prometheus.Path = "/metrics" 249 } 250 if s.prometheus.Port == "" { 251 s.prometheus.Port = "80" 252 } 253 if s.prometheus.Port == strconv.Itoa(int(config.StatusPort)) { 254 return nil, fmt.Errorf("invalid prometheus scrape configuration: "+ 255 "application port is the same as agent port, which may lead to a recursive loop. "+ 256 "Ensure pod does not have prometheus.io/port=%d label, or that injection is not happening multiple times", config.StatusPort) 257 } 258 } 259 } 260 261 if config.KubeAppProbers == "" { 262 return s, nil 263 } 264 if err := json.Unmarshal([]byte(config.KubeAppProbers), &s.appKubeProbers); err != nil { 265 return nil, fmt.Errorf("failed to decode app prober err = %v, json string = %v", err, config.KubeAppProbers) 266 } 267 268 s.appProbeClient = make(map[string]*http.Client, len(s.appKubeProbers)) 269 // Validate the map key matching the regex pattern. 270 for path, prober := range s.appKubeProbers { 271 err := validateAppKubeProber(path, prober) 272 if err != nil { 273 return nil, err 274 } 275 if prober.HTTPGet != nil { 276 d := ProbeDialer() 277 d.LocalAddr = s.upstreamLocalAddress 278 // nolint: gosec 279 // This is matching Kubernetes. It is a reasonable usage of this, as it is just a health check over localhost. 
280 transport, err := setTransportDefaults(&http.Transport{ 281 TLSClientConfig: &tls.Config{InsecureSkipVerify: true}, 282 DialContext: d.DialContext, 283 // https://github.com/kubernetes/kubernetes/blob/0153febd9f0098d4b8d0d484927710eaf899ef40/pkg/probe/http/http.go#L55 284 // Match Kubernetes logic. This also ensures idle timeouts do not trigger probe failures 285 DisableKeepAlives: !ProbeKeepaliveConnections, 286 }) 287 if err != nil { 288 return nil, err 289 } 290 // Construct a http client and cache it in order to reuse the connection. 291 s.appProbeClient[path] = &http.Client{ 292 Timeout: time.Duration(prober.TimeoutSeconds) * time.Second, 293 // We skip the verification since kubelet skips the verification for HTTPS prober as well 294 // https://kubernetes.io/docs/tasks/configure-pod-container/configure-liveness-readiness-probes/#configure-probes 295 Transport: transport, 296 CheckRedirect: redirectChecker(), 297 } 298 } 299 } 300 301 return s, nil 302 } 303 304 // Copies logic from https://github.com/kubernetes/kubernetes/blob/b152001f459/pkg/probe/http/http.go#L129-L130 305 func isRedirect(code int) bool { 306 return code >= http.StatusMultipleChoices && code < http.StatusBadRequest 307 } 308 309 // Using the same redirect logic that kubelet does: https://github.com/kubernetes/kubernetes/blob/b152001f459/pkg/probe/http/http.go#L141 310 // This means that: 311 // * If we exceed 10 redirects, the probe fails 312 // * If we redirect somewhere external, the probe succeeds (https://github.com/kubernetes/kubernetes/blob/b152001f459/pkg/probe/http/http.go#L130) 313 // * If we redirect to the same address, the probe will follow the redirect 314 func redirectChecker() func(*http.Request, []*http.Request) error { 315 return func(req *http.Request, via []*http.Request) error { 316 if req.URL.Hostname() != via[0].URL.Hostname() { 317 return http.ErrUseLastResponse 318 } 319 // Default behavior: stop after 10 redirects. 
320 if len(via) >= 10 { 321 return errors.New("stopped after 10 redirects") 322 } 323 return nil 324 } 325 } 326 327 func validateAppKubeProber(path string, prober *Prober) error { 328 if !appProberPattern.MatchString(path) { 329 return fmt.Errorf(`invalid path, must be in form of regex pattern %v`, appProberPattern) 330 } 331 count := 0 332 if prober.HTTPGet != nil { 333 count++ 334 } 335 if prober.TCPSocket != nil { 336 count++ 337 } 338 if prober.GRPC != nil { 339 count++ 340 } 341 if count != 1 { 342 return fmt.Errorf(`invalid prober type, must be one of type httpGet, tcpSocket or gRPC`) 343 } 344 if prober.HTTPGet != nil && prober.HTTPGet.Port.Type != intstr.Int { 345 return fmt.Errorf("invalid prober config for %v, the port must be int type", path) 346 } 347 if prober.TCPSocket != nil && prober.TCPSocket.Port.Type != intstr.Int { 348 return fmt.Errorf("invalid prober config for %v, the port must be int type", path) 349 } 350 return nil 351 } 352 353 // FormatProberURL returns a set of HTTP URLs that pilot agent will serve to take over Kubernetes 354 // app probers. 355 func FormatProberURL(container string) (string, string, string) { 356 return fmt.Sprintf("/app-health/%v/readyz", container), 357 fmt.Sprintf("/app-health/%v/livez", container), 358 fmt.Sprintf("/app-health/%v/startupz", container) 359 } 360 361 // Run opens a the status port and begins accepting probes. 362 func (s *Server) Run(ctx context.Context) { 363 log.Infof("Opening status port %d", s.statusPort) 364 365 mux := http.NewServeMux() 366 367 // Add the handler for ready probes. 368 mux.HandleFunc(readyPath, s.handleReadyProbe) 369 // Default path for prom 370 mux.HandleFunc(`/metrics`, s.handleStats) 371 // Envoy uses something else - and original agent used the same. 372 // Keep for backward compat with configs. 
373 mux.HandleFunc(`/stats/prometheus`, s.handleStats) 374 mux.HandleFunc(quitPath, s.handleQuit) 375 mux.HandleFunc(drainPath, s.handleDrain) 376 mux.HandleFunc("/app-health/", s.handleAppProbe) 377 378 if s.enableProfiling { 379 // Add the handler for pprof. 380 mux.HandleFunc("/debug/pprof/", s.handlePprofIndex) 381 mux.HandleFunc("/debug/pprof/cmdline", s.handlePprofCmdline) 382 mux.HandleFunc("/debug/pprof/profile", s.handlePprofProfile) 383 mux.HandleFunc("/debug/pprof/symbol", s.handlePprofSymbol) 384 mux.HandleFunc("/debug/pprof/trace", s.handlePprofTrace) 385 } 386 mux.HandleFunc("/debug/ndsz", s.handleNdsz) 387 388 l, err := net.Listen("tcp", fmt.Sprintf(":%d", s.statusPort)) 389 if err != nil { 390 log.Errorf("Error listening on status port: %v", err.Error()) 391 return 392 } 393 // for testing. 394 if s.statusPort == 0 { 395 _, hostPort, _ := net.SplitHostPort(l.Addr().String()) 396 allocatedPort, _ := strconv.Atoi(hostPort) 397 s.mutex.Lock() 398 s.statusPort = uint16(allocatedPort) 399 s.mutex.Unlock() 400 } 401 defer l.Close() 402 403 go func() { 404 if err := http.Serve(l, mux); err != nil { 405 if network.IsUnexpectedListenerError(err) { 406 log.Error(err) 407 } 408 select { 409 case <-ctx.Done(): 410 // We are shutting down already, don't trigger SIGTERM 411 return 412 default: 413 // If the server errors then pilot-agent can never pass readiness or liveness probes 414 // Therefore, trigger graceful termination by sending SIGTERM to the binary pid 415 notifyExit() 416 } 417 } 418 }() 419 420 // Wait for the agent to be shut down. 
421 <-ctx.Done() 422 log.Info("Status server has successfully terminated") 423 } 424 425 func (s *Server) handlePprofIndex(w http.ResponseWriter, r *http.Request) { 426 if !istioNetUtil.IsRequestFromLocalhost(r) { 427 http.Error(w, "Only requests from localhost are allowed", http.StatusForbidden) 428 return 429 } 430 431 pprof.Index(w, r) 432 } 433 434 func (s *Server) handlePprofCmdline(w http.ResponseWriter, r *http.Request) { 435 if !istioNetUtil.IsRequestFromLocalhost(r) { 436 http.Error(w, "Only requests from localhost are allowed", http.StatusForbidden) 437 return 438 } 439 440 pprof.Cmdline(w, r) 441 } 442 443 func (s *Server) handlePprofSymbol(w http.ResponseWriter, r *http.Request) { 444 if !istioNetUtil.IsRequestFromLocalhost(r) { 445 http.Error(w, "Only requests from localhost are allowed", http.StatusForbidden) 446 return 447 } 448 449 pprof.Symbol(w, r) 450 } 451 452 func (s *Server) handlePprofProfile(w http.ResponseWriter, r *http.Request) { 453 if !istioNetUtil.IsRequestFromLocalhost(r) { 454 http.Error(w, "Only requests from localhost are allowed", http.StatusForbidden) 455 return 456 } 457 458 pprof.Profile(w, r) 459 } 460 461 func (s *Server) handlePprofTrace(w http.ResponseWriter, r *http.Request) { 462 if !istioNetUtil.IsRequestFromLocalhost(r) { 463 http.Error(w, "Only requests from localhost are allowed", http.StatusForbidden) 464 return 465 } 466 467 pprof.Trace(w, r) 468 } 469 470 func (s *Server) handleReadyProbe(w http.ResponseWriter, _ *http.Request) { 471 err := s.isReady() 472 s.mutex.Lock() 473 if err != nil { 474 w.WriteHeader(http.StatusServiceUnavailable) 475 476 log.Warnf("Envoy proxy is NOT ready: %s", err.Error()) 477 s.lastProbeSuccessful = false 478 } else { 479 w.WriteHeader(http.StatusOK) 480 481 if !s.lastProbeSuccessful { 482 log.Info("Envoy proxy is ready") 483 } 484 s.lastProbeSuccessful = true 485 } 486 s.mutex.Unlock() 487 } 488 489 func (s *Server) isReady() error { 490 for _, p := range s.ready { 491 if err := 
p.Check(); err != nil { 492 return err 493 } 494 } 495 return nil 496 } 497 498 type PrometheusScrapeConfiguration struct { 499 Scrape string `json:"scrape"` 500 Path string `json:"path"` 501 Port string `json:"port"` 502 } 503 504 // handleStats handles prometheus stats scraping. This will scrape envoy metrics, and, if configured, 505 // the application metrics and merge them together. 506 // The merge here is a simple string concatenation. This works for almost all cases, assuming the application 507 // is not exposing the same metrics as Envoy. 508 // This merging works for both FmtText and FmtOpenMetrics and will use the format of the application metrics 509 // Note that we do not return any errors here. If we do, we will drop metrics. For example, the app may be having issues, 510 // but we still want Envoy metrics. Instead, errors are tracked in the failed scrape metrics/logs. 511 func (s *Server) handleStats(w http.ResponseWriter, r *http.Request) { 512 if commonFeatures.MetricsLocalhostAccessOnly && !istioNetUtil.IsRequestFromLocalhost(r) { 513 http.Error(w, "Only requests from localhost are allowed", http.StatusForbidden) 514 return 515 } 516 metrics.ScrapeTotals.Increment() 517 var err error 518 var envoy, application io.ReadCloser 519 var envoyCancel, appCancel context.CancelFunc 520 defer func() { 521 if envoy != nil { 522 err = envoy.Close() 523 if err != nil { 524 log.Infof("envoy connection is not closed: %v", err) 525 } 526 } 527 if application != nil { 528 err = application.Close() 529 if err != nil { 530 log.Infof("app connection is not closed: %v", err) 531 } 532 } 533 if envoyCancel != nil { 534 envoyCancel() 535 } 536 if appCancel != nil { 537 appCancel() 538 } 539 }() 540 541 // Gather all the metrics we will merge 542 if !s.config.NoEnvoy { 543 if envoy, envoyCancel, _, err = s.scrape(fmt.Sprintf("http://localhost:%d/stats/prometheus", s.envoyStatsPort), r.Header); err != nil { 544 log.Errorf("failed scraping envoy metrics: %v", err) 545 
metrics.EnvoyScrapeErrors.Increment() 546 } 547 } 548 549 // Scrape app metrics if defined and capture their format 550 var format expfmt.Format 551 if s.prometheus != nil { 552 var contentType string 553 url := fmt.Sprintf("http://localhost:%s%s", s.prometheus.Port, s.prometheus.Path) 554 if application, appCancel, contentType, err = s.scrape(url, r.Header); err != nil { 555 log.Errorf("failed scraping application metrics: %v", err) 556 metrics.AppScrapeErrors.Increment() 557 } 558 format = negotiateMetricsFormat(contentType) 559 } else { 560 // Without app metrics format use a default 561 format = FmtText 562 } 563 564 w.Header().Set("Content-Type", string(format)) 565 566 // Write out the metrics 567 if err = scrapeAndWriteAgentMetrics(s.registry, io.Writer(w)); err != nil { 568 log.Errorf("failed scraping and writing agent metrics: %v", err) 569 metrics.AgentScrapeErrors.Increment() 570 } 571 572 if envoy != nil { 573 _, err = io.Copy(w, envoy) 574 if err != nil { 575 log.Errorf("failed to scraping and writing envoy metrics: %v", err) 576 metrics.EnvoyScrapeErrors.Increment() 577 } 578 } 579 580 // App metrics must go last because if they are FmtOpenMetrics, 581 // they will have a trailing "# EOF" which terminates the full exposition 582 if application != nil { 583 _, err = io.Copy(w, application) 584 if err != nil { 585 log.Errorf("failed to scraping and writing application metrics: %v", err) 586 metrics.AppScrapeErrors.Increment() 587 } 588 } 589 } 590 591 const ( 592 // nolint: revive, stylecheck 593 FmtOpenMetrics_0_0_1 = expfmt.OpenMetricsType + `; version=` + expfmt.OpenMetricsVersion_0_0_1 + `; charset=utf-8` 594 // nolint: revive, stylecheck 595 FmtOpenMetrics_1_0_0 = expfmt.OpenMetricsType + `; version=` + expfmt.OpenMetricsVersion_1_0_0 + `; charset=utf-8` 596 FmtText = `text/plain; version=` + expfmt.TextVersion + `; charset=utf-8` 597 ) 598 599 func negotiateMetricsFormat(contentType string) expfmt.Format { 600 mediaType, params, err := 
mime.ParseMediaType(contentType) 601 if err == nil && mediaType == expfmt.OpenMetricsType { 602 switch params["version"] { 603 case expfmt.OpenMetricsVersion_1_0_0: 604 return FmtOpenMetrics_1_0_0 605 case expfmt.OpenMetricsVersion_0_0_1, "": 606 return FmtOpenMetrics_0_0_1 607 } 608 } 609 return FmtText 610 } 611 612 func scrapeAndWriteAgentMetrics(registry prometheus.Gatherer, w io.Writer) error { 613 mfs, err := registry.Gather() 614 enc := expfmt.NewEncoder(w, FmtText) 615 if err != nil { 616 return err 617 } 618 for _, mf := range mfs { 619 if err := enc.Encode(mf); err != nil { 620 return err 621 } 622 } 623 return nil 624 } 625 626 func applyHeaders(into http.Header, from http.Header, keys ...string) { 627 for _, key := range keys { 628 val := from.Get(key) 629 if val != "" { 630 into.Set(key, val) 631 } 632 } 633 } 634 635 // getHeaderTimeout parse a string like (1.234) representing number of seconds 636 func getHeaderTimeout(timeout string) (time.Duration, error) { 637 timeoutSeconds, err := strconv.ParseFloat(timeout, 64) 638 if err != nil { 639 return 0 * time.Second, err 640 } 641 642 return time.Duration(timeoutSeconds * 1e9), nil 643 } 644 645 // scrape will send a request to the provided url to scrape metrics from 646 // This will attempt to mimic some of Prometheus functionality by passing some of the headers through 647 // such as accept, timeout, and user agent 648 // Returns the scraped metrics reader as well as the response's "Content-Type" header to determine the metrics format 649 func (s *Server) scrape(url string, header http.Header) (io.ReadCloser, context.CancelFunc, string, error) { 650 var cancel context.CancelFunc 651 ctx := context.Background() 652 if timeoutString := header.Get("X-Prometheus-Scrape-Timeout-Seconds"); timeoutString != "" { 653 timeout, err := getHeaderTimeout(timeoutString) 654 if err != nil { 655 log.Warnf("Failed to parse timeout header %v: %v", timeoutString, err) 656 } else { 657 ctx, cancel = 
context.WithTimeout(ctx, timeout) 658 } 659 } 660 req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil) 661 if err != nil { 662 return nil, cancel, "", err 663 } 664 applyHeaders(req.Header, header, "Accept", 665 "User-Agent", 666 "X-Prometheus-Scrape-Timeout-Seconds", 667 ) 668 669 resp, err := s.http.Do(req) 670 if err != nil { 671 return nil, cancel, "", fmt.Errorf("error scraping %s: %v", url, err) 672 } 673 if resp.StatusCode != http.StatusOK { 674 resp.Body.Close() 675 return nil, cancel, "", fmt.Errorf("error scraping %s, status code: %v", url, resp.StatusCode) 676 } 677 format := resp.Header.Get("Content-Type") 678 return resp.Body, cancel, format, nil 679 } 680 681 func (s *Server) handleQuit(w http.ResponseWriter, r *http.Request) { 682 if !istioNetUtil.IsRequestFromLocalhost(r) { 683 http.Error(w, "Only requests from localhost are allowed", http.StatusForbidden) 684 return 685 } 686 if r.Method != http.MethodPost { 687 http.Error(w, "Method Not Allowed", http.StatusMethodNotAllowed) 688 return 689 } 690 w.WriteHeader(http.StatusOK) 691 _, _ = w.Write([]byte("OK")) 692 log.Infof("handling %s, notifying pilot-agent to exit", quitPath) 693 s.shutdown() 694 } 695 696 func (s *Server) handleDrain(w http.ResponseWriter, r *http.Request) { 697 if !istioNetUtil.IsRequestFromLocalhost(r) { 698 http.Error(w, "Only requests from localhost are allowed", http.StatusForbidden) 699 return 700 } 701 if r.Method != http.MethodPost { 702 http.Error(w, "Method Not Allowed", http.StatusMethodNotAllowed) 703 return 704 } 705 w.WriteHeader(http.StatusOK) 706 _, _ = w.Write([]byte("OK")) 707 log.Infof("handling %s, starting drain", drainPath) 708 s.drain() 709 } 710 711 func (s *Server) handleAppProbe(w http.ResponseWriter, req *http.Request) { 712 // Validate the request first. 
713 path := req.URL.Path 714 if !strings.HasPrefix(path, "/") { 715 path = "/" + req.URL.Path 716 } 717 prober, exists := s.appKubeProbers[path] 718 if !exists { 719 log.Errorf("Prober does not exists url %v", path) 720 w.WriteHeader(http.StatusBadRequest) 721 _, _ = w.Write([]byte(fmt.Sprintf("app prober config does not exists for %v", path))) 722 return 723 } 724 725 switch { 726 case prober.HTTPGet != nil: 727 s.handleAppProbeHTTPGet(w, req, prober, path) 728 case prober.TCPSocket != nil: 729 s.handleAppProbeTCPSocket(w, prober) 730 case prober.GRPC != nil: 731 s.handleAppProbeGRPC(w, req, prober) 732 } 733 } 734 735 func (s *Server) handleAppProbeHTTPGet(w http.ResponseWriter, req *http.Request, prober *Prober, path string) { 736 proberPath := prober.HTTPGet.Path 737 if !strings.HasPrefix(proberPath, "/") { 738 proberPath = "/" + proberPath 739 } 740 var url string 741 742 hostPort := net.JoinHostPort(s.appProbersDestination, strconv.Itoa(prober.HTTPGet.Port.IntValue())) 743 if prober.HTTPGet.Scheme == apimirror.URISchemeHTTPS { 744 url = fmt.Sprintf("https://%s%s", hostPort, proberPath) 745 } else { 746 url = fmt.Sprintf("http://%s%s", hostPort, proberPath) 747 } 748 appReq, err := http.NewRequest(http.MethodGet, url, nil) 749 if err != nil { 750 log.Errorf("Failed to create request to probe app %v, original url %v", err, path) 751 w.WriteHeader(http.StatusInternalServerError) 752 return 753 } 754 755 appReq.Host = req.Host 756 if host, port, err := net.SplitHostPort(req.Host); err == nil { 757 port, _ := strconv.Atoi(port) 758 // the port is same as the status port, then we need to replace the port in the host with the real one 759 if port == int(s.statusPort) { 760 realPort := strconv.Itoa(prober.HTTPGet.Port.IntValue()) 761 appReq.Host = net.JoinHostPort(host, realPort) 762 } 763 } 764 // Forward incoming headers to the application. 
765 for name, values := range req.Header { 766 appReq.Header[name] = slices.Clone(values) 767 if len(values) > 0 && (name == "Host") { 768 // Probe has specific host header override; honor it 769 appReq.Host = values[0] 770 } 771 } 772 773 // get the http client must exist because 774 httpClient := s.appProbeClient[path] 775 776 // Send the request. 777 response, err := httpClient.Do(appReq) 778 if err != nil { 779 log.Errorf("Request to probe app failed: %v, original URL path = %v\napp URL path = %v", err, path, proberPath) 780 w.WriteHeader(http.StatusInternalServerError) 781 return 782 } 783 defer func() { 784 // Drain and close the body to let the Transport reuse the connection 785 _, _ = io.Copy(io.Discard, response.Body) 786 _ = response.Body.Close() 787 }() 788 789 if isRedirect(response.StatusCode) { // Redirect 790 // In other cases, we return the original status code. For redirects, it is illegal to 791 // not have Location header, so we need to switch to just 200. 792 w.WriteHeader(http.StatusOK) 793 return 794 } 795 // We only write the status code to the response. 
796 w.WriteHeader(response.StatusCode) 797 // Return the body from probe as well 798 b, _ := k8sUtilIo.ReadAtMost(response.Body, maxRespBodyLength) 799 _, _ = w.Write(b) 800 } 801 802 func (s *Server) handleAppProbeTCPSocket(w http.ResponseWriter, prober *Prober) { 803 timeout := time.Duration(prober.TimeoutSeconds) * time.Second 804 805 d := ProbeDialer() 806 d.LocalAddr = s.upstreamLocalAddress 807 d.Timeout = timeout 808 809 conn, err := d.Dial("tcp", net.JoinHostPort(s.appProbersDestination, strconv.Itoa(prober.TCPSocket.Port.IntValue()))) 810 if err != nil { 811 w.WriteHeader(http.StatusInternalServerError) 812 } else { 813 w.WriteHeader(http.StatusOK) 814 err = conn.Close() 815 if err != nil { 816 log.Infof("tcp connection is not closed: %v", err) 817 } 818 } 819 } 820 821 func (s *Server) handleAppProbeGRPC(w http.ResponseWriter, req *http.Request, prober *Prober) { 822 timeout := time.Duration(prober.TimeoutSeconds) * time.Second 823 // the DialOptions are referenced from https://github.com/kubernetes/kubernetes/blob/v1.23.1/pkg/probe/grpc/grpc.go#L55-L59 824 opts := []grpc.DialOption{ 825 grpc.WithBlock(), 826 grpc.WithTransportCredentials(insecure.NewCredentials()), // credentials are currently not supported 827 grpc.WithContextDialer(func(ctx context.Context, addr string) (net.Conn, error) { 828 d := ProbeDialer() 829 d.LocalAddr = s.upstreamLocalAddress 830 d.Timeout = timeout 831 return d.DialContext(ctx, "tcp", addr) 832 }), 833 } 834 if userAgent := req.Header["User-Agent"]; len(userAgent) > 0 { 835 // simulate kubelet 836 // please refer to: 837 // https://github.com/kubernetes/kubernetes/blob/v1.23.1/pkg/probe/grpc/grpc.go#L56 838 // https://github.com/kubernetes/kubernetes/blob/v1.23.1/pkg/probe/http/http.go#L103 839 opts = append(opts, grpc.WithUserAgent(userAgent[0])) 840 } 841 842 ctx, cancel := context.WithTimeout(context.Background(), timeout) 843 defer cancel() 844 845 addr := net.JoinHostPort(s.appProbersDestination, 
strconv.Itoa(int(prober.GRPC.Port))) 846 conn, err := grpc.DialContext(ctx, addr, opts...) 847 if err != nil { 848 log.Errorf("Failed to create grpc connection to probe app: %v", err) 849 w.WriteHeader(http.StatusInternalServerError) 850 return 851 } 852 defer conn.Close() 853 854 var svc string 855 if prober.GRPC.Service != nil { 856 svc = *prober.GRPC.Service 857 } 858 grpcClient := grpcHealth.NewHealthClient(conn) 859 resp, err := grpcClient.Check(ctx, &grpcHealth.HealthCheckRequest{ 860 Service: svc, 861 }) 862 // the error handling is referenced from https://github.com/kubernetes/kubernetes/blob/v1.23.1/pkg/probe/grpc/grpc.go#L88-L106 863 if err != nil { 864 status, ok := grpcStatus.FromError(err) 865 if ok { 866 switch status.Code() { 867 case codes.Unimplemented: 868 log.Errorf("server does not implement the grpc health protocol (grpc.health.v1.Health): %v", err) 869 case codes.DeadlineExceeded: 870 log.Errorf("grpc request not finished within timeout: %v", err) 871 default: 872 log.Errorf("grpc probe failed: %v", err) 873 } 874 } else { 875 log.Errorf("grpc probe failed: %v", err) 876 } 877 w.WriteHeader(http.StatusInternalServerError) 878 return 879 } 880 881 if resp.GetStatus() == grpcHealth.HealthCheckResponse_SERVING { 882 w.WriteHeader(http.StatusOK) 883 return 884 } 885 w.WriteHeader(http.StatusInternalServerError) 886 } 887 888 func (s *Server) handleNdsz(w http.ResponseWriter, r *http.Request) { 889 if !istioNetUtil.IsRequestFromLocalhost(r) { 890 http.Error(w, "Only requests from localhost are allowed", http.StatusForbidden) 891 return 892 } 893 nametable := s.fetchDNS() 894 if nametable == nil { 895 // See https://golang.org/doc/faq#nil_error for why writeJSONProto cannot handle this 896 w.WriteHeader(http.StatusNotFound) 897 _, _ = w.Write([]byte(`{}`)) 898 return 899 } 900 writeJSONProto(w, nametable) 901 } 902 903 // writeJSONProto writes a protobuf to a json payload, handling content type, marshaling, and errors 904 func writeJSONProto(w 
http.ResponseWriter, obj any) { 905 w.Header().Set("Content-Type", "application/json") 906 b, err := config.ToJSON(obj) 907 if err != nil { 908 w.WriteHeader(http.StatusInternalServerError) 909 _, _ = w.Write([]byte(err.Error())) 910 return 911 } 912 _, err = w.Write(b) 913 if err != nil { 914 w.WriteHeader(http.StatusInternalServerError) 915 } 916 } 917 918 // notifyExit sends SIGTERM to itself 919 func notifyExit() { 920 p, err := os.FindProcess(os.Getpid()) 921 if err != nil { 922 log.Error(err) 923 } 924 if err := p.Signal(syscall.SIGTERM); err != nil { 925 log.Errorf("failed to send SIGTERM to self: %v", err) 926 } 927 } 928 929 var defaultTransport = http.DefaultTransport.(*http.Transport) 930 931 // SetTransportDefaults mirrors Kubernetes probe settings 932 // https://github.com/kubernetes/kubernetes/blob/0153febd9f0098d4b8d0d484927710eaf899ef40/pkg/probe/http/http.go#L52 933 func setTransportDefaults(t *http.Transport) (*http.Transport, error) { 934 if !EnableHTTP2Probing { 935 return t, nil 936 } 937 if t.TLSHandshakeTimeout == 0 { 938 t.TLSHandshakeTimeout = defaultTransport.TLSHandshakeTimeout 939 } 940 if t.IdleConnTimeout == 0 { 941 t.IdleConnTimeout = defaultTransport.IdleConnTimeout 942 } 943 t2, err := http2.ConfigureTransports(t) 944 if err != nil { 945 return nil, err 946 } 947 t2.ReadIdleTimeout = time.Duration(30) * time.Second 948 t2.PingTimeout = time.Duration(15) * time.Second 949 return t, nil 950 }