istio.io/istio@v0.0.0-20240520182934-d79c90f27776/pilot/cmd/pilot-agent/status/ready/probe.go (about)

     1  // Copyright Istio Authors
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package ready
    16  
    17  import (
    18  	"context"
    19  	"fmt"
    20  
    21  	"istio.io/istio/pilot/cmd/pilot-agent/metrics"
    22  	"istio.io/istio/pilot/cmd/pilot-agent/status/util"
    23  )
    24  
// Probe checks readiness of the proxy by querying Envoy's admin endpoint.
//
// Check first verifies that the initial configuration has been received and
// then that Envoy itself is ready. Each result is cached on the Probe once it
// succeeds: receivedFirstUpdate is set after both CDS and LDS have reported at
// least one successful update, and atleastOnceReady after the readiness stats
// check has passed once. Later calls skip the corresponding admin query.
//
// When Context is non-nil and already done, the readiness check reports the
// DRAINING state without querying Envoy.
type Probe struct {
	// LocalHostAddr and AdminPort locate the Envoy admin endpoint that is
	// queried for stats.
	LocalHostAddr       string
	AdminPort           uint16
	receivedFirstUpdate bool
	// atleastOnceReady indicates that Envoy has been ready at least once, so
	// that the result can be cached and reused by later probes.
	atleastOnceReady bool
	Context          context.Context
	// NoEnvoy disables all Envoy queries: both the config status check and
	// the readiness check return success without contacting Envoy.
	NoEnvoy bool
}
    36  
// Prober is implemented by anything that can execute a readiness probe.
type Prober interface {
	// Check executes the probe and returns an error if the probe fails.
	Check() error
}
    41  
    42  var _ Prober = &Probe{}
    43  
    44  // Check executes the probe and returns an error if the probe fails.
    45  func (p *Probe) Check() error {
    46  	// First, check that Envoy has received a configuration update from Pilot.
    47  	if err := p.checkConfigStatus(); err != nil {
    48  		return err
    49  	}
    50  	return p.isEnvoyReady()
    51  }
    52  
    53  // checkConfigStatus checks to make sure initial configs have been received from Pilot.
    54  func (p *Probe) checkConfigStatus() error {
    55  	if p.NoEnvoy {
    56  		// TODO some way to verify XDS proxy -> control plane works
    57  		return nil
    58  	}
    59  	if p.receivedFirstUpdate {
    60  		return nil
    61  	}
    62  
    63  	s, err := util.GetUpdateStatusStats(p.LocalHostAddr, p.AdminPort)
    64  	if err != nil {
    65  		return err
    66  	}
    67  
    68  	CDSUpdated := s.CDSUpdatesSuccess > 0
    69  	LDSUpdated := s.LDSUpdatesSuccess > 0
    70  	if CDSUpdated && LDSUpdated {
    71  		p.receivedFirstUpdate = true
    72  		return nil
    73  	}
    74  
    75  	if !CDSUpdated && !LDSUpdated {
    76  		return fmt.Errorf("config not received from XDS server (is Istiod running?): %s", s.String())
    77  	} else if s.LDSUpdatesRejection > 0 || s.CDSUpdatesRejection > 0 {
    78  		return fmt.Errorf("config received from XDS server, but was rejected: %s", s.String())
    79  	}
    80  	return fmt.Errorf("config not fully received from XDS server: %s", s.String())
    81  }
    82  
    83  // isEnvoyReady checks to ensure that Envoy is in the LIVE state and workers have started.
    84  func (p *Probe) isEnvoyReady() error {
    85  	if p.NoEnvoy {
    86  		return nil
    87  	}
    88  	if p.Context == nil {
    89  		return p.checkEnvoyReadiness()
    90  	}
    91  	select {
    92  	case <-p.Context.Done():
    93  		return fmt.Errorf("server is not live, current state is: %s", StateString(Draining))
    94  	default:
    95  		return p.checkEnvoyReadiness()
    96  	}
    97  }
    98  
    99  func (p *Probe) checkEnvoyReadiness() error {
   100  	// If Envoy is ready at least once i.e. server state is LIVE and workers
   101  	// have started, they will not go back in the life time of Envoy process.
   102  	// They will only change at hot restart or health check fails. Since istio
   103  	// does not use both of them, it is safe to cache this value. Since the
   104  	// actual readiness probe goes via Envoy, it ensures that Envoy is actively
   105  	// serving traffic and we can rely on that.
   106  	if p.atleastOnceReady {
   107  		return nil
   108  	}
   109  
   110  	err := checkEnvoyStats(p.LocalHostAddr, p.AdminPort)
   111  	if err == nil {
   112  		metrics.RecordStartupTime()
   113  		p.atleastOnceReady = true
   114  	}
   115  	return err
   116  }
   117  
// ServerInfoState is the server state reported in Envoy's readiness stats.
// NOTE(review): the numeric values presumably mirror Envoy's admin
// ServerInfo.State enum — confirm before changing them.
type ServerInfoState int32

const (
	// Live means the server is live and serving traffic.
	Live ServerInfoState = 0
	// Draining means the server is draining listeners in response to external
	// health checks failing.
	Draining ServerInfoState = 1
	// PreInitializing means the server has not yet completed cluster manager
	// initialization.
	PreInitializing ServerInfoState = 2
	// Initializing means the server is running the cluster manager
	// initialization callbacks (e.g., RDS).
	Initializing ServerInfoState = 3
)
   130  
   131  func StateString(state ServerInfoState) string {
   132  	switch state {
   133  	case Live:
   134  		return "LIVE"
   135  	case Draining:
   136  		return "DRAINING"
   137  	case PreInitializing:
   138  		return "PRE_INITIALIZING"
   139  	case Initializing:
   140  		return "INITIALIZING"
   141  	}
   142  	return "UNKNOWN"
   143  }
   144  
   145  // checkEnvoyStats actually executes the Stats Query on Envoy admin endpoint.
   146  func checkEnvoyStats(host string, port uint16) error {
   147  	state, ws, err := util.GetReadinessStats(host, port)
   148  	if err != nil {
   149  		return fmt.Errorf("failed to get readiness stats: %v", err)
   150  	}
   151  
   152  	if state != nil && ServerInfoState(*state) != Live {
   153  		return fmt.Errorf("server is not live, current state is: %v", StateString(ServerInfoState(*state)))
   154  	}
   155  
   156  	if !ws {
   157  		return fmt.Errorf("workers have not yet started")
   158  	}
   159  
   160  	return nil
   161  }