istio.io/istio@v0.0.0-20240520182934-d79c90f27776/pilot/cmd/pilot-agent/status/ready/probe.go (about) 1 // Copyright Istio Authors 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package ready 16 17 import ( 18 "context" 19 "fmt" 20 21 "istio.io/istio/pilot/cmd/pilot-agent/metrics" 22 "istio.io/istio/pilot/cmd/pilot-agent/status/util" 23 ) 24 25 // Probe for readiness. 26 type Probe struct { 27 LocalHostAddr string 28 AdminPort uint16 29 receivedFirstUpdate bool 30 // Indicates that Envoy is ready at least once so that we can cache and reuse that probe. 31 atleastOnceReady bool 32 Context context.Context 33 // NoEnvoy so we only check config status 34 NoEnvoy bool 35 } 36 37 type Prober interface { 38 // Check executes the probe and returns an error if the probe fails. 39 Check() error 40 } 41 42 var _ Prober = &Probe{} 43 44 // Check executes the probe and returns an error if the probe fails. 45 func (p *Probe) Check() error { 46 // First, check that Envoy has received a configuration update from Pilot. 47 if err := p.checkConfigStatus(); err != nil { 48 return err 49 } 50 return p.isEnvoyReady() 51 } 52 53 // checkConfigStatus checks to make sure initial configs have been received from Pilot. 54 func (p *Probe) checkConfigStatus() error { 55 if p.NoEnvoy { 56 // TODO some way to verify XDS proxy -> control plane works 57 return nil 58 } 59 if p.receivedFirstUpdate { 60 return nil 61 } 62 63 s, err := util.GetUpdateStatusStats(p.LocalHostAddr, p.AdminPort) 64 if err != nil { 65 return err 66 } 67 68 CDSUpdated := s.CDSUpdatesSuccess > 0 69 LDSUpdated := s.LDSUpdatesSuccess > 0 70 if CDSUpdated && LDSUpdated { 71 p.receivedFirstUpdate = true 72 return nil 73 } 74 75 if !CDSUpdated && !LDSUpdated { 76 return fmt.Errorf("config not received from XDS server (is Istiod running?): %s", s.String()) 77 } else if s.LDSUpdatesRejection > 0 || s.CDSUpdatesRejection > 0 { 78 return fmt.Errorf("config received from XDS server, but was rejected: %s", s.String()) 79 } 80 return fmt.Errorf("config not fully received from XDS server: %s", s.String()) 81 } 82 83 // isEnvoyReady checks to ensure that Envoy is in the LIVE state and workers have started. 84 func (p *Probe) isEnvoyReady() error { 85 if p.NoEnvoy { 86 return nil 87 } 88 if p.Context == nil { 89 return p.checkEnvoyReadiness() 90 } 91 select { 92 case <-p.Context.Done(): 93 return fmt.Errorf("server is not live, current state is: %s", StateString(Draining)) 94 default: 95 return p.checkEnvoyReadiness() 96 } 97 } 98 99 func (p *Probe) checkEnvoyReadiness() error { 100 // If Envoy is ready at least once i.e. server state is LIVE and workers 101 // have started, they will not go back in the life time of Envoy process. 102 // They will only change at hot restart or health check fails. Since istio 103 // does not use both of them, it is safe to cache this value. Since the 104 // actual readiness probe goes via Envoy, it ensures that Envoy is actively 105 // serving traffic and we can rely on that. 106 if p.atleastOnceReady { 107 return nil 108 } 109 110 err := checkEnvoyStats(p.LocalHostAddr, p.AdminPort) 111 if err == nil { 112 metrics.RecordStartupTime() 113 p.atleastOnceReady = true 114 } 115 return err 116 } 117 118 type ServerInfoState int32 119 120 const ( 121 // Server is live and serving traffic. 122 Live ServerInfoState = 0 123 // Server is draining listeners in response to external health checks failing. 124 Draining ServerInfoState = 1 125 // Server has not yet completed cluster manager initialization. 126 PreInitializing ServerInfoState = 2 127 // Server is running the cluster manager initialization callbacks (e.g., RDS). 128 Initializing ServerInfoState = 3 129 ) 130 131 func StateString(state ServerInfoState) string { 132 switch state { 133 case Live: 134 return "LIVE" 135 case Draining: 136 return "DRAINING" 137 case PreInitializing: 138 return "PRE_INITIALIZING" 139 case Initializing: 140 return "INITIALIZING" 141 } 142 return "UNKNOWN" 143 } 144 145 // checkEnvoyStats actually executes the Stats Query on Envoy admin endpoint. 146 func checkEnvoyStats(host string, port uint16) error { 147 state, ws, err := util.GetReadinessStats(host, port) 148 if err != nil { 149 return fmt.Errorf("failed to get readiness stats: %v", err) 150 } 151 152 if state != nil && ServerInfoState(*state) != Live { 153 return fmt.Errorf("server is not live, current state is: %v", StateString(ServerInfoState(*state))) 154 } 155 156 if !ws { 157 return fmt.Errorf("workers have not yet started") 158 } 159 160 return nil 161 }