vitess.io/vitess@v0.16.2/go/vt/discovery/tablet_health_check.go (about) 1 /* 2 Copyright 2020 The Vitess Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package discovery 18 19 import ( 20 "context" 21 "fmt" 22 "strings" 23 "sync" 24 "time" 25 26 "vitess.io/vitess/go/sync2" 27 28 "vitess.io/vitess/go/vt/grpcclient" 29 "vitess.io/vitess/go/vt/log" 30 "vitess.io/vitess/go/vt/proto/vtrpc" 31 "vitess.io/vitess/go/vt/topo/topoproto" 32 "vitess.io/vitess/go/vt/topotools" 33 "vitess.io/vitess/go/vt/vterrors" 34 "vitess.io/vitess/go/vt/vttablet/queryservice" 35 "vitess.io/vitess/go/vt/vttablet/tabletconn" 36 37 "google.golang.org/protobuf/proto" 38 39 "vitess.io/vitess/go/vt/proto/query" 40 "vitess.io/vitess/go/vt/proto/topodata" 41 ) 42 43 // tabletHealthCheck maintains the health status of a tablet. A map of this 44 // structure is maintained in HealthCheck. 45 type tabletHealthCheck struct { 46 ctx context.Context 47 // cancelFunc must be called before discarding tabletHealthCheck. 48 // This will ensure that the associated checkConn goroutine will terminate. 49 cancelFunc context.CancelFunc 50 // Tablet is the tablet object that was sent to HealthCheck.AddTablet. 51 Tablet *topodata.Tablet 52 // mutex to protect Conn 53 connMu sync.Mutex 54 // Conn is the connection associated with the tablet. 55 Conn queryservice.QueryService 56 // Target is the current target as returned by the streaming 57 // StreamHealth RPC. 58 Target *query.Target 59 // Serving describes if the tablet can be serving traffic. 60 Serving bool 61 // PrimaryTermStartTime is the last time at which 62 // this tablet was either elected the primary, or received 63 // a TabletExternallyReparented event. It is set to 0 if the 64 // tablet doesn't think it's a primary. 65 PrimaryTermStartTime int64 66 // Stats is the current health status, as received by the 67 // StreamHealth RPC (replication lag, ...). 68 Stats *query.RealtimeStats 69 // LastError is the error we last saw when trying to get the 70 // tablet's healthcheck. 71 LastError error 72 // possibly delete both these 73 loggedServingState bool 74 lastResponseTimestamp time.Time // timestamp of the last healthcheck response 75 } 76 77 // String is defined because we want to print a []*tabletHealthCheck array nicely. 78 func (thc *tabletHealthCheck) String() string { 79 return fmt.Sprintf("tabletHealthCheck{Tablet: %v,Target: %v,Serving: %v, PrimaryTermStartTime: %v, Stats: %v, LastError: %v", 80 thc.Tablet, thc.Target, thc.Serving, thc.PrimaryTermStartTime, thc.Stats, thc.LastError) 81 } 82 83 // SimpleCopy returns a TabletHealth with all the necessary fields copied from tabletHealthCheck. 84 // Note that this is not a deep copy because we point to the same underlying RealtimeStats. 85 // That is fine because the RealtimeStats object is never changed after creation. 86 func (thc *tabletHealthCheck) SimpleCopy() *TabletHealth { 87 thc.connMu.Lock() 88 defer thc.connMu.Unlock() 89 return &TabletHealth{ 90 Conn: thc.Conn, 91 Tablet: thc.Tablet, 92 Target: thc.Target, 93 Stats: thc.Stats, 94 LastError: thc.LastError, 95 PrimaryTermStartTime: thc.PrimaryTermStartTime, 96 Serving: thc.Serving, 97 } 98 } 99 100 // setServingState sets the tablet state to the given value. 101 // 102 // If the state changes, it logs the change so that failures 103 // from the health check connection are logged the first time, 104 // but don't continue to log if the connection stays down. 105 // 106 // thc.mu must be locked before calling this function 107 func (thc *tabletHealthCheck) setServingState(serving bool, reason string) { 108 if !thc.loggedServingState || (serving != thc.Serving) { 109 // Emit the log from a separate goroutine to avoid holding 110 // the th lock while logging is happening 111 log.Infof("HealthCheckUpdate(Serving State): tablet: %v serving %v => %v for %v/%v (%v) reason: %s", 112 topotools.TabletIdent(thc.Tablet), 113 thc.Serving, 114 serving, 115 thc.Tablet.GetKeyspace(), 116 thc.Tablet.GetShard(), 117 thc.Target.GetTabletType(), 118 reason, 119 ) 120 thc.loggedServingState = true 121 } 122 thc.Serving = serving 123 } 124 125 // stream streams healthcheck responses to callback. 126 func (thc *tabletHealthCheck) stream(ctx context.Context, callback func(*query.StreamHealthResponse) error) error { 127 conn := thc.Connection() 128 if conn == nil { 129 // This signals the caller to retry 130 return nil 131 } 132 err := conn.StreamHealth(ctx, callback) 133 if err != nil { 134 // Depending on the specific error the caller can take action 135 thc.closeConnection(ctx, err) 136 } 137 return err 138 } 139 140 func (thc *tabletHealthCheck) Connection() queryservice.QueryService { 141 thc.connMu.Lock() 142 defer thc.connMu.Unlock() 143 return thc.connectionLocked() 144 } 145 146 func (thc *tabletHealthCheck) connectionLocked() queryservice.QueryService { 147 if thc.Conn == nil { 148 conn, err := tabletconn.GetDialer()(thc.Tablet, grpcclient.FailFast(true)) 149 if err != nil { 150 thc.LastError = err 151 return nil 152 } 153 thc.Conn = conn 154 thc.LastError = nil 155 } 156 return thc.Conn 157 } 158 159 // processResponse reads one health check response, and updates health 160 func (thc *tabletHealthCheck) processResponse(hc *HealthCheckImpl, shr *query.StreamHealthResponse) error { 161 select { 162 case <-thc.ctx.Done(): 163 return thc.ctx.Err() 164 default: 165 } 166 167 // Check for invalid data, better than panicking. 168 if shr.Target == nil || shr.RealtimeStats == nil { 169 return fmt.Errorf("health stats is not valid: %v", shr) 170 } 171 172 // an app-level error from tablet, force serving state. 173 var healthErr error 174 serving := shr.Serving 175 if shr.RealtimeStats.HealthError != "" { 176 healthErr = fmt.Errorf("vttablet error: %v", shr.RealtimeStats.HealthError) 177 serving = false 178 } 179 180 if shr.TabletAlias != nil && !proto.Equal(shr.TabletAlias, thc.Tablet.Alias) { 181 // TabletAlias change means that the host:port has been taken over by another tablet 182 // We cancel / exit the healthcheck for this tablet right away 183 // With the next topo refresh we will get a new tablet with the new host/port 184 return vterrors.New(vtrpc.Code_FAILED_PRECONDITION, fmt.Sprintf("health stats mismatch, tablet %+v alias does not match response alias %v", thc.Tablet, shr.TabletAlias)) 185 } 186 187 prevTarget := thc.Target 188 // check whether this is a trivial update so as to update healthy map 189 trivialUpdate := thc.LastError == nil && thc.Serving && shr.RealtimeStats.HealthError == "" && shr.Serving && 190 prevTarget.TabletType != topodata.TabletType_PRIMARY && prevTarget.TabletType == shr.Target.TabletType && thc.isTrivialReplagChange(shr.RealtimeStats) 191 thc.lastResponseTimestamp = time.Now() 192 thc.Target = shr.Target 193 thc.PrimaryTermStartTime = shr.TabletExternallyReparentedTimestamp 194 thc.Stats = shr.RealtimeStats 195 thc.LastError = healthErr 196 reason := "healthCheck update" 197 if healthErr != nil { 198 reason = "healthCheck update error: " + healthErr.Error() 199 } 200 thc.setServingState(serving, reason) 201 202 // notify downstream for primary change 203 hc.updateHealth(thc.SimpleCopy(), prevTarget, trivialUpdate, thc.Serving) 204 return nil 205 } 206 207 // isTrivialReplagChange returns true iff the old and new RealtimeStats 208 // haven't changed enough to warrant re-calling FilterLegacyStatsByReplicationLag. 209 func (thc *tabletHealthCheck) isTrivialReplagChange(newStats *query.RealtimeStats) bool { 210 // first time always return false 211 if thc.Stats == nil { 212 return false 213 } 214 // Skip replag filter when replag remains in the low rep lag range, 215 // which should be the case majority of the time. 216 lowRepLag := lowReplicationLag.Seconds() 217 oldRepLag := float64(thc.Stats.ReplicationLagSeconds) 218 newRepLag := float64(newStats.ReplicationLagSeconds) 219 if oldRepLag <= lowRepLag && newRepLag <= lowRepLag { 220 return true 221 } 222 // Skip replag filter when replag remains in the high rep lag range, 223 // and did not change beyond +/- 10%. 224 // when there is a high rep lag, it takes a long time for it to reduce, 225 // so it is not necessary to re-calculate every time. 226 // In that case, we won't save the new record, so we still 227 // remember the original replication lag. 228 if oldRepLag > lowRepLag && newRepLag > lowRepLag && newRepLag < oldRepLag*1.1 && newRepLag > oldRepLag*0.9 { 229 return true 230 } 231 return false 232 } 233 234 // checkConn performs health checking on the given tablet. 235 func (thc *tabletHealthCheck) checkConn(hc *HealthCheckImpl) { 236 defer func() { 237 // TODO(deepthi): We should ensure any return from this func calls the equivalent of hc.deleteTablet 238 thc.finalizeConn() 239 hc.connsWG.Done() 240 }() 241 242 // Initialize error counter 243 hcErrorCounters.Add([]string{thc.Target.Keyspace, thc.Target.Shard, topoproto.TabletTypeLString(thc.Target.TabletType)}, 0) 244 245 retryDelay := hc.retryDelay 246 for { 247 streamCtx, streamCancel := context.WithCancel(thc.ctx) 248 249 // Setup a watcher that restarts the timer every time an update is received. 250 // If a timeout occurs for a serving tablet, we make it non-serving and send 251 // a status update. The stream is also terminated so it can be retried. 252 // servingStatus feeds into the serving var, which keeps track of the serving 253 // status transmitted by the tablet. 254 servingStatus := make(chan bool, 1) 255 // timedout is accessed atomically because there could be a race 256 // between the goroutine that sets it and the check for its value 257 // later. 258 timedout := sync2.NewAtomicBool(false) 259 go func() { 260 for { 261 select { 262 case <-servingStatus: 263 continue 264 case <-time.After(hc.healthCheckTimeout): 265 timedout.Set(true) 266 streamCancel() 267 return 268 case <-streamCtx.Done(): 269 // If the stream is done, stop watching. 270 return 271 } 272 } 273 }() 274 275 // Read stream health responses. 276 err := thc.stream(streamCtx, func(shr *query.StreamHealthResponse) error { 277 // We received a message. Reset the back-off. 278 retryDelay = hc.retryDelay 279 // Don't block on send to avoid deadlocks. 280 select { 281 case servingStatus <- shr.Serving: 282 default: 283 } 284 return thc.processResponse(hc, shr) 285 }) 286 287 // streamCancel to make sure the watcher goroutine terminates. 288 streamCancel() 289 290 if err != nil { 291 hcErrorCounters.Add([]string{thc.Target.Keyspace, thc.Target.Shard, topoproto.TabletTypeLString(thc.Target.TabletType)}, 1) 292 // This means that another tablet has taken over the host:port that we were connected to. 293 // So let's remove the tablet's data from the healthcheck, and if it is still a part of the 294 // cluster, the new tablet record will be fetched from the topology server and re-added to 295 // the healthcheck cache again via the topology watcher. 296 // WARNING: Under no other circumstances should we be deleting the tablet here. 297 if strings.Contains(err.Error(), "health stats mismatch") { 298 log.Warningf("deleting tablet %v from healthcheck due to health stats mismatch", thc.Tablet) 299 hc.deleteTablet(thc.Tablet) 300 return 301 } 302 // trivialUpdate = false because this is an error 303 // up = false because we did not get a healthy response 304 hc.updateHealth(thc.SimpleCopy(), thc.Target, false, false) 305 } 306 // If there was a timeout send an error. We do this after stream has returned. 307 // This will ensure that this update prevails over any previous message that 308 // stream could have sent. 309 if timedout.Get() { 310 thc.LastError = fmt.Errorf("healthcheck timed out (latest %v)", thc.lastResponseTimestamp) 311 thc.setServingState(false, thc.LastError.Error()) 312 hcErrorCounters.Add([]string{thc.Target.Keyspace, thc.Target.Shard, topoproto.TabletTypeLString(thc.Target.TabletType)}, 1) 313 // trivialUpdate = false because this is an error 314 // up = false because we did not get a healthy response within the timeout 315 hc.updateHealth(thc.SimpleCopy(), thc.Target, false, false) 316 } 317 318 // Streaming RPC failed e.g. because vttablet was restarted or took too long. 319 // Sleep until the next retry is up or the context is done/canceled. 320 select { 321 case <-thc.ctx.Done(): 322 return 323 case <-time.After(retryDelay): 324 // Exponentially back-off to prevent tight-loop. 325 retryDelay *= 2 326 // Limit the retry delay backoff to the health check timeout 327 if retryDelay > hc.healthCheckTimeout { 328 retryDelay = hc.healthCheckTimeout 329 } 330 } 331 } 332 } 333 334 func (thc *tabletHealthCheck) closeConnection(ctx context.Context, err error) { 335 log.Warningf("tablet %v healthcheck stream error: %v", thc.Tablet, err) 336 thc.setServingState(false, err.Error()) 337 thc.LastError = err 338 _ = thc.Conn.Close(ctx) 339 thc.Conn = nil 340 } 341 342 // finalizeConn closes the health checking connection. 343 // To be called only on exit from checkConn(). 344 func (thc *tabletHealthCheck) finalizeConn() { 345 thc.setServingState(false, "finalizeConn closing connection") 346 // Note: checkConn() exits only when thc.ctx.Done() is closed. Thus it's 347 // safe to simply get Err() value here and assign to LastError. 348 thc.LastError = thc.ctx.Err() 349 if thc.Conn != nil { 350 // Don't use thc.ctx because it's already closed. 351 // Use a separate context, and add a timeout to prevent unbounded waits. 352 ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) 353 defer cancel() 354 _ = thc.Conn.Close(ctx) 355 thc.Conn = nil 356 } 357 }