vitess.io/vitess@v0.16.2/go/vt/discovery/tablet_health_check.go (about)

     1  /*
     2  Copyright 2020 The Vitess Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package discovery
    18  
    19  import (
    20  	"context"
    21  	"fmt"
    22  	"strings"
    23  	"sync"
    24  	"time"
    25  
    26  	"vitess.io/vitess/go/sync2"
    27  
    28  	"vitess.io/vitess/go/vt/grpcclient"
    29  	"vitess.io/vitess/go/vt/log"
    30  	"vitess.io/vitess/go/vt/proto/vtrpc"
    31  	"vitess.io/vitess/go/vt/topo/topoproto"
    32  	"vitess.io/vitess/go/vt/topotools"
    33  	"vitess.io/vitess/go/vt/vterrors"
    34  	"vitess.io/vitess/go/vt/vttablet/queryservice"
    35  	"vitess.io/vitess/go/vt/vttablet/tabletconn"
    36  
    37  	"google.golang.org/protobuf/proto"
    38  
    39  	"vitess.io/vitess/go/vt/proto/query"
    40  	"vitess.io/vitess/go/vt/proto/topodata"
    41  )
    42  
// tabletHealthCheck maintains the health status of a tablet. A map of this
// structure is maintained in HealthCheck.
//
// Most fields are written by the checkConn goroutine (through
// processResponse, closeConnection and finalizeConn) and snapshotted for
// consumers by SimpleCopy.
type tabletHealthCheck struct {
	// ctx bounds the lifetime of health checking for this tablet: checkConn
	// derives each stream context from it and stops retrying once it is done.
	ctx context.Context
	// cancelFunc must be called before discarding tabletHealthCheck.
	// This will ensure that the associated checkConn goroutine will terminate.
	cancelFunc context.CancelFunc
	// Tablet is the tablet object that was sent to HealthCheck.AddTablet.
	Tablet *topodata.Tablet
	// connMu is the mutex that protects Conn. Connection and SimpleCopy hold
	// it while reading or lazily creating Conn.
	connMu sync.Mutex
	// Conn is the connection associated with the tablet. It is created lazily
	// by connectionLocked and is nil until the first successful dial and after
	// the connection has been closed.
	Conn queryservice.QueryService
	// Target is the current target as returned by the streaming
	// StreamHealth RPC.
	Target *query.Target
	// Serving describes if the tablet can be serving traffic.
	// It is only changed through setServingState.
	Serving bool
	// PrimaryTermStartTime is the last time at which
	// this tablet was either elected the primary, or received
	// a TabletExternallyReparented event. It is set to 0 if the
	// tablet doesn't think it's a primary.
	PrimaryTermStartTime int64
	// Stats is the current health status, as received by the
	// StreamHealth RPC (replication lag, ...).
	Stats *query.RealtimeStats
	// LastError is the error we last saw when trying to get the
	// tablet's healthcheck.
	LastError error
	// Internal bookkeeping, not copied into TabletHealth by SimpleCopy.
	// loggedServingState is set once setServingState has logged at least one
	// serving-state line, so that the very first state is always logged.
	// lastResponseTimestamp is only used to build the timeout error message
	// in checkConn.
	// NOTE(review): the original author flagged both as candidates for
	// deletion ("possibly delete both these").
	loggedServingState    bool
	lastResponseTimestamp time.Time // timestamp of the last healthcheck response
}
    76  
    77  // String is defined because we want to print a []*tabletHealthCheck array nicely.
    78  func (thc *tabletHealthCheck) String() string {
    79  	return fmt.Sprintf("tabletHealthCheck{Tablet: %v,Target: %v,Serving: %v, PrimaryTermStartTime: %v, Stats: %v, LastError: %v",
    80  		thc.Tablet, thc.Target, thc.Serving, thc.PrimaryTermStartTime, thc.Stats, thc.LastError)
    81  }
    82  
    83  // SimpleCopy returns a TabletHealth with all the necessary fields copied from tabletHealthCheck.
    84  // Note that this is not a deep copy because we point to the same underlying RealtimeStats.
    85  // That is fine because the RealtimeStats object is never changed after creation.
    86  func (thc *tabletHealthCheck) SimpleCopy() *TabletHealth {
    87  	thc.connMu.Lock()
    88  	defer thc.connMu.Unlock()
    89  	return &TabletHealth{
    90  		Conn:                 thc.Conn,
    91  		Tablet:               thc.Tablet,
    92  		Target:               thc.Target,
    93  		Stats:                thc.Stats,
    94  		LastError:            thc.LastError,
    95  		PrimaryTermStartTime: thc.PrimaryTermStartTime,
    96  		Serving:              thc.Serving,
    97  	}
    98  }
    99  
   100  // setServingState sets the tablet state to the given value.
   101  //
   102  // If the state changes, it logs the change so that failures
   103  // from the health check connection are logged the first time,
   104  // but don't continue to log if the connection stays down.
   105  //
   106  // thc.mu must be locked before calling this function
   107  func (thc *tabletHealthCheck) setServingState(serving bool, reason string) {
   108  	if !thc.loggedServingState || (serving != thc.Serving) {
   109  		// Emit the log from a separate goroutine to avoid holding
   110  		// the th lock while logging is happening
   111  		log.Infof("HealthCheckUpdate(Serving State): tablet: %v serving %v => %v for %v/%v (%v) reason: %s",
   112  			topotools.TabletIdent(thc.Tablet),
   113  			thc.Serving,
   114  			serving,
   115  			thc.Tablet.GetKeyspace(),
   116  			thc.Tablet.GetShard(),
   117  			thc.Target.GetTabletType(),
   118  			reason,
   119  		)
   120  		thc.loggedServingState = true
   121  	}
   122  	thc.Serving = serving
   123  }
   124  
   125  // stream streams healthcheck responses to callback.
   126  func (thc *tabletHealthCheck) stream(ctx context.Context, callback func(*query.StreamHealthResponse) error) error {
   127  	conn := thc.Connection()
   128  	if conn == nil {
   129  		// This signals the caller to retry
   130  		return nil
   131  	}
   132  	err := conn.StreamHealth(ctx, callback)
   133  	if err != nil {
   134  		// Depending on the specific error the caller can take action
   135  		thc.closeConnection(ctx, err)
   136  	}
   137  	return err
   138  }
   139  
// Connection returns the query service connection for this tablet, creating
// it through connectionLocked if none is cached yet. connMu is held for the
// duration of the call. The result is nil when dialing fails; in that case
// connectionLocked has stored the dial error in LastError.
func (thc *tabletHealthCheck) Connection() queryservice.QueryService {
	thc.connMu.Lock()
	defer thc.connMu.Unlock()
	return thc.connectionLocked()
}
   145  
   146  func (thc *tabletHealthCheck) connectionLocked() queryservice.QueryService {
   147  	if thc.Conn == nil {
   148  		conn, err := tabletconn.GetDialer()(thc.Tablet, grpcclient.FailFast(true))
   149  		if err != nil {
   150  			thc.LastError = err
   151  			return nil
   152  		}
   153  		thc.Conn = conn
   154  		thc.LastError = nil
   155  	}
   156  	return thc.Conn
   157  }
   158  
   159  // processResponse reads one health check response, and updates health
   160  func (thc *tabletHealthCheck) processResponse(hc *HealthCheckImpl, shr *query.StreamHealthResponse) error {
   161  	select {
   162  	case <-thc.ctx.Done():
   163  		return thc.ctx.Err()
   164  	default:
   165  	}
   166  
   167  	// Check for invalid data, better than panicking.
   168  	if shr.Target == nil || shr.RealtimeStats == nil {
   169  		return fmt.Errorf("health stats is not valid: %v", shr)
   170  	}
   171  
   172  	// an app-level error from tablet, force serving state.
   173  	var healthErr error
   174  	serving := shr.Serving
   175  	if shr.RealtimeStats.HealthError != "" {
   176  		healthErr = fmt.Errorf("vttablet error: %v", shr.RealtimeStats.HealthError)
   177  		serving = false
   178  	}
   179  
   180  	if shr.TabletAlias != nil && !proto.Equal(shr.TabletAlias, thc.Tablet.Alias) {
   181  		// TabletAlias change means that the host:port has been taken over by another tablet
   182  		// We cancel / exit the healthcheck for this tablet right away
   183  		// With the next topo refresh we will get a new tablet with the new host/port
   184  		return vterrors.New(vtrpc.Code_FAILED_PRECONDITION, fmt.Sprintf("health stats mismatch, tablet %+v alias does not match response alias %v", thc.Tablet, shr.TabletAlias))
   185  	}
   186  
   187  	prevTarget := thc.Target
   188  	// check whether this is a trivial update so as to update healthy map
   189  	trivialUpdate := thc.LastError == nil && thc.Serving && shr.RealtimeStats.HealthError == "" && shr.Serving &&
   190  		prevTarget.TabletType != topodata.TabletType_PRIMARY && prevTarget.TabletType == shr.Target.TabletType && thc.isTrivialReplagChange(shr.RealtimeStats)
   191  	thc.lastResponseTimestamp = time.Now()
   192  	thc.Target = shr.Target
   193  	thc.PrimaryTermStartTime = shr.TabletExternallyReparentedTimestamp
   194  	thc.Stats = shr.RealtimeStats
   195  	thc.LastError = healthErr
   196  	reason := "healthCheck update"
   197  	if healthErr != nil {
   198  		reason = "healthCheck update error: " + healthErr.Error()
   199  	}
   200  	thc.setServingState(serving, reason)
   201  
   202  	// notify downstream for primary change
   203  	hc.updateHealth(thc.SimpleCopy(), prevTarget, trivialUpdate, thc.Serving)
   204  	return nil
   205  }
   206  
   207  // isTrivialReplagChange returns true iff the old and new RealtimeStats
   208  // haven't changed enough to warrant re-calling FilterLegacyStatsByReplicationLag.
   209  func (thc *tabletHealthCheck) isTrivialReplagChange(newStats *query.RealtimeStats) bool {
   210  	// first time always return false
   211  	if thc.Stats == nil {
   212  		return false
   213  	}
   214  	// Skip replag filter when replag remains in the low rep lag range,
   215  	// which should be the case majority of the time.
   216  	lowRepLag := lowReplicationLag.Seconds()
   217  	oldRepLag := float64(thc.Stats.ReplicationLagSeconds)
   218  	newRepLag := float64(newStats.ReplicationLagSeconds)
   219  	if oldRepLag <= lowRepLag && newRepLag <= lowRepLag {
   220  		return true
   221  	}
   222  	// Skip replag filter when replag remains in the high rep lag range,
   223  	// and did not change beyond +/- 10%.
   224  	// when there is a high rep lag, it takes a long time for it to reduce,
   225  	// so it is not necessary to re-calculate every time.
   226  	// In that case, we won't save the new record, so we still
   227  	// remember the original replication lag.
   228  	if oldRepLag > lowRepLag && newRepLag > lowRepLag && newRepLag < oldRepLag*1.1 && newRepLag > oldRepLag*0.9 {
   229  		return true
   230  	}
   231  	return false
   232  }
   233  
// checkConn performs health checking on the given tablet.
//
// It loops until thc.ctx is done (or a "health stats mismatch" error is
// seen): each iteration opens a health stream, feeds every response to
// processResponse, and after the stream ends reports errors and timeouts to
// hc.updateHealth before sleeping for a back-off delay. On return it always
// runs finalizeConn and then calls hc.connsWG.Done().
func (thc *tabletHealthCheck) checkConn(hc *HealthCheckImpl) {
	defer func() {
		// TODO(deepthi): We should ensure any return from this func calls the equivalent of hc.deleteTablet
		thc.finalizeConn()
		hc.connsWG.Done()
	}()

	// Initialize error counter
	// (adding 0 does not change the count).
	hcErrorCounters.Add([]string{thc.Target.Keyspace, thc.Target.Shard, topoproto.TabletTypeLString(thc.Target.TabletType)}, 0)

	retryDelay := hc.retryDelay
	for {
		// Each stream attempt gets its own cancellable context derived from
		// thc.ctx, so it can be aborted on timeout without ending the loop.
		streamCtx, streamCancel := context.WithCancel(thc.ctx)

		// Setup a watcher that restarts the timer every time an update is received.
		// If a timeout occurs for a serving tablet, we make it non-serving and send
		// a status update. The stream is also terminated so it can be retried.
		// servingStatus acts purely as a heartbeat: the watcher discards the
		// value it receives and only uses the receive to restart its timer.
		servingStatus := make(chan bool, 1)
		// timedout is accessed atomically because there could be a race
		// between the goroutine that sets it and the check for its value
		// later.
		timedout := sync2.NewAtomicBool(false)
		go func() {
			for {
				select {
				case <-servingStatus:
					// A response arrived; looping re-enters the select with a
					// fresh time.After timer.
					continue
				case <-time.After(hc.healthCheckTimeout):
					// No response within the timeout: record it and cancel
					// the stream so that thc.stream returns.
					timedout.Set(true)
					streamCancel()
					return
				case <-streamCtx.Done():
					// If the stream is done, stop watching.
					return
				}
			}
		}()

		// Read stream health responses.
		err := thc.stream(streamCtx, func(shr *query.StreamHealthResponse) error {
			// We received a message. Reset the back-off.
			retryDelay = hc.retryDelay
			// Don't block on send to avoid deadlocks.
			// If the one-slot buffer is already full the heartbeat is dropped.
			select {
			case servingStatus <- shr.Serving:
			default:
			}
			return thc.processResponse(hc, shr)
		})

		// streamCancel to make sure the watcher goroutine terminates.
		streamCancel()

		if err != nil {
			hcErrorCounters.Add([]string{thc.Target.Keyspace, thc.Target.Shard, topoproto.TabletTypeLString(thc.Target.TabletType)}, 1)
			// This means that another tablet has taken over the host:port that we were connected to.
			// So let's remove the tablet's data from the healthcheck, and if it is still a part of the
			// cluster, the new tablet record will be fetched from the topology server and re-added to
			// the healthcheck cache again via the topology watcher.
			// WARNING: Under no other circumstances should we be deleting the tablet here.
			// The match is on the message text produced by processResponse for
			// an alias mismatch.
			if strings.Contains(err.Error(), "health stats mismatch") {
				log.Warningf("deleting tablet %v from healthcheck due to health stats mismatch", thc.Tablet)
				hc.deleteTablet(thc.Tablet)
				return
			}
			// trivialUpdate = false because this is an error
			// up = false because we did not get a healthy response
			hc.updateHealth(thc.SimpleCopy(), thc.Target, false, false)
		}
		// If there was a timeout send an error. We do this after stream has returned.
		// This will ensure that this update prevails over any previous message that
		// stream could have sent.
		if timedout.Get() {
			thc.LastError = fmt.Errorf("healthcheck timed out (latest %v)", thc.lastResponseTimestamp)
			thc.setServingState(false, thc.LastError.Error())
			hcErrorCounters.Add([]string{thc.Target.Keyspace, thc.Target.Shard, topoproto.TabletTypeLString(thc.Target.TabletType)}, 1)
			// trivialUpdate = false because this is an error
			// up = false because we did not get a healthy response within the timeout
			hc.updateHealth(thc.SimpleCopy(), thc.Target, false, false)
		}

		// Streaming RPC failed e.g. because vttablet was restarted or took too long.
		// Sleep until the next retry is up or the context is done/canceled.
		select {
		case <-thc.ctx.Done():
			return
		case <-time.After(retryDelay):
			// Exponentially back-off to prevent tight-loop.
			// The doubled delay applies to the next sleep; it is reset to
			// hc.retryDelay as soon as a response is received.
			retryDelay *= 2
			// Limit the retry delay backoff to the health check timeout
			if retryDelay > hc.healthCheckTimeout {
				retryDelay = hc.healthCheckTimeout
			}
		}
	}
}
   333  
   334  func (thc *tabletHealthCheck) closeConnection(ctx context.Context, err error) {
   335  	log.Warningf("tablet %v healthcheck stream error: %v", thc.Tablet, err)
   336  	thc.setServingState(false, err.Error())
   337  	thc.LastError = err
   338  	_ = thc.Conn.Close(ctx)
   339  	thc.Conn = nil
   340  }
   341  
   342  // finalizeConn closes the health checking connection.
   343  // To be called only on exit from checkConn().
   344  func (thc *tabletHealthCheck) finalizeConn() {
   345  	thc.setServingState(false, "finalizeConn closing connection")
   346  	// Note: checkConn() exits only when thc.ctx.Done() is closed. Thus it's
   347  	// safe to simply get Err() value here and assign to LastError.
   348  	thc.LastError = thc.ctx.Err()
   349  	if thc.Conn != nil {
   350  		// Don't use thc.ctx because it's already closed.
   351  		// Use a separate context, and add a timeout to prevent unbounded waits.
   352  		ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
   353  		defer cancel()
   354  		_ = thc.Conn.Close(ctx)
   355  		thc.Conn = nil
   356  	}
   357  }