github.com/KinWaiYuen/client-go/v2@v2.5.4/internal/locate/region_cache.go (about)

     1  // Copyright 2021 TiKV Authors
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  // NOTE: The code in this file is based on code from the
    16  // TiDB project, licensed under the Apache License v 2.0
    17  //
    18  // https://github.com/pingcap/tidb/tree/cc5e161ac06827589c4966674597c137cc9e809c/store/tikv/locate/region_cache.go
    19  //
    20  
    21  // Copyright 2016 PingCAP, Inc.
    22  //
    23  // Licensed under the Apache License, Version 2.0 (the "License");
    24  // you may not use this file except in compliance with the License.
    25  // You may obtain a copy of the License at
    26  //
    27  //     http://www.apache.org/licenses/LICENSE-2.0
    28  //
    29  // Unless required by applicable law or agreed to in writing, software
    30  // distributed under the License is distributed on an "AS IS" BASIS,
    31  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    32  // See the License for the specific language governing permissions and
    33  // limitations under the License.
    34  
    35  package locate
    36  
    37  import (
    38  	"bytes"
    39  	"context"
    40  	"fmt"
    41  	"math/rand"
    42  	"strings"
    43  	"sync"
    44  	"sync/atomic"
    45  	"time"
    46  	"unsafe"
    47  
    48  	"github.com/KinWaiYuen/client-go/v2/config"
    49  	"github.com/KinWaiYuen/client-go/v2/internal/client"
    50  	"github.com/KinWaiYuen/client-go/v2/internal/logutil"
    51  	"github.com/KinWaiYuen/client-go/v2/internal/retry"
    52  	"github.com/KinWaiYuen/client-go/v2/kv"
    53  	"github.com/KinWaiYuen/client-go/v2/metrics"
    54  	"github.com/KinWaiYuen/client-go/v2/tikvrpc"
    55  	"github.com/KinWaiYuen/client-go/v2/util"
    56  	"github.com/gogo/protobuf/proto"
    57  	"github.com/google/btree"
    58  	"github.com/opentracing/opentracing-go"
    59  	"github.com/pingcap/errors"
    60  	"github.com/pingcap/kvproto/pkg/metapb"
    61  	"github.com/pingcap/parser/terror"
    62  	pd "github.com/tikv/pd/client"
    63  	atomic2 "go.uber.org/atomic"
    64  	"go.uber.org/zap"
    65  	"golang.org/x/sync/singleflight"
    66  	"google.golang.org/grpc"
    67  	"google.golang.org/grpc/backoff"
    68  	"google.golang.org/grpc/credentials"
    69  	healthpb "google.golang.org/grpc/health/grpc_health_v1"
    70  	"google.golang.org/grpc/keepalive"
    71  )
    72  
    73  const (
    74  	btreeDegree               = 32
    75  	invalidatedLastAccessTime = -1
    76  	defaultRegionsPerBatch    = 128
    77  )
    78  
    79  // regionCacheTTLSec is the max idle time for regions in the region cache.
    80  var regionCacheTTLSec int64 = 600
    81  
    82  // SetRegionCacheTTLSec sets regionCacheTTLSec to t.
    83  func SetRegionCacheTTLSec(t int64) {
    84  	regionCacheTTLSec = t
    85  }
    86  
    87  const (
    88  	updated  int32 = iota // region is updated and no need to reload.
    89  	needSync              // need sync new region info.
    90  )
    91  
    92  // InvalidReason is the reason why a cached region is invalidated.
    93  // The region cache may take different strategies to handle different reasons.
    94  // For example, when a cached region is invalidated due to no leader, region cache
    95  // will always access to a different peer.
    96  type InvalidReason int32
    97  
    98  const (
    99  	// Ok indicates the cached region is valid
   100  	Ok InvalidReason = iota
   101  	// NoLeader indicates it's invalidated due to no leader
   102  	NoLeader
   103  	// RegionNotFound indicates it's invalidated due to region not found in the store
   104  	RegionNotFound
   105  	// EpochNotMatch indicates it's invalidated due to epoch not match
   106  	EpochNotMatch
   107  	// StoreNotFound indicates it's invalidated due to store not found in PD
   108  	StoreNotFound
   109  	// Other indicates it's invalidated due to other reasons, e.g., the store
   110  	// is removed from the cluster, fail to send requests to the store.
   111  	Other
   112  )
   113  
   114  // Region presents kv region
   115  type Region struct {
   116  	meta          *metapb.Region // raw region meta from PD immutable after init
   117  	store         unsafe.Pointer // point to region store info, see RegionStore
   118  	syncFlag      int32          // region need be sync in next turn
   119  	lastAccess    int64          // last region access time, see checkRegionCacheTTL
   120  	invalidReason InvalidReason  // the reason why the region is invalidated
   121  }
   122  
   123  // AccessIndex represent the index for accessIndex array
   124  type AccessIndex int
   125  
   126  // regionStore represents region stores info
   127  // it will be store as unsafe.Pointer and be load at once
   128  type regionStore struct {
   129  	workTiKVIdx    AccessIndex          // point to current work peer in meta.Peers and work store in stores(same idx) for tikv peer
   130  	proxyTiKVIdx   AccessIndex          // point to the tikv peer that can forward requests to the leader. -1 means not using proxy
   131  	workTiFlashIdx int32                // point to current work peer in meta.Peers and work store in stores(same idx) for tiflash peer
   132  	stores         []*Store             // stores in this region
   133  	storeEpochs    []uint32             // snapshots of store's epoch, need reload when `storeEpochs[curr] != stores[cur].fail`
   134  	accessIndex    [numAccessMode][]int // AccessMode => idx in stores
   135  }
   136  
   137  func (r *regionStore) accessStore(mode accessMode, idx AccessIndex) (int, *Store) {
   138  	sidx := r.accessIndex[mode][idx]
   139  	return sidx, r.stores[sidx]
   140  }
   141  
   142  func (r *regionStore) getAccessIndex(mode accessMode, store *Store) AccessIndex {
   143  	for index, sidx := range r.accessIndex[mode] {
   144  		if r.stores[sidx].storeID == store.storeID {
   145  			return AccessIndex(index)
   146  		}
   147  	}
   148  	return -1
   149  }
   150  
   151  func (r *regionStore) accessStoreNum(mode accessMode) int {
   152  	return len(r.accessIndex[mode])
   153  }
   154  
   155  // clone clones region store struct.
   156  func (r *regionStore) clone() *regionStore {
   157  	storeEpochs := make([]uint32, len(r.stores))
   158  	rs := &regionStore{
   159  		workTiFlashIdx: r.workTiFlashIdx,
   160  		proxyTiKVIdx:   r.proxyTiKVIdx,
   161  		workTiKVIdx:    r.workTiKVIdx,
   162  		stores:         r.stores,
   163  		storeEpochs:    storeEpochs,
   164  	}
   165  	copy(storeEpochs, r.storeEpochs)
   166  	for i := 0; i < int(numAccessMode); i++ {
   167  		rs.accessIndex[i] = make([]int, len(r.accessIndex[i]))
   168  		copy(rs.accessIndex[i], r.accessIndex[i])
   169  	}
   170  	return rs
   171  }
   172  
   173  // return next follower store's index
   174  func (r *regionStore) follower(seed uint32, op *storeSelectorOp) AccessIndex {
   175  	l := uint32(r.accessStoreNum(tiKVOnly))
   176  	if l <= 1 {
   177  		return r.workTiKVIdx
   178  	}
   179  
   180  	for retry := l - 1; retry > 0; retry-- {
   181  		followerIdx := AccessIndex(seed % (l - 1))
   182  		if followerIdx >= r.workTiKVIdx {
   183  			followerIdx++
   184  		}
   185  		storeIdx, s := r.accessStore(tiKVOnly, followerIdx)
   186  		if r.storeEpochs[storeIdx] == atomic.LoadUint32(&s.epoch) && r.filterStoreCandidate(followerIdx, op) {
   187  			return followerIdx
   188  		}
   189  		seed++
   190  	}
   191  	return r.workTiKVIdx
   192  }
   193  
   194  // return next leader or follower store's index
   195  func (r *regionStore) kvPeer(seed uint32, op *storeSelectorOp) AccessIndex {
   196  	if op.leaderOnly {
   197  		return r.workTiKVIdx
   198  	}
   199  	candidates := make([]AccessIndex, 0, r.accessStoreNum(tiKVOnly))
   200  	for i := 0; i < r.accessStoreNum(tiKVOnly); i++ {
   201  		accessIdx := AccessIndex(i)
   202  		storeIdx, s := r.accessStore(tiKVOnly, accessIdx)
   203  		if r.storeEpochs[storeIdx] != atomic.LoadUint32(&s.epoch) || !r.filterStoreCandidate(accessIdx, op) {
   204  			continue
   205  		}
   206  		candidates = append(candidates, accessIdx)
   207  	}
   208  	// If there is no candidates, send to current workTiKVIdx which generally is the leader.
   209  	if len(candidates) == 0 {
   210  		return r.workTiKVIdx
   211  	}
   212  	return candidates[seed%uint32(len(candidates))]
   213  }
   214  
   215  func (r *regionStore) filterStoreCandidate(aidx AccessIndex, op *storeSelectorOp) bool {
   216  	_, s := r.accessStore(tiKVOnly, aidx)
   217  	// filter label unmatched store
   218  	return s.IsLabelsMatch(op.labels)
   219  }
   220  
   221  // init initializes region after constructed.
   222  func (r *Region) init(bo *retry.Backoffer, c *RegionCache) error {
   223  	// region store pull used store from global store map
   224  	// to avoid acquire storeMu in later access.
   225  	rs := &regionStore{
   226  		workTiKVIdx:    0,
   227  		proxyTiKVIdx:   -1,
   228  		workTiFlashIdx: 0,
   229  		stores:         make([]*Store, 0, len(r.meta.Peers)),
   230  		storeEpochs:    make([]uint32, 0, len(r.meta.Peers)),
   231  	}
   232  	availablePeers := r.meta.GetPeers()[:0]
   233  	for _, p := range r.meta.Peers {
   234  		c.storeMu.RLock()
   235  		store, exists := c.storeMu.stores[p.StoreId]
   236  		c.storeMu.RUnlock()
   237  		if !exists {
   238  			store = c.getStoreByStoreID(p.StoreId)
   239  		}
   240  		addr, err := store.initResolve(bo, c)
   241  		if err != nil {
   242  			return err
   243  		}
   244  		// Filter the peer on a tombstone store.
   245  		if addr == "" {
   246  			continue
   247  		}
   248  		availablePeers = append(availablePeers, p)
   249  		switch store.storeType {
   250  		case tikvrpc.TiKV:
   251  			rs.accessIndex[tiKVOnly] = append(rs.accessIndex[tiKVOnly], len(rs.stores))
   252  		case tikvrpc.TiFlash:
   253  			rs.accessIndex[tiFlashOnly] = append(rs.accessIndex[tiFlashOnly], len(rs.stores))
   254  		}
   255  		rs.stores = append(rs.stores, store)
   256  		rs.storeEpochs = append(rs.storeEpochs, atomic.LoadUint32(&store.epoch))
   257  	}
   258  	// TODO(youjiali1995): It's possible the region info in PD is stale for now but it can recover.
   259  	// Maybe we need backoff here.
   260  	if len(availablePeers) == 0 {
   261  		return errors.Errorf("no available peers, region: {%v}", r.meta)
   262  	}
   263  	r.meta.Peers = availablePeers
   264  
   265  	atomic.StorePointer(&r.store, unsafe.Pointer(rs))
   266  
   267  	// mark region has been init accessed.
   268  	r.lastAccess = time.Now().Unix()
   269  	return nil
   270  }
   271  
   272  func (r *Region) getStore() (store *regionStore) {
   273  	store = (*regionStore)(atomic.LoadPointer(&r.store))
   274  	return
   275  }
   276  
   277  func (r *Region) compareAndSwapStore(oldStore, newStore *regionStore) bool {
   278  	return atomic.CompareAndSwapPointer(&r.store, unsafe.Pointer(oldStore), unsafe.Pointer(newStore))
   279  }
   280  
   281  func (r *Region) checkRegionCacheTTL(ts int64) bool {
   282  	// Only consider use percentage on this failpoint, for example, "2%return"
   283  	if _, err := util.EvalFailpoint("invalidateRegionCache"); err == nil {
   284  		r.invalidate(Other)
   285  	}
   286  	for {
   287  		lastAccess := atomic.LoadInt64(&r.lastAccess)
   288  		if ts-lastAccess > regionCacheTTLSec {
   289  			return false
   290  		}
   291  		if atomic.CompareAndSwapInt64(&r.lastAccess, lastAccess, ts) {
   292  			return true
   293  		}
   294  	}
   295  }
   296  
   297  // invalidate invalidates a region, next time it will got null result.
   298  func (r *Region) invalidate(reason InvalidReason) {
   299  	metrics.RegionCacheCounterWithInvalidateRegionFromCacheOK.Inc()
   300  	atomic.StoreInt32((*int32)(&r.invalidReason), int32(reason))
   301  	atomic.StoreInt64(&r.lastAccess, invalidatedLastAccessTime)
   302  }
   303  
   304  // scheduleReload schedules reload region request in next LocateKey.
   305  func (r *Region) scheduleReload() {
   306  	oldValue := atomic.LoadInt32(&r.syncFlag)
   307  	if oldValue != updated {
   308  		return
   309  	}
   310  	atomic.CompareAndSwapInt32(&r.syncFlag, oldValue, needSync)
   311  }
   312  
   313  // checkNeedReloadAndMarkUpdated returns whether the region need reload and marks the region to be updated.
   314  func (r *Region) checkNeedReloadAndMarkUpdated() bool {
   315  	oldValue := atomic.LoadInt32(&r.syncFlag)
   316  	if oldValue == updated {
   317  		return false
   318  	}
   319  	return atomic.CompareAndSwapInt32(&r.syncFlag, oldValue, updated)
   320  }
   321  
   322  func (r *Region) checkNeedReload() bool {
   323  	v := atomic.LoadInt32(&r.syncFlag)
   324  	return v != updated
   325  }
   326  
   327  func (r *Region) isValid() bool {
   328  	return r != nil && !r.checkNeedReload() && r.checkRegionCacheTTL(time.Now().Unix())
   329  }
   330  
   331  // RegionCache caches Regions loaded from PD.
   332  type RegionCache struct {
   333  	pdClient         pd.Client
   334  	enableForwarding bool
   335  
   336  	mu struct {
   337  		sync.RWMutex                           // mutex protect cached region
   338  		regions        map[RegionVerID]*Region // cached regions are organized as regionVerID to region ref mapping
   339  		latestVersions map[uint64]RegionVerID  // cache the map from regionID to its latest RegionVerID
   340  		sorted         *btree.BTree            // cache regions are organized as sorted key to region ref mapping
   341  	}
   342  	storeMu struct {
   343  		sync.RWMutex
   344  		stores map[uint64]*Store
   345  	}
   346  	notifyCheckCh chan struct{}
   347  	closeCh       chan struct{}
   348  
   349  	testingKnobs struct {
   350  		// Replace the requestLiveness function for test purpose. Note that in unit tests, if this is not set,
   351  		// requestLiveness always returns unreachable.
   352  		mockRequestLiveness func(s *Store, bo *retry.Backoffer) livenessState
   353  	}
   354  }
   355  
   356  // NewRegionCache creates a RegionCache.
   357  func NewRegionCache(pdClient pd.Client) *RegionCache {
   358  	c := &RegionCache{
   359  		pdClient: pdClient,
   360  	}
   361  	c.mu.regions = make(map[RegionVerID]*Region)
   362  	c.mu.latestVersions = make(map[uint64]RegionVerID)
   363  	c.mu.sorted = btree.New(btreeDegree)
   364  	c.storeMu.stores = make(map[uint64]*Store)
   365  	c.notifyCheckCh = make(chan struct{}, 1)
   366  	c.closeCh = make(chan struct{})
   367  	interval := config.GetGlobalConfig().StoresRefreshInterval
   368  	go c.asyncCheckAndResolveLoop(time.Duration(interval) * time.Second)
   369  	c.enableForwarding = config.GetGlobalConfig().EnableForwarding
   370  	return c
   371  }
   372  
   373  // clear clears all cached data in the RegionCache. It's only used in tests.
   374  func (c *RegionCache) clear() {
   375  	c.mu.Lock()
   376  	c.mu.regions = make(map[RegionVerID]*Region)
   377  	c.mu.latestVersions = make(map[uint64]RegionVerID)
   378  	c.mu.sorted = btree.New(btreeDegree)
   379  	c.mu.Unlock()
   380  	c.storeMu.Lock()
   381  	c.storeMu.stores = make(map[uint64]*Store)
   382  	c.storeMu.Unlock()
   383  }
   384  
   385  // Close releases region cache's resource.
   386  func (c *RegionCache) Close() {
   387  	close(c.closeCh)
   388  }
   389  
   390  // asyncCheckAndResolveLoop with
   391  func (c *RegionCache) asyncCheckAndResolveLoop(interval time.Duration) {
   392  	ticker := time.NewTicker(interval)
   393  	defer ticker.Stop()
   394  	var needCheckStores []*Store
   395  	for {
   396  		needCheckStores = needCheckStores[:0]
   397  		select {
   398  		case <-c.closeCh:
   399  			return
   400  		case <-c.notifyCheckCh:
   401  			c.checkAndResolve(needCheckStores, func(s *Store) bool {
   402  				return s.getResolveState() == needCheck
   403  			})
   404  		case <-ticker.C:
   405  			// refresh store to update labels.
   406  			c.checkAndResolve(needCheckStores, func(s *Store) bool {
   407  				state := s.getResolveState()
   408  				// Only valid stores should be reResolved. In fact, it's impossible
   409  				// there's a deleted store in the stores map which guaranteed by reReslve().
   410  				return state != unresolved && state != tombstone && state != deleted
   411  			})
   412  		}
   413  	}
   414  }
   415  
   416  // checkAndResolve checks and resolve addr of failed stores.
   417  // this method isn't thread-safe and only be used by one goroutine.
   418  func (c *RegionCache) checkAndResolve(needCheckStores []*Store, needCheck func(*Store) bool) {
   419  	defer func() {
   420  		r := recover()
   421  		if r != nil {
   422  			logutil.BgLogger().Error("panic in the checkAndResolve goroutine",
   423  				zap.Reflect("r", r),
   424  				zap.Stack("stack trace"))
   425  		}
   426  	}()
   427  
   428  	c.storeMu.RLock()
   429  	for _, store := range c.storeMu.stores {
   430  		if needCheck(store) {
   431  			needCheckStores = append(needCheckStores, store)
   432  		}
   433  	}
   434  	c.storeMu.RUnlock()
   435  
   436  	for _, store := range needCheckStores {
   437  		_, err := store.reResolve(c)
   438  		terror.Log(err)
   439  	}
   440  }
   441  
   442  // SetRegionCacheStore is used to set a store in region cache, for testing only
   443  func (c *RegionCache) SetRegionCacheStore(id uint64, storeType tikvrpc.EndpointType, state uint64, labels []*metapb.StoreLabel) {
   444  	c.storeMu.Lock()
   445  	defer c.storeMu.Unlock()
   446  	c.storeMu.stores[id] = &Store{
   447  		storeID:   id,
   448  		storeType: storeType,
   449  		state:     state,
   450  		labels:    labels,
   451  	}
   452  }
   453  
   454  // SetPDClient replaces pd client,for testing only
   455  func (c *RegionCache) SetPDClient(client pd.Client) {
   456  	c.pdClient = client
   457  }
   458  
   459  // RPCContext contains data that is needed to send RPC to a region.
   460  type RPCContext struct {
   461  	Region     RegionVerID
   462  	Meta       *metapb.Region
   463  	Peer       *metapb.Peer
   464  	AccessIdx  AccessIndex
   465  	Store      *Store
   466  	Addr       string
   467  	AccessMode accessMode
   468  	ProxyStore *Store // nil means proxy is not used
   469  	ProxyAddr  string // valid when ProxyStore is not nil
   470  	TiKVNum    int    // Number of TiKV nodes among the region's peers. Assuming non-TiKV peers are all TiFlash peers.
   471  }
   472  
   473  func (c *RPCContext) String() string {
   474  	var runStoreType string
   475  	if c.Store != nil {
   476  		runStoreType = c.Store.storeType.Name()
   477  	}
   478  	res := fmt.Sprintf("region ID: %d, meta: %s, peer: %s, addr: %s, idx: %d, reqStoreType: %s, runStoreType: %s",
   479  		c.Region.GetID(), c.Meta, c.Peer, c.Addr, c.AccessIdx, c.AccessMode, runStoreType)
   480  	if c.ProxyStore != nil {
   481  		res += fmt.Sprintf(", proxy store id: %d, proxy addr: %s", c.ProxyStore.storeID, c.ProxyStore.addr)
   482  	}
   483  	return res
   484  }
   485  
   486  type storeSelectorOp struct {
   487  	leaderOnly bool
   488  	labels     []*metapb.StoreLabel
   489  }
   490  
   491  // StoreSelectorOption configures storeSelectorOp.
   492  type StoreSelectorOption func(*storeSelectorOp)
   493  
   494  // WithMatchLabels indicates selecting stores with matched labels.
   495  func WithMatchLabels(labels []*metapb.StoreLabel) StoreSelectorOption {
   496  	return func(op *storeSelectorOp) {
   497  		op.labels = append(op.labels, labels...)
   498  	}
   499  }
   500  
   501  // WithLeaderOnly indicates selecting stores with leader only.
   502  func WithLeaderOnly() StoreSelectorOption {
   503  	return func(op *storeSelectorOp) {
   504  		op.leaderOnly = true
   505  	}
   506  }
   507  
   508  // GetTiKVRPCContext returns RPCContext for a region. If it returns nil, the region
   509  // must be out of date and already dropped from cache.
   510  func (c *RegionCache) GetTiKVRPCContext(bo *retry.Backoffer, id RegionVerID, replicaRead kv.ReplicaReadType, followerStoreSeed uint32, opts ...StoreSelectorOption) (*RPCContext, error) {
   511  	ts := time.Now().Unix()
   512  
   513  	cachedRegion := c.GetCachedRegionWithRLock(id)
   514  	if cachedRegion == nil {
   515  		return nil, nil
   516  	}
   517  
   518  	if cachedRegion.checkNeedReload() {
   519  		return nil, nil
   520  	}
   521  
   522  	if !cachedRegion.checkRegionCacheTTL(ts) {
   523  		return nil, nil
   524  	}
   525  
   526  	regionStore := cachedRegion.getStore()
   527  	var (
   528  		store     *Store
   529  		peer      *metapb.Peer
   530  		storeIdx  int
   531  		accessIdx AccessIndex
   532  	)
   533  	options := &storeSelectorOp{}
   534  	for _, op := range opts {
   535  		op(options)
   536  	}
   537  	isLeaderReq := false
   538  	switch replicaRead {
   539  	case kv.ReplicaReadFollower:
   540  		store, peer, accessIdx, storeIdx = cachedRegion.FollowerStorePeer(regionStore, followerStoreSeed, options)
   541  	case kv.ReplicaReadMixed:
   542  		store, peer, accessIdx, storeIdx = cachedRegion.AnyStorePeer(regionStore, followerStoreSeed, options)
   543  	default:
   544  		isLeaderReq = true
   545  		store, peer, accessIdx, storeIdx = cachedRegion.WorkStorePeer(regionStore)
   546  	}
   547  	addr, err := c.getStoreAddr(bo, cachedRegion, store)
   548  	if err != nil {
   549  		return nil, err
   550  	}
   551  	// enable by `curl -XPUT -d '1*return("[some-addr]")->return("")' http://host:port/tikvclient/injectWrongStoreAddr`
   552  	if val, err := util.EvalFailpoint("injectWrongStoreAddr"); err == nil {
   553  		if a, ok := val.(string); ok && len(a) > 0 {
   554  			addr = a
   555  		}
   556  	}
   557  	if store == nil || len(addr) == 0 {
   558  		// Store not found, region must be out of date.
   559  		cachedRegion.invalidate(StoreNotFound)
   560  		return nil, nil
   561  	}
   562  
   563  	storeFailEpoch := atomic.LoadUint32(&store.epoch)
   564  	if storeFailEpoch != regionStore.storeEpochs[storeIdx] {
   565  		cachedRegion.invalidate(Other)
   566  		logutil.BgLogger().Info("invalidate current region, because others failed on same store",
   567  			zap.Uint64("region", id.GetID()),
   568  			zap.String("store", store.addr))
   569  		return nil, nil
   570  	}
   571  
   572  	var (
   573  		proxyStore *Store
   574  		proxyAddr  string
   575  	)
   576  	if c.enableForwarding && isLeaderReq {
   577  		if atomic.LoadInt32(&store.unreachable) == 0 {
   578  			regionStore.unsetProxyStoreIfNeeded(cachedRegion)
   579  		} else {
   580  			proxyStore, _, _ = c.getProxyStore(cachedRegion, store, regionStore, accessIdx)
   581  			if proxyStore != nil {
   582  				proxyAddr, err = c.getStoreAddr(bo, cachedRegion, proxyStore)
   583  				if err != nil {
   584  					return nil, err
   585  				}
   586  			}
   587  		}
   588  	}
   589  
   590  	return &RPCContext{
   591  		Region:     id,
   592  		Meta:       cachedRegion.meta,
   593  		Peer:       peer,
   594  		AccessIdx:  accessIdx,
   595  		Store:      store,
   596  		Addr:       addr,
   597  		AccessMode: tiKVOnly,
   598  		ProxyStore: proxyStore,
   599  		ProxyAddr:  proxyAddr,
   600  		TiKVNum:    regionStore.accessStoreNum(tiKVOnly),
   601  	}, nil
   602  }
   603  
   604  // GetAllValidTiFlashStores returns the store ids of all valid TiFlash stores, the store id of currentStore is always the first one
   605  func (c *RegionCache) GetAllValidTiFlashStores(id RegionVerID, currentStore *Store) []uint64 {
   606  	// set the cap to 2 because usually, TiFlash table will have 2 replicas
   607  	allStores := make([]uint64, 0, 2)
   608  	// make sure currentStore id is always the first in allStores
   609  	allStores = append(allStores, currentStore.storeID)
   610  	ts := time.Now().Unix()
   611  	cachedRegion := c.GetCachedRegionWithRLock(id)
   612  	if cachedRegion == nil {
   613  		return allStores
   614  	}
   615  	if !cachedRegion.checkRegionCacheTTL(ts) {
   616  		return allStores
   617  	}
   618  	regionStore := cachedRegion.getStore()
   619  	currentIndex := regionStore.getAccessIndex(tiFlashOnly, currentStore)
   620  	if currentIndex == -1 {
   621  		return allStores
   622  	}
   623  	for startOffset := 1; startOffset < regionStore.accessStoreNum(tiFlashOnly); startOffset++ {
   624  		accessIdx := AccessIndex((int(currentIndex) + startOffset) % regionStore.accessStoreNum(tiFlashOnly))
   625  		storeIdx, store := regionStore.accessStore(tiFlashOnly, accessIdx)
   626  		if store.getResolveState() == needCheck {
   627  			continue
   628  		}
   629  		storeFailEpoch := atomic.LoadUint32(&store.epoch)
   630  		if storeFailEpoch != regionStore.storeEpochs[storeIdx] {
   631  			continue
   632  		}
   633  		allStores = append(allStores, store.storeID)
   634  	}
   635  	return allStores
   636  }
   637  
   638  // GetTiFlashRPCContext returns RPCContext for a region must access flash store. If it returns nil, the region
   639  // must be out of date and already dropped from cache or not flash store found.
   640  // `loadBalance` is an option. For MPP and batch cop, it is pointless and might cause try the failed store repeatly.
   641  func (c *RegionCache) GetTiFlashRPCContext(bo *retry.Backoffer, id RegionVerID, loadBalance bool) (*RPCContext, error) {
   642  	ts := time.Now().Unix()
   643  
   644  	cachedRegion := c.GetCachedRegionWithRLock(id)
   645  	if cachedRegion == nil {
   646  		return nil, nil
   647  	}
   648  	if !cachedRegion.checkRegionCacheTTL(ts) {
   649  		return nil, nil
   650  	}
   651  
   652  	regionStore := cachedRegion.getStore()
   653  
   654  	// sIdx is for load balance of TiFlash store.
   655  	var sIdx int
   656  	if loadBalance {
   657  		sIdx = int(atomic.AddInt32(&regionStore.workTiFlashIdx, 1))
   658  	} else {
   659  		sIdx = int(atomic.LoadInt32(&regionStore.workTiFlashIdx))
   660  	}
   661  	for i := 0; i < regionStore.accessStoreNum(tiFlashOnly); i++ {
   662  		accessIdx := AccessIndex((sIdx + i) % regionStore.accessStoreNum(tiFlashOnly))
   663  		storeIdx, store := regionStore.accessStore(tiFlashOnly, accessIdx)
   664  		addr, err := c.getStoreAddr(bo, cachedRegion, store)
   665  		if err != nil {
   666  			return nil, err
   667  		}
   668  		if len(addr) == 0 {
   669  			cachedRegion.invalidate(StoreNotFound)
   670  			return nil, nil
   671  		}
   672  		if store.getResolveState() == needCheck {
   673  			_, err := store.reResolve(c)
   674  			terror.Log(err)
   675  		}
   676  		atomic.StoreInt32(&regionStore.workTiFlashIdx, int32(accessIdx))
   677  		peer := cachedRegion.meta.Peers[storeIdx]
   678  		storeFailEpoch := atomic.LoadUint32(&store.epoch)
   679  		if storeFailEpoch != regionStore.storeEpochs[storeIdx] {
   680  			cachedRegion.invalidate(Other)
   681  			logutil.BgLogger().Info("invalidate current region, because others failed on same store",
   682  				zap.Uint64("region", id.GetID()),
   683  				zap.String("store", store.addr))
   684  			// TiFlash will always try to find out a valid peer, avoiding to retry too many times.
   685  			continue
   686  		}
   687  		return &RPCContext{
   688  			Region:     id,
   689  			Meta:       cachedRegion.meta,
   690  			Peer:       peer,
   691  			AccessIdx:  accessIdx,
   692  			Store:      store,
   693  			Addr:       addr,
   694  			AccessMode: tiFlashOnly,
   695  			TiKVNum:    regionStore.accessStoreNum(tiKVOnly),
   696  		}, nil
   697  	}
   698  
   699  	cachedRegion.invalidate(Other)
   700  	return nil, nil
   701  }
   702  
   703  // KeyLocation is the region and range that a key is located.
   704  type KeyLocation struct {
   705  	Region   RegionVerID
   706  	StartKey []byte
   707  	EndKey   []byte
   708  }
   709  
   710  // Contains checks if key is in [StartKey, EndKey).
   711  func (l *KeyLocation) Contains(key []byte) bool {
   712  	return bytes.Compare(l.StartKey, key) <= 0 &&
   713  		(bytes.Compare(key, l.EndKey) < 0 || len(l.EndKey) == 0)
   714  }
   715  
   716  // String implements fmt.Stringer interface.
   717  func (l *KeyLocation) String() string {
   718  	return fmt.Sprintf("region %s,startKey:%s,endKey:%s", l.Region.String(), kv.StrKey(l.StartKey), kv.StrKey(l.EndKey))
   719  }
   720  
   721  // LocateKey searches for the region and range that the key is located.
   722  func (c *RegionCache) LocateKey(bo *retry.Backoffer, key []byte) (*KeyLocation, error) {
   723  	r, err := c.findRegionByKey(bo, key, false)
   724  	if err != nil {
   725  		return nil, err
   726  	}
   727  	return &KeyLocation{
   728  		Region:   r.VerID(),
   729  		StartKey: r.StartKey(),
   730  		EndKey:   r.EndKey(),
   731  	}, nil
   732  }
   733  
   734  // LocateEndKey searches for the region and range that the key is located.
   735  // Unlike LocateKey, start key of a region is exclusive and end key is inclusive.
   736  func (c *RegionCache) LocateEndKey(bo *retry.Backoffer, key []byte) (*KeyLocation, error) {
   737  	r, err := c.findRegionByKey(bo, key, true)
   738  	if err != nil {
   739  		return nil, err
   740  	}
   741  	return &KeyLocation{
   742  		Region:   r.VerID(),
   743  		StartKey: r.StartKey(),
   744  		EndKey:   r.EndKey(),
   745  	}, nil
   746  }
   747  
   748  func (c *RegionCache) findRegionByKey(bo *retry.Backoffer, key []byte, isEndKey bool) (r *Region, err error) {
   749  	r = c.searchCachedRegion(key, isEndKey)
   750  	if r == nil {
   751  		// load region when it is not exists or expired.
   752  		lr, err := c.loadRegion(bo, key, isEndKey)
   753  		if err != nil {
   754  			// no region data, return error if failure.
   755  			return nil, err
   756  		}
   757  		logutil.Eventf(bo.GetCtx(), "load region %d from pd, due to cache-miss", lr.GetID())
   758  		r = lr
   759  		c.mu.Lock()
   760  		c.insertRegionToCache(r)
   761  		c.mu.Unlock()
   762  	} else if r.checkNeedReloadAndMarkUpdated() {
   763  		// load region when it be marked as need reload.
   764  		lr, err := c.loadRegion(bo, key, isEndKey)
   765  		if err != nil {
   766  			// ignore error and use old region info.
   767  			logutil.Logger(bo.GetCtx()).Error("load region failure",
   768  				zap.ByteString("key", key), zap.Error(err))
   769  		} else {
   770  			logutil.Eventf(bo.GetCtx(), "load region %d from pd, due to need-reload", lr.GetID())
   771  			r = lr
   772  			c.mu.Lock()
   773  			c.insertRegionToCache(r)
   774  			c.mu.Unlock()
   775  		}
   776  	}
   777  	return r, nil
   778  }
   779  
   780  // OnSendFailForTiFlash handles send request fail logic for tiflash.
   781  func (c *RegionCache) OnSendFailForTiFlash(bo *retry.Backoffer, store *Store, region RegionVerID, prev *metapb.Region, scheduleReload bool, err error, skipSwitchPeerLog bool) {
   782  
   783  	r := c.GetCachedRegionWithRLock(region)
   784  	if r == nil {
   785  		return
   786  	}
   787  
   788  	rs := r.getStore()
   789  	peersNum := len(r.GetMeta().Peers)
   790  	if len(prev.Peers) != peersNum {
   791  		logutil.Logger(bo.GetCtx()).Info("retry and refresh current region after send request fail and up/down stores length changed",
   792  			zap.Stringer("region", &region),
   793  			zap.Bool("needReload", scheduleReload),
   794  			zap.Reflect("oldPeers", prev.Peers),
   795  			zap.Reflect("newPeers", r.GetMeta().Peers),
   796  			zap.Error(err))
   797  		return
   798  	}
   799  
   800  	accessMode := tiFlashOnly
   801  	accessIdx := rs.getAccessIndex(accessMode, store)
   802  	if accessIdx == -1 {
   803  		logutil.Logger(bo.GetCtx()).Warn("can not get access index for region " + region.String())
   804  		return
   805  	}
   806  	if err != nil {
   807  		storeIdx, s := rs.accessStore(accessMode, accessIdx)
   808  		c.markRegionNeedBeRefill(s, storeIdx, rs)
   809  	}
   810  
   811  	// try next peer
   812  	rs.switchNextFlashPeer(r, accessIdx)
   813  	// In most scenarios, TiFlash will batch all the regions in one TiFlash store into one request, so when meet send failure,
   814  	// this function is called repeatedly for all the regions, since one TiFlash store might contain thousands of regions, we
   815  	// need a way to avoid generating too much useless log
   816  	if !skipSwitchPeerLog {
   817  		logutil.Logger(bo.GetCtx()).Info("switch region tiflash peer to next due to send request fail",
   818  			zap.Stringer("region", &region),
   819  			zap.Bool("needReload", scheduleReload),
   820  			zap.Error(err))
   821  	}
   822  
   823  	// force reload region when retry all known peers in region.
   824  	if scheduleReload {
   825  		r.scheduleReload()
   826  	}
   827  }
   828  
   829  func (c *RegionCache) markRegionNeedBeRefill(s *Store, storeIdx int, rs *regionStore) int {
   830  	incEpochStoreIdx := -1
   831  	// invalidate regions in store.
   832  	epoch := rs.storeEpochs[storeIdx]
   833  	if atomic.CompareAndSwapUint32(&s.epoch, epoch, epoch+1) {
   834  		logutil.BgLogger().Info("mark store's regions need be refill", zap.String("store", s.addr))
   835  		incEpochStoreIdx = storeIdx
   836  		metrics.RegionCacheCounterWithInvalidateStoreRegionsOK.Inc()
   837  	}
   838  	// schedule a store addr resolve.
   839  	s.markNeedCheck(c.notifyCheckCh)
   840  	return incEpochStoreIdx
   841  }
   842  
   843  // OnSendFail handles send request fail logic.
   844  func (c *RegionCache) OnSendFail(bo *retry.Backoffer, ctx *RPCContext, scheduleReload bool, err error) {
   845  	metrics.RegionCacheCounterWithSendFail.Inc()
   846  	r := c.GetCachedRegionWithRLock(ctx.Region)
   847  	if r == nil {
   848  		return
   849  	}
   850  	peersNum := len(r.meta.Peers)
   851  	if len(ctx.Meta.Peers) != peersNum {
   852  		logutil.Logger(bo.GetCtx()).Info("retry and refresh current ctx after send request fail and up/down stores length changed",
   853  			zap.Stringer("current", ctx),
   854  			zap.Bool("needReload", scheduleReload),
   855  			zap.Reflect("oldPeers", ctx.Meta.Peers),
   856  			zap.Reflect("newPeers", r.meta.Peers),
   857  			zap.Error(err))
   858  		return
   859  	}
   860  
   861  	rs := r.getStore()
   862  
   863  	if err != nil {
   864  		storeIdx, s := rs.accessStore(ctx.AccessMode, ctx.AccessIdx)
   865  
   866  		// invalidate regions in store.
   867  		c.markRegionNeedBeRefill(s, storeIdx, rs)
   868  	}
   869  
   870  	// try next peer to found new leader.
   871  	if ctx.AccessMode == tiKVOnly {
   872  		rs.switchNextTiKVPeer(r, ctx.AccessIdx)
   873  		logutil.Logger(bo.GetCtx()).Info("switch region peer to next due to send request fail",
   874  			zap.Stringer("current", ctx),
   875  			zap.Bool("needReload", scheduleReload),
   876  			zap.Error(err))
   877  	} else {
   878  		rs.switchNextFlashPeer(r, ctx.AccessIdx)
   879  		logutil.Logger(bo.GetCtx()).Info("switch region tiflash peer to next due to send request fail",
   880  			zap.Stringer("current", ctx),
   881  			zap.Bool("needReload", scheduleReload),
   882  			zap.Error(err))
   883  	}
   884  
   885  	// force reload region when retry all known peers in region.
   886  	if scheduleReload {
   887  		r.scheduleReload()
   888  	}
   889  
   890  }
   891  
   892  // LocateRegionByID searches for the region with ID.
   893  func (c *RegionCache) LocateRegionByID(bo *retry.Backoffer, regionID uint64) (*KeyLocation, error) {
   894  	c.mu.RLock()
   895  	r := c.getRegionByIDFromCache(regionID)
   896  	c.mu.RUnlock()
   897  	if r != nil {
   898  		if r.checkNeedReloadAndMarkUpdated() {
   899  			lr, err := c.loadRegionByID(bo, regionID)
   900  			if err != nil {
   901  				// ignore error and use old region info.
   902  				logutil.Logger(bo.GetCtx()).Error("load region failure",
   903  					zap.Uint64("regionID", regionID), zap.Error(err))
   904  			} else {
   905  				r = lr
   906  				c.mu.Lock()
   907  				c.insertRegionToCache(r)
   908  				c.mu.Unlock()
   909  			}
   910  		}
   911  		loc := &KeyLocation{
   912  			Region:   r.VerID(),
   913  			StartKey: r.StartKey(),
   914  			EndKey:   r.EndKey(),
   915  		}
   916  		return loc, nil
   917  	}
   918  
   919  	r, err := c.loadRegionByID(bo, regionID)
   920  	if err != nil {
   921  		return nil, errors.Trace(err)
   922  	}
   923  
   924  	c.mu.Lock()
   925  	c.insertRegionToCache(r)
   926  	c.mu.Unlock()
   927  	return &KeyLocation{
   928  		Region:   r.VerID(),
   929  		StartKey: r.StartKey(),
   930  		EndKey:   r.EndKey(),
   931  	}, nil
   932  }
   933  
   934  // GroupKeysByRegion separates keys into groups by their belonging Regions.
   935  // Specially it also returns the first key's region which may be used as the
   936  // 'PrimaryLockKey' and should be committed ahead of others.
   937  // filter is used to filter some unwanted keys.
   938  func (c *RegionCache) GroupKeysByRegion(bo *retry.Backoffer, keys [][]byte, filter func(key, regionStartKey []byte) bool) (map[RegionVerID][][]byte, RegionVerID, error) {
   939  	groups := make(map[RegionVerID][][]byte)
   940  	var first RegionVerID
   941  	var lastLoc *KeyLocation
   942  	for i, k := range keys {
   943  		if lastLoc == nil || !lastLoc.Contains(k) {
   944  			var err error
   945  			lastLoc, err = c.LocateKey(bo, k)
   946  			if err != nil {
   947  				return nil, first, errors.Trace(err)
   948  			}
   949  			if filter != nil && filter(k, lastLoc.StartKey) {
   950  				continue
   951  			}
   952  		}
   953  		id := lastLoc.Region
   954  		if i == 0 {
   955  			first = id
   956  		}
   957  		groups[id] = append(groups[id], k)
   958  	}
   959  	return groups, first, nil
   960  }
   961  
   962  // ListRegionIDsInKeyRange lists ids of regions in [start_key,end_key].
   963  func (c *RegionCache) ListRegionIDsInKeyRange(bo *retry.Backoffer, startKey, endKey []byte) (regionIDs []uint64, err error) {
   964  	for {
   965  		curRegion, err := c.LocateKey(bo, startKey)
   966  		if err != nil {
   967  			return nil, errors.Trace(err)
   968  		}
   969  		regionIDs = append(regionIDs, curRegion.Region.id)
   970  		if curRegion.Contains(endKey) {
   971  			break
   972  		}
   973  		startKey = curRegion.EndKey
   974  	}
   975  	return regionIDs, nil
   976  }
   977  
   978  // LoadRegionsInKeyRange lists regions in [start_key,end_key].
   979  func (c *RegionCache) LoadRegionsInKeyRange(bo *retry.Backoffer, startKey, endKey []byte) (regions []*Region, err error) {
   980  	var batchRegions []*Region
   981  	for {
   982  		batchRegions, err = c.BatchLoadRegionsWithKeyRange(bo, startKey, endKey, defaultRegionsPerBatch)
   983  		if err != nil {
   984  			return nil, errors.Trace(err)
   985  		}
   986  		if len(batchRegions) == 0 {
   987  			// should never happen
   988  			break
   989  		}
   990  		regions = append(regions, batchRegions...)
   991  		endRegion := batchRegions[len(batchRegions)-1]
   992  		if endRegion.ContainsByEnd(endKey) {
   993  			break
   994  		}
   995  		startKey = endRegion.EndKey()
   996  	}
   997  	return
   998  }
   999  
  1000  // BatchLoadRegionsWithKeyRange loads at most given numbers of regions to the RegionCache,
  1001  // within the given key range from the startKey to endKey. Returns the loaded regions.
  1002  func (c *RegionCache) BatchLoadRegionsWithKeyRange(bo *retry.Backoffer, startKey []byte, endKey []byte, count int) (regions []*Region, err error) {
  1003  	regions, err = c.scanRegions(bo, startKey, endKey, count)
  1004  	if err != nil {
  1005  		return
  1006  	}
  1007  	if len(regions) == 0 {
  1008  		err = errors.New("PD returned no region")
  1009  		return
  1010  	}
  1011  
  1012  	c.mu.Lock()
  1013  	defer c.mu.Unlock()
  1014  
  1015  	for _, region := range regions {
  1016  		c.insertRegionToCache(region)
  1017  	}
  1018  
  1019  	return
  1020  }
  1021  
  1022  // BatchLoadRegionsFromKey loads at most given numbers of regions to the RegionCache, from the given startKey. Returns
  1023  // the endKey of the last loaded region. If some of the regions has no leader, their entries in RegionCache will not be
  1024  // updated.
  1025  func (c *RegionCache) BatchLoadRegionsFromKey(bo *retry.Backoffer, startKey []byte, count int) ([]byte, error) {
  1026  	regions, err := c.BatchLoadRegionsWithKeyRange(bo, startKey, nil, count)
  1027  	if err != nil {
  1028  		return nil, errors.Trace(err)
  1029  	}
  1030  	return regions[len(regions)-1].EndKey(), nil
  1031  }
  1032  
  1033  // InvalidateCachedRegion removes a cached Region.
  1034  func (c *RegionCache) InvalidateCachedRegion(id RegionVerID) {
  1035  	c.InvalidateCachedRegionWithReason(id, Other)
  1036  }
  1037  
  1038  // InvalidateCachedRegionWithReason removes a cached Region with the reason why it's invalidated.
  1039  func (c *RegionCache) InvalidateCachedRegionWithReason(id RegionVerID, reason InvalidReason) {
  1040  	cachedRegion := c.GetCachedRegionWithRLock(id)
  1041  	if cachedRegion == nil {
  1042  		return
  1043  	}
  1044  	cachedRegion.invalidate(reason)
  1045  }
  1046  
  1047  // UpdateLeader update some region cache with newer leader info.
  1048  func (c *RegionCache) UpdateLeader(regionID RegionVerID, leader *metapb.Peer, currentPeerIdx AccessIndex) {
  1049  	r := c.GetCachedRegionWithRLock(regionID)
  1050  	if r == nil {
  1051  		logutil.BgLogger().Debug("regionCache: cannot find region when updating leader",
  1052  			zap.Uint64("regionID", regionID.GetID()))
  1053  		return
  1054  	}
  1055  
  1056  	if leader == nil {
  1057  		rs := r.getStore()
  1058  		rs.switchNextTiKVPeer(r, currentPeerIdx)
  1059  		logutil.BgLogger().Info("switch region peer to next due to NotLeader with NULL leader",
  1060  			zap.Int("currIdx", int(currentPeerIdx)),
  1061  			zap.Uint64("regionID", regionID.GetID()))
  1062  		return
  1063  	}
  1064  
  1065  	if !c.switchWorkLeaderToPeer(r, leader) {
  1066  		logutil.BgLogger().Info("invalidate region cache due to cannot find peer when updating leader",
  1067  			zap.Uint64("regionID", regionID.GetID()),
  1068  			zap.Int("currIdx", int(currentPeerIdx)),
  1069  			zap.Uint64("leaderStoreID", leader.GetStoreId()))
  1070  		r.invalidate(StoreNotFound)
  1071  	} else {
  1072  		logutil.BgLogger().Info("switch region leader to specific leader due to kv return NotLeader",
  1073  			zap.Uint64("regionID", regionID.GetID()),
  1074  			zap.Int("currIdx", int(currentPeerIdx)),
  1075  			zap.Uint64("leaderStoreID", leader.GetStoreId()))
  1076  	}
  1077  }
  1078  
  1079  // removeVersionFromCache removes a RegionVerID from cache, tries to cleanup
  1080  // both c.mu.regions and c.mu.versions. Note this function is not thread-safe.
  1081  func (c *RegionCache) removeVersionFromCache(oldVer RegionVerID, regionID uint64) {
  1082  	delete(c.mu.regions, oldVer)
  1083  	if ver, ok := c.mu.latestVersions[regionID]; ok && ver.Equals(oldVer) {
  1084  		delete(c.mu.latestVersions, regionID)
  1085  	}
  1086  }
  1087  
  1088  // insertRegionToCache tries to insert the Region to cache.
  1089  // It should be protected by c.mu.Lock().
  1090  func (c *RegionCache) insertRegionToCache(cachedRegion *Region) {
  1091  	old := c.mu.sorted.ReplaceOrInsert(newBtreeItem(cachedRegion))
  1092  	if old != nil {
  1093  		store := cachedRegion.getStore()
  1094  		oldRegion := old.(*btreeItem).cachedRegion
  1095  		oldRegionStore := oldRegion.getStore()
  1096  		// TODO(youjiali1995): remove this because the new retry logic can handle this issue.
  1097  		//
  1098  		// Joint consensus is enabled in v5.0, which is possible to make a leader step down as a learner during a conf change.
  1099  		// And if hibernate region is enabled, after the leader step down, there can be a long time that there is no leader
  1100  		// in the region and the leader info in PD is stale until requests are sent to followers or hibernate timeout.
  1101  		// To solve it, one solution is always to try a different peer if the invalid reason of the old cached region is no-leader.
  1102  		// There is a small probability that the current peer who reports no-leader becomes a leader and TiDB has to retry once in this case.
  1103  		if InvalidReason(atomic.LoadInt32((*int32)(&oldRegion.invalidReason))) == NoLeader {
  1104  			store.workTiKVIdx = (oldRegionStore.workTiKVIdx + 1) % AccessIndex(store.accessStoreNum(tiKVOnly))
  1105  		}
  1106  		// Invalidate the old region in case it's not invalidated and some requests try with the stale region information.
  1107  		oldRegion.invalidate(Other)
  1108  		// Don't refresh TiFlash work idx for region. Otherwise, it will always goto a invalid store which
  1109  		// is under transferring regions.
  1110  		store.workTiFlashIdx = atomic.LoadInt32(&oldRegionStore.workTiFlashIdx)
  1111  		c.removeVersionFromCache(oldRegion.VerID(), cachedRegion.VerID().id)
  1112  	}
  1113  	c.mu.regions[cachedRegion.VerID()] = cachedRegion
  1114  	newVer := cachedRegion.VerID()
  1115  	latest, ok := c.mu.latestVersions[cachedRegion.VerID().id]
  1116  	if !ok || latest.GetVer() < newVer.GetVer() || latest.GetConfVer() < newVer.GetConfVer() {
  1117  		c.mu.latestVersions[cachedRegion.VerID().id] = newVer
  1118  	}
  1119  }
  1120  
  1121  // searchCachedRegion finds a region from cache by key. Like `getCachedRegion`,
  1122  // it should be called with c.mu.RLock(), and the returned Region should not be
  1123  // used after c.mu is RUnlock().
  1124  // If the given key is the end key of the region that you want, you may set the second argument to true. This is useful
  1125  // when processing in reverse order.
  1126  func (c *RegionCache) searchCachedRegion(key []byte, isEndKey bool) *Region {
  1127  	ts := time.Now().Unix()
  1128  	var r *Region
  1129  	c.mu.RLock()
  1130  	c.mu.sorted.DescendLessOrEqual(newBtreeSearchItem(key), func(item btree.Item) bool {
  1131  		r = item.(*btreeItem).cachedRegion
  1132  		if isEndKey && bytes.Equal(r.StartKey(), key) {
  1133  			r = nil     // clear result
  1134  			return true // iterate next item
  1135  		}
  1136  		if !r.checkRegionCacheTTL(ts) {
  1137  			r = nil
  1138  			return true
  1139  		}
  1140  		return false
  1141  	})
  1142  	c.mu.RUnlock()
  1143  	if r != nil && (!isEndKey && r.Contains(key) || isEndKey && r.ContainsByEnd(key)) {
  1144  		return r
  1145  	}
  1146  	return nil
  1147  }
  1148  
  1149  // getRegionByIDFromCache tries to get region by regionID from cache. Like
  1150  // `getCachedRegion`, it should be called with c.mu.RLock(), and the returned
  1151  // Region should not be used after c.mu is RUnlock().
  1152  func (c *RegionCache) getRegionByIDFromCache(regionID uint64) *Region {
  1153  	ts := time.Now().Unix()
  1154  	ver, ok := c.mu.latestVersions[regionID]
  1155  	if !ok {
  1156  		return nil
  1157  	}
  1158  	latestRegion, ok := c.mu.regions[ver]
  1159  	if !ok {
  1160  		// should not happen
  1161  		logutil.BgLogger().Warn("region version not found",
  1162  			zap.Uint64("regionID", regionID), zap.Stringer("version", &ver))
  1163  		return nil
  1164  	}
  1165  	lastAccess := atomic.LoadInt64(&latestRegion.lastAccess)
  1166  	if ts-lastAccess > regionCacheTTLSec {
  1167  		return nil
  1168  	}
  1169  	if latestRegion != nil {
  1170  		atomic.CompareAndSwapInt64(&latestRegion.lastAccess, atomic.LoadInt64(&latestRegion.lastAccess), ts)
  1171  	}
  1172  	return latestRegion
  1173  }
  1174  
  1175  // GetStoresByType gets stores by type `typ`
  1176  // TODO: revise it by get store by closure.
  1177  func (c *RegionCache) GetStoresByType(typ tikvrpc.EndpointType) []*Store {
  1178  	c.storeMu.Lock()
  1179  	defer c.storeMu.Unlock()
  1180  	stores := make([]*Store, 0)
  1181  	for _, store := range c.storeMu.stores {
  1182  		if store.getResolveState() != resolved {
  1183  			continue
  1184  		}
  1185  		if store.storeType == typ {
  1186  			//TODO: revise it with store.clone()
  1187  			storeLabel := make([]*metapb.StoreLabel, 0)
  1188  			for _, label := range store.labels {
  1189  				storeLabel = append(storeLabel, &metapb.StoreLabel{
  1190  					Key:   label.Key,
  1191  					Value: label.Value,
  1192  				})
  1193  			}
  1194  			stores = append(stores, &Store{
  1195  				addr:    store.addr,
  1196  				storeID: store.storeID,
  1197  				labels:  storeLabel,
  1198  			})
  1199  		}
  1200  	}
  1201  	return stores
  1202  }
  1203  
  1204  func filterUnavailablePeers(region *pd.Region) {
  1205  	if len(region.DownPeers) == 0 {
  1206  		return
  1207  	}
  1208  	new := region.Meta.Peers[:0]
  1209  	for _, p := range region.Meta.Peers {
  1210  		available := true
  1211  		for _, downPeer := range region.DownPeers {
  1212  			if p.Id == downPeer.Id && p.StoreId == downPeer.StoreId {
  1213  				available = false
  1214  				break
  1215  			}
  1216  		}
  1217  		if available {
  1218  			new = append(new, p)
  1219  		}
  1220  	}
  1221  	region.Meta.Peers = new
  1222  }
  1223  
  1224  // loadRegion loads region from pd client, and picks the first peer as leader.
  1225  // If the given key is the end key of the region that you want, you may set the second argument to true. This is useful
  1226  // when processing in reverse order.
  1227  func (c *RegionCache) loadRegion(bo *retry.Backoffer, key []byte, isEndKey bool) (*Region, error) {
  1228  	ctx := bo.GetCtx()
  1229  	if span := opentracing.SpanFromContext(ctx); span != nil && span.Tracer() != nil {
  1230  		span1 := span.Tracer().StartSpan("loadRegion", opentracing.ChildOf(span.Context()))
  1231  		defer span1.Finish()
  1232  		ctx = opentracing.ContextWithSpan(ctx, span1)
  1233  	}
  1234  
  1235  	var backoffErr error
  1236  	searchPrev := false
  1237  	for {
  1238  		if backoffErr != nil {
  1239  			err := bo.Backoff(retry.BoPDRPC, backoffErr)
  1240  			if err != nil {
  1241  				return nil, errors.Trace(err)
  1242  			}
  1243  		}
  1244  		var reg *pd.Region
  1245  		var err error
  1246  		if searchPrev {
  1247  			reg, err = c.pdClient.GetPrevRegion(ctx, key)
  1248  		} else {
  1249  			reg, err = c.pdClient.GetRegion(ctx, key)
  1250  		}
  1251  		if err != nil {
  1252  			metrics.RegionCacheCounterWithGetRegionError.Inc()
  1253  		} else {
  1254  			metrics.RegionCacheCounterWithGetRegionOK.Inc()
  1255  		}
  1256  		if err != nil {
  1257  			if isDecodeError(err) {
  1258  				return nil, errors.Errorf("failed to decode region range key, key: %q, err: %v", key, err)
  1259  			}
  1260  			backoffErr = errors.Errorf("loadRegion from PD failed, key: %q, err: %v", key, err)
  1261  			continue
  1262  		}
  1263  		if reg == nil || reg.Meta == nil {
  1264  			backoffErr = errors.Errorf("region not found for key %q", key)
  1265  			continue
  1266  		}
  1267  		filterUnavailablePeers(reg)
  1268  		if len(reg.Meta.Peers) == 0 {
  1269  			return nil, errors.New("receive Region with no available peer")
  1270  		}
  1271  		if isEndKey && !searchPrev && bytes.Equal(reg.Meta.StartKey, key) && len(reg.Meta.StartKey) != 0 {
  1272  			searchPrev = true
  1273  			continue
  1274  		}
  1275  		region := &Region{meta: reg.Meta}
  1276  		err = region.init(bo, c)
  1277  		if err != nil {
  1278  			return nil, err
  1279  		}
  1280  		if reg.Leader != nil {
  1281  			c.switchWorkLeaderToPeer(region, reg.Leader)
  1282  		}
  1283  		return region, nil
  1284  	}
  1285  }
  1286  
  1287  // loadRegionByID loads region from pd client, and picks the first peer as leader.
  1288  func (c *RegionCache) loadRegionByID(bo *retry.Backoffer, regionID uint64) (*Region, error) {
  1289  	ctx := bo.GetCtx()
  1290  	if span := opentracing.SpanFromContext(ctx); span != nil && span.Tracer() != nil {
  1291  		span1 := span.Tracer().StartSpan("loadRegionByID", opentracing.ChildOf(span.Context()))
  1292  		defer span1.Finish()
  1293  		ctx = opentracing.ContextWithSpan(ctx, span1)
  1294  	}
  1295  	var backoffErr error
  1296  	for {
  1297  		if backoffErr != nil {
  1298  			err := bo.Backoff(retry.BoPDRPC, backoffErr)
  1299  			if err != nil {
  1300  				return nil, errors.Trace(err)
  1301  			}
  1302  		}
  1303  		reg, err := c.pdClient.GetRegionByID(ctx, regionID)
  1304  		if err != nil {
  1305  			metrics.RegionCacheCounterWithGetRegionByIDError.Inc()
  1306  		} else {
  1307  			metrics.RegionCacheCounterWithGetRegionByIDOK.Inc()
  1308  		}
  1309  		if err != nil {
  1310  			if isDecodeError(err) {
  1311  				return nil, errors.Errorf("failed to decode region range key, regionID: %q, err: %v", regionID, err)
  1312  			}
  1313  			backoffErr = errors.Errorf("loadRegion from PD failed, regionID: %v, err: %v", regionID, err)
  1314  			continue
  1315  		}
  1316  		if reg == nil || reg.Meta == nil {
  1317  			return nil, errors.Errorf("region not found for regionID %d", regionID)
  1318  		}
  1319  		filterUnavailablePeers(reg)
  1320  		if len(reg.Meta.Peers) == 0 {
  1321  			return nil, errors.New("receive Region with no available peer")
  1322  		}
  1323  		region := &Region{meta: reg.Meta}
  1324  		err = region.init(bo, c)
  1325  		if err != nil {
  1326  			return nil, err
  1327  		}
  1328  		if reg.Leader != nil {
  1329  			c.switchWorkLeaderToPeer(region, reg.Leader)
  1330  		}
  1331  		return region, nil
  1332  	}
  1333  }
  1334  
  1335  // scanRegions scans at most `limit` regions from PD, starts from the region containing `startKey` and in key order.
  1336  // Regions with no leader will not be returned.
  1337  func (c *RegionCache) scanRegions(bo *retry.Backoffer, startKey, endKey []byte, limit int) ([]*Region, error) {
  1338  	if limit == 0 {
  1339  		return nil, nil
  1340  	}
  1341  	ctx := bo.GetCtx()
  1342  	if span := opentracing.SpanFromContext(ctx); span != nil && span.Tracer() != nil {
  1343  		span1 := span.Tracer().StartSpan("scanRegions", opentracing.ChildOf(span.Context()))
  1344  		defer span1.Finish()
  1345  		ctx = opentracing.ContextWithSpan(ctx, span1)
  1346  	}
  1347  
  1348  	var backoffErr error
  1349  	for {
  1350  		if backoffErr != nil {
  1351  			err := bo.Backoff(retry.BoPDRPC, backoffErr)
  1352  			if err != nil {
  1353  				return nil, errors.Trace(err)
  1354  			}
  1355  		}
  1356  		regionsInfo, err := c.pdClient.ScanRegions(ctx, startKey, endKey, limit)
  1357  		if err != nil {
  1358  			if isDecodeError(err) {
  1359  				return nil, errors.Errorf("failed to decode region range key, startKey: %q, limit: %q, err: %v", startKey, limit, err)
  1360  			}
  1361  			metrics.RegionCacheCounterWithScanRegionsError.Inc()
  1362  			backoffErr = errors.Errorf(
  1363  				"scanRegion from PD failed, startKey: %q, limit: %q, err: %v",
  1364  				startKey,
  1365  				limit,
  1366  				err)
  1367  			continue
  1368  		}
  1369  
  1370  		metrics.RegionCacheCounterWithScanRegionsOK.Inc()
  1371  
  1372  		if len(regionsInfo) == 0 {
  1373  			return nil, errors.New("PD returned no region")
  1374  		}
  1375  		regions := make([]*Region, 0, len(regionsInfo))
  1376  		for _, r := range regionsInfo {
  1377  			region := &Region{meta: r.Meta}
  1378  			err := region.init(bo, c)
  1379  			if err != nil {
  1380  				return nil, err
  1381  			}
  1382  			leader := r.Leader
  1383  			// Leader id = 0 indicates no leader.
  1384  			if leader != nil && leader.GetId() != 0 {
  1385  				c.switchWorkLeaderToPeer(region, leader)
  1386  				regions = append(regions, region)
  1387  			}
  1388  		}
  1389  		if len(regions) == 0 {
  1390  			return nil, errors.New("receive Regions with no peer")
  1391  		}
  1392  		if len(regions) < len(regionsInfo) {
  1393  			logutil.Logger(context.Background()).Debug(
  1394  				"regionCache: scanRegion finished but some regions has no leader.")
  1395  		}
  1396  		return regions, nil
  1397  	}
  1398  }
  1399  
  1400  // GetCachedRegionWithRLock returns region with lock.
  1401  func (c *RegionCache) GetCachedRegionWithRLock(regionID RegionVerID) (r *Region) {
  1402  	c.mu.RLock()
  1403  	r = c.mu.regions[regionID]
  1404  	c.mu.RUnlock()
  1405  	return
  1406  }
  1407  
  1408  func (c *RegionCache) getStoreAddr(bo *retry.Backoffer, region *Region, store *Store) (addr string, err error) {
  1409  	state := store.getResolveState()
  1410  	switch state {
  1411  	case resolved, needCheck:
  1412  		addr = store.addr
  1413  		return
  1414  	case unresolved:
  1415  		addr, err = store.initResolve(bo, c)
  1416  		return
  1417  	case deleted:
  1418  		addr = c.changeToActiveStore(region, store)
  1419  		return
  1420  	case tombstone:
  1421  		return "", nil
  1422  	default:
  1423  		panic("unsupported resolve state")
  1424  	}
  1425  }
  1426  
  1427  func (c *RegionCache) getProxyStore(region *Region, store *Store, rs *regionStore, workStoreIdx AccessIndex) (proxyStore *Store, proxyAccessIdx AccessIndex, proxyStoreIdx int) {
  1428  	if !c.enableForwarding || store.storeType != tikvrpc.TiKV || atomic.LoadInt32(&store.unreachable) == 0 {
  1429  		return
  1430  	}
  1431  
  1432  	if rs.proxyTiKVIdx >= 0 {
  1433  		storeIdx, proxyStore := rs.accessStore(tiKVOnly, rs.proxyTiKVIdx)
  1434  		return proxyStore, rs.proxyTiKVIdx, storeIdx
  1435  	}
  1436  
  1437  	tikvNum := rs.accessStoreNum(tiKVOnly)
  1438  	if tikvNum <= 1 {
  1439  		return
  1440  	}
  1441  
  1442  	// Randomly select an non-leader peer
  1443  	first := rand.Intn(tikvNum - 1)
  1444  	if first >= int(workStoreIdx) {
  1445  		first = (first + 1) % tikvNum
  1446  	}
  1447  
  1448  	// If the current selected peer is not reachable, switch to the next one, until a reachable peer is found or all
  1449  	// peers are checked.
  1450  	for i := 0; i < tikvNum; i++ {
  1451  		index := (i + first) % tikvNum
  1452  		// Skip work store which is the actual store to be accessed
  1453  		if index == int(workStoreIdx) {
  1454  			continue
  1455  		}
  1456  		storeIdx, store := rs.accessStore(tiKVOnly, AccessIndex(index))
  1457  		// Skip unreachable stores.
  1458  		if atomic.LoadInt32(&store.unreachable) != 0 {
  1459  			continue
  1460  		}
  1461  
  1462  		rs.setProxyStoreIdx(region, AccessIndex(index))
  1463  		return store, AccessIndex(index), storeIdx
  1464  	}
  1465  
  1466  	return nil, 0, 0
  1467  }
  1468  
  1469  // changeToActiveStore replace the deleted store in the region by an up-to-date store in the stores map.
  1470  // The order is guaranteed by reResolve() which adds the new store before marking old store deleted.
  1471  func (c *RegionCache) changeToActiveStore(region *Region, store *Store) (addr string) {
  1472  	c.storeMu.RLock()
  1473  	store = c.storeMu.stores[store.storeID]
  1474  	c.storeMu.RUnlock()
  1475  	for {
  1476  		oldRegionStore := region.getStore()
  1477  		newRegionStore := oldRegionStore.clone()
  1478  		newRegionStore.stores = make([]*Store, 0, len(oldRegionStore.stores))
  1479  		for _, s := range oldRegionStore.stores {
  1480  			if s.storeID == store.storeID {
  1481  				newRegionStore.stores = append(newRegionStore.stores, store)
  1482  			} else {
  1483  				newRegionStore.stores = append(newRegionStore.stores, s)
  1484  			}
  1485  		}
  1486  		if region.compareAndSwapStore(oldRegionStore, newRegionStore) {
  1487  			break
  1488  		}
  1489  	}
  1490  	addr = store.addr
  1491  	return
  1492  }
  1493  
  1494  func (c *RegionCache) getStoreByStoreID(storeID uint64) (store *Store) {
  1495  	var ok bool
  1496  	c.storeMu.Lock()
  1497  	store, ok = c.storeMu.stores[storeID]
  1498  	if ok {
  1499  		c.storeMu.Unlock()
  1500  		return
  1501  	}
  1502  	store = &Store{storeID: storeID}
  1503  	c.storeMu.stores[storeID] = store
  1504  	c.storeMu.Unlock()
  1505  	return
  1506  }
  1507  
  1508  func (c *RegionCache) getStoresByLabels(labels []*metapb.StoreLabel) []*Store {
  1509  	c.storeMu.RLock()
  1510  	defer c.storeMu.RUnlock()
  1511  	s := make([]*Store, 0)
  1512  	for _, store := range c.storeMu.stores {
  1513  		if store.IsLabelsMatch(labels) {
  1514  			s = append(s, store)
  1515  		}
  1516  	}
  1517  	return s
  1518  }
  1519  
  1520  // OnRegionEpochNotMatch removes the old region and inserts new regions into the cache.
  1521  // It returns whether retries the request because it's possible the region epoch is ahead of TiKV's due to slow appling.
  1522  func (c *RegionCache) OnRegionEpochNotMatch(bo *retry.Backoffer, ctx *RPCContext, currentRegions []*metapb.Region) (bool, error) {
  1523  	if len(currentRegions) == 0 {
  1524  		c.InvalidateCachedRegionWithReason(ctx.Region, EpochNotMatch)
  1525  		return false, nil
  1526  	}
  1527  
  1528  	// Find whether the region epoch in `ctx` is ahead of TiKV's. If so, backoff.
  1529  	for _, meta := range currentRegions {
  1530  		if meta.GetId() == ctx.Region.id &&
  1531  			(meta.GetRegionEpoch().GetConfVer() < ctx.Region.confVer ||
  1532  				meta.GetRegionEpoch().GetVersion() < ctx.Region.ver) {
  1533  			err := errors.Errorf("region epoch is ahead of tikv. rpc ctx: %+v, currentRegions: %+v", ctx, currentRegions)
  1534  			logutil.BgLogger().Info("region epoch is ahead of tikv", zap.Error(err))
  1535  			return true, bo.Backoff(retry.BoRegionMiss, err)
  1536  		}
  1537  	}
  1538  
  1539  	needInvalidateOld := true
  1540  	newRegions := make([]*Region, 0, len(currentRegions))
  1541  	// If the region epoch is not ahead of TiKV's, replace region meta in region cache.
  1542  	for _, meta := range currentRegions {
  1543  		if _, ok := c.pdClient.(*CodecPDClient); ok {
  1544  			var err error
  1545  			if meta, err = decodeRegionMetaKeyWithShallowCopy(meta); err != nil {
  1546  				return false, errors.Errorf("newRegion's range key is not encoded: %v, %v", meta, err)
  1547  			}
  1548  		}
  1549  		region := &Region{meta: meta}
  1550  		err := region.init(bo, c)
  1551  		if err != nil {
  1552  			return false, err
  1553  		}
  1554  		var initLeaderStoreID uint64
  1555  		if ctx.Store.storeType == tikvrpc.TiFlash {
  1556  			initLeaderStoreID = region.findElectableStoreID()
  1557  		} else {
  1558  			initLeaderStoreID = ctx.Store.storeID
  1559  		}
  1560  		c.switchWorkLeaderToPeer(region, region.getPeerOnStore(initLeaderStoreID))
  1561  		newRegions = append(newRegions, region)
  1562  		if ctx.Region == region.VerID() {
  1563  			needInvalidateOld = false
  1564  		}
  1565  	}
  1566  	c.mu.Lock()
  1567  	for _, region := range newRegions {
  1568  		c.insertRegionToCache(region)
  1569  	}
  1570  	if needInvalidateOld {
  1571  		cachedRegion, ok := c.mu.regions[ctx.Region]
  1572  		if ok {
  1573  			cachedRegion.invalidate(EpochNotMatch)
  1574  		}
  1575  	}
  1576  	c.mu.Unlock()
  1577  	return false, nil
  1578  }
  1579  
  1580  // PDClient returns the pd.Client in RegionCache.
  1581  func (c *RegionCache) PDClient() pd.Client {
  1582  	return c.pdClient
  1583  }
  1584  
  1585  // GetTiFlashStores returns the information of all tiflash nodes.
  1586  func (c *RegionCache) GetTiFlashStores() []*Store {
  1587  	c.storeMu.RLock()
  1588  	defer c.storeMu.RUnlock()
  1589  	var stores []*Store
  1590  	for _, s := range c.storeMu.stores {
  1591  		if s.storeType == tikvrpc.TiFlash {
  1592  			stores = append(stores, s)
  1593  		}
  1594  	}
  1595  	return stores
  1596  }
  1597  
  1598  // btreeItem is BTree's Item that uses []byte to compare.
  1599  type btreeItem struct {
  1600  	key          []byte
  1601  	cachedRegion *Region
  1602  }
  1603  
  1604  func newBtreeItem(cr *Region) *btreeItem {
  1605  	return &btreeItem{
  1606  		key:          cr.StartKey(),
  1607  		cachedRegion: cr,
  1608  	}
  1609  }
  1610  
  1611  func newBtreeSearchItem(key []byte) *btreeItem {
  1612  	return &btreeItem{
  1613  		key: key,
  1614  	}
  1615  }
  1616  
  1617  func (item *btreeItem) Less(other btree.Item) bool {
  1618  	return bytes.Compare(item.key, other.(*btreeItem).key) < 0
  1619  }
  1620  
  1621  // GetID returns id.
  1622  func (r *Region) GetID() uint64 {
  1623  	return r.meta.GetId()
  1624  }
  1625  
  1626  // GetMeta returns region meta.
  1627  func (r *Region) GetMeta() *metapb.Region {
  1628  	return proto.Clone(r.meta).(*metapb.Region)
  1629  }
  1630  
  1631  // GetLeaderPeerID returns leader peer ID.
  1632  func (r *Region) GetLeaderPeerID() uint64 {
  1633  	store := r.getStore()
  1634  	if int(store.workTiKVIdx) >= store.accessStoreNum(tiKVOnly) {
  1635  		return 0
  1636  	}
  1637  	storeIdx, _ := store.accessStore(tiKVOnly, store.workTiKVIdx)
  1638  	return r.meta.Peers[storeIdx].Id
  1639  }
  1640  
  1641  // GetLeaderStoreID returns the store ID of the leader region.
  1642  func (r *Region) GetLeaderStoreID() uint64 {
  1643  	store := r.getStore()
  1644  	if int(store.workTiKVIdx) >= store.accessStoreNum(tiKVOnly) {
  1645  		return 0
  1646  	}
  1647  	storeIdx, _ := store.accessStore(tiKVOnly, store.workTiKVIdx)
  1648  	return r.meta.Peers[storeIdx].StoreId
  1649  }
  1650  
  1651  func (r *Region) getKvStorePeer(rs *regionStore, aidx AccessIndex) (store *Store, peer *metapb.Peer, accessIdx AccessIndex, storeIdx int) {
  1652  	storeIdx, store = rs.accessStore(tiKVOnly, aidx)
  1653  	peer = r.meta.Peers[storeIdx]
  1654  	accessIdx = aidx
  1655  	return
  1656  }
  1657  
  1658  // WorkStorePeer returns current work store with work peer.
  1659  func (r *Region) WorkStorePeer(rs *regionStore) (store *Store, peer *metapb.Peer, accessIdx AccessIndex, storeIdx int) {
  1660  	return r.getKvStorePeer(rs, rs.workTiKVIdx)
  1661  }
  1662  
  1663  // FollowerStorePeer returns a follower store with follower peer.
  1664  func (r *Region) FollowerStorePeer(rs *regionStore, followerStoreSeed uint32, op *storeSelectorOp) (store *Store, peer *metapb.Peer, accessIdx AccessIndex, storeIdx int) {
  1665  	return r.getKvStorePeer(rs, rs.follower(followerStoreSeed, op))
  1666  }
  1667  
  1668  // AnyStorePeer returns a leader or follower store with the associated peer.
  1669  func (r *Region) AnyStorePeer(rs *regionStore, followerStoreSeed uint32, op *storeSelectorOp) (store *Store, peer *metapb.Peer, accessIdx AccessIndex, storeIdx int) {
  1670  	return r.getKvStorePeer(rs, rs.kvPeer(followerStoreSeed, op))
  1671  }
  1672  
  1673  // RegionVerID is a unique ID that can identify a Region at a specific version.
  1674  type RegionVerID struct {
  1675  	id      uint64
  1676  	confVer uint64
  1677  	ver     uint64
  1678  }
  1679  
  1680  // NewRegionVerID creates a region ver id, which used for invalidating regions.
  1681  func NewRegionVerID(id, confVer, ver uint64) RegionVerID {
  1682  	return RegionVerID{id, confVer, ver}
  1683  }
  1684  
  1685  // GetID returns the id of the region
  1686  func (r *RegionVerID) GetID() uint64 {
  1687  	return r.id
  1688  }
  1689  
  1690  // GetVer returns the version of the region's epoch
  1691  func (r *RegionVerID) GetVer() uint64 {
  1692  	return r.ver
  1693  }
  1694  
  1695  // GetConfVer returns the conf ver of the region's epoch
  1696  func (r *RegionVerID) GetConfVer() uint64 {
  1697  	return r.confVer
  1698  }
  1699  
  1700  // String formats the RegionVerID to string
  1701  func (r *RegionVerID) String() string {
  1702  	return fmt.Sprintf("{ region id: %v, ver: %v, confVer: %v }", r.id, r.ver, r.confVer)
  1703  }
  1704  
  1705  // Equals checks whether the RegionVerID equals to another one
  1706  func (r *RegionVerID) Equals(another RegionVerID) bool {
  1707  	return r.id == another.id && r.confVer == another.confVer && r.ver == another.ver
  1708  }
  1709  
  1710  // VerID returns the Region's RegionVerID.
  1711  func (r *Region) VerID() RegionVerID {
  1712  	return RegionVerID{
  1713  		id:      r.meta.GetId(),
  1714  		confVer: r.meta.GetRegionEpoch().GetConfVer(),
  1715  		ver:     r.meta.GetRegionEpoch().GetVersion(),
  1716  	}
  1717  }
  1718  
  1719  // StartKey returns StartKey.
  1720  func (r *Region) StartKey() []byte {
  1721  	return r.meta.StartKey
  1722  }
  1723  
  1724  // EndKey returns EndKey.
  1725  func (r *Region) EndKey() []byte {
  1726  	return r.meta.EndKey
  1727  }
  1728  
  1729  // switchWorkLeaderToPeer switches current store to the one on specific store. It returns
  1730  // false if no peer matches the peer.
  1731  func (c *RegionCache) switchWorkLeaderToPeer(r *Region, peer *metapb.Peer) (found bool) {
  1732  	globalStoreIdx, found := c.getPeerStoreIndex(r, peer)
  1733  	if !found {
  1734  		return
  1735  	}
  1736  retry:
  1737  	// switch to new leader.
  1738  	oldRegionStore := r.getStore()
  1739  	var leaderIdx AccessIndex
  1740  	for i, gIdx := range oldRegionStore.accessIndex[tiKVOnly] {
  1741  		if gIdx == globalStoreIdx {
  1742  			leaderIdx = AccessIndex(i)
  1743  		}
  1744  	}
  1745  	if oldRegionStore.workTiKVIdx == leaderIdx {
  1746  		return
  1747  	}
  1748  	newRegionStore := oldRegionStore.clone()
  1749  	newRegionStore.workTiKVIdx = leaderIdx
  1750  	if !r.compareAndSwapStore(oldRegionStore, newRegionStore) {
  1751  		goto retry
  1752  	}
  1753  	return
  1754  }
  1755  
  1756  func (r *regionStore) switchNextFlashPeer(rr *Region, currentPeerIdx AccessIndex) {
  1757  	nextIdx := (currentPeerIdx + 1) % AccessIndex(r.accessStoreNum(tiFlashOnly))
  1758  	newRegionStore := r.clone()
  1759  	newRegionStore.workTiFlashIdx = int32(nextIdx)
  1760  	rr.compareAndSwapStore(r, newRegionStore)
  1761  }
  1762  
  1763  func (r *regionStore) switchNextTiKVPeer(rr *Region, currentPeerIdx AccessIndex) {
  1764  	if r.workTiKVIdx != currentPeerIdx {
  1765  		return
  1766  	}
  1767  	nextIdx := (currentPeerIdx + 1) % AccessIndex(r.accessStoreNum(tiKVOnly))
  1768  	newRegionStore := r.clone()
  1769  	newRegionStore.workTiKVIdx = nextIdx
  1770  	rr.compareAndSwapStore(r, newRegionStore)
  1771  }
  1772  
  1773  func (r *regionStore) setProxyStoreIdx(rr *Region, idx AccessIndex) {
  1774  	if r.proxyTiKVIdx == idx {
  1775  		return
  1776  	}
  1777  
  1778  	newRegionStore := r.clone()
  1779  	newRegionStore.proxyTiKVIdx = idx
  1780  	success := rr.compareAndSwapStore(r, newRegionStore)
  1781  	logutil.BgLogger().Debug("try set proxy store index",
  1782  		zap.Uint64("region", rr.GetID()),
  1783  		zap.Int("index", int(idx)),
  1784  		zap.Bool("success", success))
  1785  }
  1786  
  1787  func (r *regionStore) unsetProxyStoreIfNeeded(rr *Region) {
  1788  	r.setProxyStoreIdx(rr, -1)
  1789  }
  1790  
  1791  func (r *Region) findElectableStoreID() uint64 {
  1792  	if len(r.meta.Peers) == 0 {
  1793  		return 0
  1794  	}
  1795  	for _, p := range r.meta.Peers {
  1796  		if p.Role != metapb.PeerRole_Learner {
  1797  			return p.StoreId
  1798  		}
  1799  	}
  1800  	return 0
  1801  }
  1802  
  1803  func (r *Region) getPeerOnStore(storeID uint64) *metapb.Peer {
  1804  	for _, p := range r.meta.Peers {
  1805  		if p.StoreId == storeID {
  1806  			return p
  1807  		}
  1808  	}
  1809  	return nil
  1810  }
  1811  
  1812  func (c *RegionCache) getPeerStoreIndex(r *Region, peer *metapb.Peer) (idx int, found bool) {
  1813  	if len(r.meta.Peers) == 0 || peer == nil {
  1814  		return
  1815  	}
  1816  	for i, p := range r.meta.Peers {
  1817  		if isSamePeer(p, peer) {
  1818  			idx = i
  1819  			found = true
  1820  			return
  1821  		}
  1822  	}
  1823  	return
  1824  }
  1825  
  1826  // Contains checks whether the key is in the region, for the maximum region endKey is empty.
  1827  // startKey <= key < endKey.
  1828  func (r *Region) Contains(key []byte) bool {
  1829  	return bytes.Compare(r.meta.GetStartKey(), key) <= 0 &&
  1830  		(bytes.Compare(key, r.meta.GetEndKey()) < 0 || len(r.meta.GetEndKey()) == 0)
  1831  }
  1832  
  1833  // ContainsByEnd check the region contains the greatest key that is less than key.
  1834  // for the maximum region endKey is empty.
  1835  // startKey < key <= endKey.
  1836  func (r *Region) ContainsByEnd(key []byte) bool {
  1837  	return bytes.Compare(r.meta.GetStartKey(), key) < 0 &&
  1838  		(bytes.Compare(key, r.meta.GetEndKey()) <= 0 || len(r.meta.GetEndKey()) == 0)
  1839  }
  1840  
  1841  // Store contains a kv process's address.
  1842  type Store struct {
  1843  	addr         string               // loaded store address
  1844  	saddr        string               // loaded store status address
  1845  	storeID      uint64               // store's id
  1846  	state        uint64               // unsafe store storeState
  1847  	labels       []*metapb.StoreLabel // stored store labels
  1848  	resolveMutex sync.Mutex           // protect pd from concurrent init requests
  1849  	epoch        uint32               // store fail epoch, see RegionStore.storeEpochs
  1850  	storeType    tikvrpc.EndpointType // type of the store
  1851  	tokenCount   atomic2.Int64        // used store token count
  1852  
  1853  	// whether the store is unreachable due to some reason, therefore requests to the store needs to be
  1854  	// forwarded by other stores. this is also the flag that a checkUntilHealth goroutine is running for this store.
  1855  	// this mechanism is currently only applicable for TiKV stores.
  1856  	unreachable      int32
  1857  	unreachableSince time.Time
  1858  }
  1859  
  1860  type resolveState uint64
  1861  
  1862  const (
  1863  	// The store is just created and normally is being resolved.
  1864  	// Store in this state will only be resolved by initResolve().
  1865  	unresolved resolveState = iota
  1866  	// The store is resolved and its address is valid.
  1867  	resolved
  1868  	// Request failed on this store and it will be re-resolved by asyncCheckAndResolveLoop().
  1869  	needCheck
  1870  	// The store's address or label is changed and marked deleted.
  1871  	// There is a new store struct replaced it in the RegionCache and should
  1872  	// call changeToActiveStore() to get the new struct.
  1873  	deleted
  1874  	// The store is a tombstone. Should invalidate the region if tries to access it.
  1875  	tombstone
  1876  )
  1877  
  1878  // IsTiFlash returns true if the storeType is TiFlash
  1879  func (s *Store) IsTiFlash() bool {
  1880  	return s.storeType == tikvrpc.TiFlash
  1881  }
  1882  
  1883  // StoreID returns storeID.
  1884  func (s *Store) StoreID() uint64 {
  1885  	return s.storeID
  1886  }
  1887  
  1888  // initResolve resolves the address of the store that never resolved and returns an
  1889  // empty string if it's a tombstone.
  1890  func (s *Store) initResolve(bo *retry.Backoffer, c *RegionCache) (addr string, err error) {
  1891  	s.resolveMutex.Lock()
  1892  	state := s.getResolveState()
  1893  	defer s.resolveMutex.Unlock()
  1894  	if state != unresolved {
  1895  		if state != tombstone {
  1896  			addr = s.addr
  1897  		}
  1898  		return
  1899  	}
  1900  	var store *metapb.Store
  1901  	for {
  1902  		store, err = c.pdClient.GetStore(bo.GetCtx(), s.storeID)
  1903  		if err != nil {
  1904  			metrics.RegionCacheCounterWithGetStoreError.Inc()
  1905  		} else {
  1906  			metrics.RegionCacheCounterWithGetStoreOK.Inc()
  1907  		}
  1908  		if bo.GetCtx().Err() != nil && errors.Cause(bo.GetCtx().Err()) == context.Canceled {
  1909  			return
  1910  		}
  1911  		if err != nil && !isStoreNotFoundError(err) {
  1912  			// TODO: more refine PD error status handle.
  1913  			err = errors.Errorf("loadStore from PD failed, id: %d, err: %v", s.storeID, err)
  1914  			if err = bo.Backoff(retry.BoPDRPC, err); err != nil {
  1915  				return
  1916  			}
  1917  			continue
  1918  		}
  1919  		// The store is a tombstone.
  1920  		if store == nil {
  1921  			s.setResolveState(tombstone)
  1922  			return "", nil
  1923  		}
  1924  		addr = store.GetAddress()
  1925  		if addr == "" {
  1926  			return "", errors.Errorf("empty store(%d) address", s.storeID)
  1927  		}
  1928  		s.addr = addr
  1929  		s.saddr = store.GetStatusAddress()
  1930  		s.storeType = tikvrpc.GetStoreTypeByMeta(store)
  1931  		s.labels = store.GetLabels()
  1932  		// Shouldn't have other one changing its state concurrently, but we still use changeResolveStateTo for safety.
  1933  		s.changeResolveStateTo(unresolved, resolved)
  1934  		return s.addr, nil
  1935  	}
  1936  }
  1937  
  1938  // A quick and dirty solution to find out whether an err is caused by StoreNotFound.
  1939  // todo: A better solution, maybe some err-code based error handling?
  1940  func isStoreNotFoundError(err error) bool {
  1941  	return strings.Contains(err.Error(), "invalid store ID") && strings.Contains(err.Error(), "not found")
  1942  }
  1943  
  1944  // reResolve try to resolve addr for store that need check. Returns false if the region is in tombstone state or is
  1945  // deleted.
  1946  func (s *Store) reResolve(c *RegionCache) (bool, error) {
  1947  	var addr string
  1948  	store, err := c.pdClient.GetStore(context.Background(), s.storeID)
  1949  	if err != nil {
  1950  		metrics.RegionCacheCounterWithGetStoreError.Inc()
  1951  	} else {
  1952  		metrics.RegionCacheCounterWithGetStoreOK.Inc()
  1953  	}
  1954  	// `err` here can mean either "load Store from PD failed" or "store not found"
  1955  	// If load Store from PD is successful but PD didn't find the store
  1956  	// the err should be handled by next `if` instead of here
  1957  	if err != nil && !isStoreNotFoundError(err) {
  1958  		logutil.BgLogger().Error("loadStore from PD failed", zap.Uint64("id", s.storeID), zap.Error(err))
  1959  		// we cannot do backoff in reResolve loop but try check other store and wait tick.
  1960  		return false, err
  1961  	}
  1962  	if store == nil {
  1963  		// store has be removed in PD, we should invalidate all regions using those store.
  1964  		logutil.BgLogger().Info("invalidate regions in removed store",
  1965  			zap.Uint64("store", s.storeID), zap.String("add", s.addr))
  1966  		atomic.AddUint32(&s.epoch, 1)
  1967  		s.setResolveState(tombstone)
  1968  		metrics.RegionCacheCounterWithInvalidateStoreRegionsOK.Inc()
  1969  		return false, nil
  1970  	}
  1971  
  1972  	storeType := tikvrpc.GetStoreTypeByMeta(store)
  1973  	addr = store.GetAddress()
  1974  	if s.addr != addr || !s.IsSameLabels(store.GetLabels()) {
  1975  		newStore := &Store{storeID: s.storeID, addr: addr, saddr: store.GetStatusAddress(), storeType: storeType, labels: store.GetLabels(), state: uint64(resolved)}
  1976  		c.storeMu.Lock()
  1977  		c.storeMu.stores[newStore.storeID] = newStore
  1978  		c.storeMu.Unlock()
  1979  		s.setResolveState(deleted)
  1980  		return false, nil
  1981  	}
  1982  	s.changeResolveStateTo(needCheck, resolved)
  1983  	return true, nil
  1984  }
  1985  
  1986  func (s *Store) getResolveState() resolveState {
  1987  	var state resolveState
  1988  	if s == nil {
  1989  		return state
  1990  	}
  1991  	return resolveState(atomic.LoadUint64(&s.state))
  1992  }
  1993  
  1994  func (s *Store) setResolveState(state resolveState) {
  1995  	atomic.StoreUint64(&s.state, uint64(state))
  1996  }
  1997  
  1998  // changeResolveStateTo changes the store resolveState from the old state to the new state.
  1999  // Returns true if it changes the state successfully, and false if the store's state
  2000  // is changed by another one.
  2001  func (s *Store) changeResolveStateTo(from, to resolveState) bool {
  2002  	for {
  2003  		state := s.getResolveState()
  2004  		if state == to {
  2005  			return true
  2006  		}
  2007  		if state != from {
  2008  			return false
  2009  		}
  2010  		if atomic.CompareAndSwapUint64(&s.state, uint64(from), uint64(to)) {
  2011  			return true
  2012  		}
  2013  	}
  2014  }
  2015  
  2016  // markNeedCheck marks resolved store to be async resolve to check store addr change.
  2017  func (s *Store) markNeedCheck(notifyCheckCh chan struct{}) {
  2018  	if s.changeResolveStateTo(resolved, needCheck) {
  2019  		select {
  2020  		case notifyCheckCh <- struct{}{}:
  2021  		default:
  2022  		}
  2023  	}
  2024  }
  2025  
  2026  // IsSameLabels returns whether the store have the same labels with target labels
  2027  func (s *Store) IsSameLabels(labels []*metapb.StoreLabel) bool {
  2028  	if len(s.labels) != len(labels) {
  2029  		return false
  2030  	}
  2031  	return s.IsLabelsMatch(labels)
  2032  }
  2033  
  2034  // IsLabelsMatch return whether the store's labels match the target labels
  2035  func (s *Store) IsLabelsMatch(labels []*metapb.StoreLabel) bool {
  2036  	if len(labels) < 1 {
  2037  		return true
  2038  	}
  2039  	for _, targetLabel := range labels {
  2040  		match := false
  2041  		for _, label := range s.labels {
  2042  			if targetLabel.Key == label.Key && targetLabel.Value == label.Value {
  2043  				match = true
  2044  				break
  2045  			}
  2046  		}
  2047  		if !match {
  2048  			return false
  2049  		}
  2050  	}
  2051  	return true
  2052  }
  2053  
  2054  type livenessState uint32
  2055  
  2056  var (
  2057  	livenessSf singleflight.Group
  2058  	// storeLivenessTimeout is the max duration of resolving liveness of a TiKV instance.
  2059  	storeLivenessTimeout time.Duration
  2060  )
  2061  
  2062  // SetStoreLivenessTimeout sets storeLivenessTimeout to t.
  2063  func SetStoreLivenessTimeout(t time.Duration) {
  2064  	storeLivenessTimeout = t
  2065  }
  2066  
  2067  // GetStoreLivenessTimeout returns storeLivenessTimeout.
  2068  func GetStoreLivenessTimeout() time.Duration {
  2069  	return storeLivenessTimeout
  2070  }
  2071  
  2072  const (
  2073  	unknown livenessState = iota
  2074  	reachable
  2075  	unreachable
  2076  )
  2077  
  2078  func (s *Store) startHealthCheckLoopIfNeeded(c *RegionCache) {
  2079  	// This mechanism doesn't support non-TiKV stores currently.
  2080  	if s.storeType != tikvrpc.TiKV {
  2081  		logutil.BgLogger().Info("[health check] skip running health check loop for non-tikv store",
  2082  			zap.Uint64("storeID", s.storeID), zap.String("addr", s.addr))
  2083  		return
  2084  	}
  2085  
  2086  	// It may be already started by another thread.
  2087  	if atomic.CompareAndSwapInt32(&s.unreachable, 0, 1) {
  2088  		s.unreachableSince = time.Now()
  2089  		go s.checkUntilHealth(c)
  2090  	}
  2091  }
  2092  
  2093  func (s *Store) checkUntilHealth(c *RegionCache) {
  2094  	defer atomic.CompareAndSwapInt32(&s.unreachable, 1, 0)
  2095  
  2096  	ticker := time.NewTicker(time.Second)
  2097  	lastCheckPDTime := time.Now()
  2098  
  2099  	// TODO(MyonKeminta): Set a more proper ctx here so that it can be interrupted immediately when the RegionCache is
  2100  	// shutdown.
  2101  	ctx := context.Background()
  2102  	for {
  2103  		select {
  2104  		case <-c.closeCh:
  2105  			return
  2106  		case <-ticker.C:
  2107  			if time.Since(lastCheckPDTime) > time.Second*30 {
  2108  				lastCheckPDTime = time.Now()
  2109  
  2110  				valid, err := s.reResolve(c)
  2111  				if err != nil {
  2112  					logutil.BgLogger().Warn("[health check] failed to re-resolve unhealthy store", zap.Error(err))
  2113  				} else if !valid {
  2114  					logutil.BgLogger().Info("[health check] store meta deleted, stop checking", zap.Uint64("storeID", s.storeID), zap.String("addr", s.addr))
  2115  					return
  2116  				}
  2117  			}
  2118  
  2119  			bo := retry.NewNoopBackoff(ctx)
  2120  			l := s.requestLiveness(bo, c)
  2121  			if l == reachable {
  2122  				logutil.BgLogger().Info("[health check] store became reachable", zap.Uint64("storeID", s.storeID))
  2123  
  2124  				return
  2125  			}
  2126  		}
  2127  	}
  2128  }
  2129  
  2130  func (s *Store) requestLiveness(bo *retry.Backoffer, c *RegionCache) (l livenessState) {
  2131  	if c != nil && c.testingKnobs.mockRequestLiveness != nil {
  2132  		return c.testingKnobs.mockRequestLiveness(s, bo)
  2133  	}
  2134  
  2135  	if storeLivenessTimeout == 0 {
  2136  		return unreachable
  2137  	}
  2138  
  2139  	if s.getResolveState() != resolved {
  2140  		l = unknown
  2141  		return
  2142  	}
  2143  	addr := s.addr
  2144  	rsCh := livenessSf.DoChan(addr, func() (interface{}, error) {
  2145  		return invokeKVStatusAPI(addr, storeLivenessTimeout), nil
  2146  	})
  2147  	var ctx context.Context
  2148  	if bo != nil {
  2149  		ctx = bo.GetCtx()
  2150  	} else {
  2151  		ctx = context.Background()
  2152  	}
  2153  	select {
  2154  	case rs := <-rsCh:
  2155  		l = rs.Val.(livenessState)
  2156  	case <-ctx.Done():
  2157  		l = unknown
  2158  		return
  2159  	}
  2160  	return
  2161  }
  2162  
  2163  // GetAddr returns the address of the store
  2164  func (s *Store) GetAddr() string {
  2165  	return s.addr
  2166  }
  2167  
  2168  func invokeKVStatusAPI(addr string, timeout time.Duration) (l livenessState) {
  2169  	start := time.Now()
  2170  	defer func() {
  2171  		if l == reachable {
  2172  			metrics.StatusCountWithOK.Inc()
  2173  		} else {
  2174  			metrics.StatusCountWithError.Inc()
  2175  		}
  2176  		metrics.TiKVStatusDuration.WithLabelValues(addr).Observe(time.Since(start).Seconds())
  2177  	}()
  2178  	ctx, cancel := context.WithTimeout(context.Background(), timeout)
  2179  	defer cancel()
  2180  
  2181  	conn, cli, err := createKVHealthClient(ctx, addr)
  2182  	if err != nil {
  2183  		logutil.BgLogger().Info("[health check] create grpc connection failed", zap.String("store", addr), zap.Error(err))
  2184  		l = unreachable
  2185  		return
  2186  	}
  2187  	defer func() {
  2188  		err := conn.Close()
  2189  		if err != nil {
  2190  			logutil.BgLogger().Info("[health check] failed to close the grpc connection for health check", zap.String("store", addr), zap.Error(err))
  2191  		}
  2192  	}()
  2193  
  2194  	req := &healthpb.HealthCheckRequest{}
  2195  	resp, err := cli.Check(ctx, req)
  2196  	if err != nil {
  2197  		logutil.BgLogger().Info("[health check] check health error", zap.String("store", addr), zap.Error(err))
  2198  		l = unreachable
  2199  		return
  2200  	}
  2201  
  2202  	status := resp.GetStatus()
  2203  	if status == healthpb.HealthCheckResponse_UNKNOWN {
  2204  		logutil.BgLogger().Info("[health check] check health returns unknown", zap.String("store", addr))
  2205  		l = unknown
  2206  		return
  2207  	}
  2208  
  2209  	if status != healthpb.HealthCheckResponse_SERVING {
  2210  		logutil.BgLogger().Info("[health check] service not serving", zap.Stringer("status", status))
  2211  		l = unreachable
  2212  		return
  2213  	}
  2214  
  2215  	l = reachable
  2216  	return
  2217  }
  2218  
  2219  func createKVHealthClient(ctx context.Context, addr string) (*grpc.ClientConn, healthpb.HealthClient, error) {
  2220  	// Temporarily directly load the config from the global config, however it's not a good idea to let RegionCache to
  2221  	// access it.
  2222  	// TODO: Pass the config in a better way, or use the connArray inner the client directly rather than creating new
  2223  	// connection.
  2224  
  2225  	cfg := config.GetGlobalConfig()
  2226  
  2227  	opt := grpc.WithInsecure()
  2228  	if len(cfg.Security.ClusterSSLCA) != 0 {
  2229  		tlsConfig, err := cfg.Security.ToTLSConfig()
  2230  		if err != nil {
  2231  			return nil, nil, errors.Trace(err)
  2232  		}
  2233  		opt = grpc.WithTransportCredentials(credentials.NewTLS(tlsConfig))
  2234  	}
  2235  	keepAlive := cfg.TiKVClient.GrpcKeepAliveTime
  2236  	keepAliveTimeout := cfg.TiKVClient.GrpcKeepAliveTimeout
  2237  	conn, err := grpc.DialContext(
  2238  		ctx,
  2239  		addr,
  2240  		opt,
  2241  		grpc.WithInitialWindowSize(client.GrpcInitialWindowSize),
  2242  		grpc.WithInitialConnWindowSize(client.GrpcInitialConnWindowSize),
  2243  		grpc.WithConnectParams(grpc.ConnectParams{
  2244  			Backoff: backoff.Config{
  2245  				BaseDelay:  100 * time.Millisecond, // Default was 1s.
  2246  				Multiplier: 1.6,                    // Default
  2247  				Jitter:     0.2,                    // Default
  2248  				MaxDelay:   3 * time.Second,        // Default was 120s.
  2249  			},
  2250  			MinConnectTimeout: 5 * time.Second,
  2251  		}),
  2252  		grpc.WithKeepaliveParams(keepalive.ClientParameters{
  2253  			Time:                time.Duration(keepAlive) * time.Second,
  2254  			Timeout:             time.Duration(keepAliveTimeout) * time.Second,
  2255  			PermitWithoutStream: true,
  2256  		}),
  2257  	)
  2258  	if err != nil {
  2259  		return nil, nil, errors.Trace(err)
  2260  	}
  2261  	cli := healthpb.NewHealthClient(conn)
  2262  	return conn, cli, nil
  2263  }
  2264  
  2265  func isSamePeer(lhs *metapb.Peer, rhs *metapb.Peer) bool {
  2266  	return lhs == rhs || (lhs.GetId() == rhs.GetId() && lhs.GetStoreId() == rhs.GetStoreId())
  2267  }