github.com/KinWaiYuen/client-go/v2@v2.5.4/internal/locate/region_request.go

     1  // Copyright 2021 TiKV Authors
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  // NOTE: The code in this file is based on code from the
    16  // TiDB project, licensed under the Apache License v 2.0
    17  //
    18  // https://github.com/pingcap/tidb/tree/cc5e161ac06827589c4966674597c137cc9e809c/store/tikv/locate/region_request.go
    19  //
    20  
    21  // Copyright 2016 PingCAP, Inc.
    22  //
    23  // Licensed under the Apache License, Version 2.0 (the "License");
    24  // you may not use this file except in compliance with the License.
    25  // You may obtain a copy of the License at
    26  //
    27  //     http://www.apache.org/licenses/LICENSE-2.0
    28  //
    29  // Unless required by applicable law or agreed to in writing, software
    30  // distributed under the License is distributed on an "AS IS" BASIS,
    31  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    32  // See the License for the specific language governing permissions and
    33  // limitations under the License.
    34  
    35  package locate
    36  
    37  import (
    38  	"context"
    39  	"fmt"
    40  	"math/rand"
    41  	"strconv"
    42  	"strings"
    43  	"sync"
    44  	"sync/atomic"
    45  	"time"
    46  
    47  	"go.uber.org/zap"
    48  	"google.golang.org/grpc/codes"
    49  	"google.golang.org/grpc/status"
    50  
    51  	tikverr "github.com/KinWaiYuen/client-go/v2/error"
    52  	"github.com/KinWaiYuen/client-go/v2/internal/client"
    53  	"github.com/KinWaiYuen/client-go/v2/internal/logutil"
    54  	"github.com/KinWaiYuen/client-go/v2/internal/retry"
    55  	"github.com/KinWaiYuen/client-go/v2/kv"
    56  	"github.com/KinWaiYuen/client-go/v2/metrics"
    57  	"github.com/KinWaiYuen/client-go/v2/tikvrpc"
    58  	"github.com/KinWaiYuen/client-go/v2/util"
    59  	"github.com/opentracing/opentracing-go"
    60  	"github.com/pingcap/errors"
    61  	"github.com/pingcap/kvproto/pkg/coprocessor"
    62  	"github.com/pingcap/kvproto/pkg/errorpb"
    63  	"github.com/pingcap/kvproto/pkg/kvrpcpb"
    64  	"github.com/pingcap/kvproto/pkg/metapb"
    65  )
    66  
     67  // shuttingDown is a flag to indicate that tidb-server is exiting (a Ctrl+C signal
     68  // received, for example). If this flag is set, the tikv client should not retry on
     69  // network errors because tidb-server expects the tikv client to exit as soon as possible.
    70  var shuttingDown uint32
    71  
    72  // StoreShuttingDown atomically stores ShuttingDown into v.
    73  func StoreShuttingDown(v uint32) {
    74  	atomic.StoreUint32(&shuttingDown, v)
    75  }
    76  
    77  // LoadShuttingDown atomically loads ShuttingDown.
    78  func LoadShuttingDown() uint32 {
    79  	return atomic.LoadUint32(&shuttingDown)
    80  }
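
         // Example (illustrative sketch, not part of the original source): a server's
         // shutdown path can set this flag so that in-flight requests stop retrying on
         // network errors. The shutdown channel below is hypothetical.
         //
         //	go func() {
         //		<-shutdownCh // closed when a Ctrl+C signal is received
         //		StoreShuttingDown(1)
         //	}()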
    81  
    82  // RegionRequestSender sends KV/Cop requests to tikv server. It handles network
    83  // errors and some region errors internally.
    84  //
    85  // Typically, a KV/Cop request is bind to a region, all keys that are involved
    86  // in the request should be located in the region.
    87  // The sending process begins with looking for the address of leader store's
    88  // address of the target region from cache, and the request is then sent to the
    89  // destination tikv server over TCP connection.
    90  // If region is updated, can be caused by leader transfer, region split, region
    91  // merge, or region balance, tikv server may not able to process request and
    92  // send back a RegionError.
    93  // RegionRequestSender takes care of errors that does not relevant to region
    94  // range, such as 'I/O timeout', 'NotLeader', and 'ServerIsBusy'. If fails to
    95  // send the request to all replicas, a fake rregion error may be returned.
    96  // Caller which receives the error should retry the request.
    97  //
    98  // For other region errors, since region range have changed, the request may need to
    99  // split, so we simply return the error to caller.
   100  type RegionRequestSender struct {
   101  	regionCache       *RegionCache
   102  	client            client.Client
   103  	storeAddr         string
   104  	rpcError          error
   105  	replicaSelector   *replicaSelector
   106  	failStoreIDs      map[uint64]struct{}
   107  	failProxyStoreIDs map[uint64]struct{}
   108  	RegionRequestRuntimeStats
   109  }
   110  
   111  // RegionRequestRuntimeStats records the runtime stats of send region requests.
   112  type RegionRequestRuntimeStats struct {
   113  	Stats map[tikvrpc.CmdType]*RPCRuntimeStats
   114  }
   115  
   116  // NewRegionRequestRuntimeStats returns a new RegionRequestRuntimeStats.
   117  func NewRegionRequestRuntimeStats() RegionRequestRuntimeStats {
   118  	return RegionRequestRuntimeStats{
   119  		Stats: make(map[tikvrpc.CmdType]*RPCRuntimeStats),
   120  	}
   121  }
   122  
   123  // RPCRuntimeStats indicates the RPC request count and consume time.
   124  type RPCRuntimeStats struct {
   125  	Count int64
    126  	// Consume is the time consumed by sending region requests.
   127  	Consume int64
   128  }
   129  
   130  // String implements fmt.Stringer interface.
   131  func (r *RegionRequestRuntimeStats) String() string {
   132  	var builder strings.Builder
   133  	for k, v := range r.Stats {
   134  		if builder.Len() > 0 {
   135  			builder.WriteByte(',')
   136  		}
    137  	// Equivalent to: fmt.Sprintf("%s:{num_rpc:%v, total_time:%s}", k.String(), v.Count, util.FormatDuration(time.Duration(v.Consume)))
   138  		builder.WriteString(k.String())
   139  		builder.WriteString(":{num_rpc:")
   140  		builder.WriteString(strconv.FormatInt(v.Count, 10))
   141  		builder.WriteString(", total_time:")
   142  		builder.WriteString(util.FormatDuration(time.Duration(v.Consume)))
   143  		builder.WriteString("}")
   144  	}
   145  	return builder.String()
   146  }
   147  
   148  // Clone returns a copy of itself.
   149  func (r *RegionRequestRuntimeStats) Clone() RegionRequestRuntimeStats {
   150  	newRs := NewRegionRequestRuntimeStats()
   151  	for cmd, v := range r.Stats {
   152  		newRs.Stats[cmd] = &RPCRuntimeStats{
   153  			Count:   v.Count,
   154  			Consume: v.Consume,
   155  		}
   156  	}
   157  	return newRs
   158  }
   159  
   160  // Merge merges other RegionRequestRuntimeStats.
   161  func (r *RegionRequestRuntimeStats) Merge(rs RegionRequestRuntimeStats) {
   162  	for cmd, v := range rs.Stats {
   163  		stat, ok := r.Stats[cmd]
   164  		if !ok {
   165  			r.Stats[cmd] = &RPCRuntimeStats{
   166  				Count:   v.Count,
   167  				Consume: v.Consume,
   168  			}
   169  			continue
   170  		}
   171  		stat.Count += v.Count
   172  		stat.Consume += v.Consume
   173  	}
   174  }
   175  
   176  // RecordRegionRequestRuntimeStats records request runtime stats.
   177  func RecordRegionRequestRuntimeStats(stats map[tikvrpc.CmdType]*RPCRuntimeStats, cmd tikvrpc.CmdType, d time.Duration) {
   178  	stat, ok := stats[cmd]
   179  	if !ok {
   180  		stats[cmd] = &RPCRuntimeStats{
   181  			Count:   1,
   182  			Consume: int64(d),
   183  		}
   184  		return
   185  	}
   186  	stat.Count++
   187  	stat.Consume += int64(d)
   188  }
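
         // Example (illustrative sketch, not part of the original source): how the
         // runtime stats above are typically collected around a single RPC and then
         // rendered. The request type and elapsed time are placeholders.
         //
         //	stats := NewRegionRequestRuntimeStats()
         //	start := time.Now()
         //	// ... send a tikvrpc.CmdGet request ...
         //	RecordRegionRequestRuntimeStats(stats.Stats, tikvrpc.CmdGet, time.Since(start))
         //	fmt.Println(stats.String()) // e.g. "Get:{num_rpc:1, total_time:1ms}"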
   189  
   190  // NewRegionRequestSender creates a new sender.
   191  func NewRegionRequestSender(regionCache *RegionCache, client client.Client) *RegionRequestSender {
   192  	return &RegionRequestSender{
   193  		regionCache: regionCache,
   194  		client:      client,
   195  	}
   196  }
   197  
   198  // GetRegionCache returns the region cache.
   199  func (s *RegionRequestSender) GetRegionCache() *RegionCache {
   200  	return s.regionCache
   201  }
   202  
   203  // GetClient returns the RPC client.
   204  func (s *RegionRequestSender) GetClient() client.Client {
   205  	return s.client
   206  }
   207  
   208  // SetStoreAddr specifies the dest store address.
   209  func (s *RegionRequestSender) SetStoreAddr(addr string) {
   210  	s.storeAddr = addr
   211  }
   212  
   213  // GetStoreAddr returns the dest store address.
   214  func (s *RegionRequestSender) GetStoreAddr() string {
   215  	return s.storeAddr
   216  }
   217  
   218  // GetRPCError returns the RPC error.
   219  func (s *RegionRequestSender) GetRPCError() error {
   220  	return s.rpcError
   221  }
   222  
    223  // SetRPCError rewrites the RPC error.
   224  func (s *RegionRequestSender) SetRPCError(err error) {
   225  	s.rpcError = err
   226  }
   227  
    228  // SendReq sends a request to the tikv server. If it fails to send the request to all replicas,
    229  // a fake region error may be returned. The caller that receives the error should retry the request.
   230  func (s *RegionRequestSender) SendReq(bo *retry.Backoffer, req *tikvrpc.Request, regionID RegionVerID, timeout time.Duration) (*tikvrpc.Response, error) {
   231  	resp, _, err := s.SendReqCtx(bo, req, regionID, timeout, tikvrpc.TiKV)
   232  	return resp, err
   233  }
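
         // Example (illustrative sketch, not part of the original source): a typical
         // caller locates the region for a key, builds a sender, and sends a request.
         // `cache` and `rpcClient` are assumed to be an initialized *RegionCache and
         // client.Client; error handling is elided.
         //
         //	bo := retry.NewBackoffer(context.Background(), 20000)
         //	loc, _ := cache.LocateKey(bo, []byte("k"))
         //	req := tikvrpc.NewRequest(tikvrpc.CmdGet, &kvrpcpb.GetRequest{Key: []byte("k")})
         //	sender := NewRegionRequestSender(cache, rpcClient)
         //	resp, err := sender.SendReq(bo, req, loc.Region, time.Second)
         //	// If resp carries a region error (possibly a fake one), the caller should
         //	// re-locate the region and retry.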
   234  
   235  type replica struct {
   236  	store    *Store
   237  	peer     *metapb.Peer
   238  	epoch    uint32
   239  	attempts int
   240  }
   241  
   242  func (r *replica) isEpochStale() bool {
   243  	return r.epoch != atomic.LoadUint32(&r.store.epoch)
   244  }
   245  
   246  func (r *replica) isExhausted(maxAttempt int) bool {
   247  	return r.attempts >= maxAttempt
   248  }
   249  
   250  type replicaSelector struct {
   251  	regionCache *RegionCache
   252  	region      *Region
   253  	regionStore *regionStore
   254  	replicas    []*replica
   255  	state       selectorState
   256  	// replicas[targetIdx] is the replica handling the request this time
   257  	targetIdx AccessIndex
   258  	// replicas[proxyIdx] is the store used to redirect requests this time
   259  	proxyIdx AccessIndex
   260  }
   261  
   262  // selectorState is the interface of states of the replicaSelector.
   263  // Here is the main state transition diagram:
   264  //
   265  //                                    exceeding maxReplicaAttempt
   266  //           +-------------------+   || RPC failure && unreachable && no forwarding
   267  // +-------->+ accessKnownLeader +----------------+
   268  // |         +------+------------+                |
   269  // |                |                             |
   270  // |                | RPC failure                 v
   271  // |                | && unreachable        +-----+-----+
   272  // |                | && enable forwarding  |tryFollower+------+
   273  // |                |                       +-----------+      |
   274  // | leader becomes v                                          | all followers
   275  // | reachable +----+-------------+                            | are tried
   276  // +-----------+accessByKnownProxy|                            |
   277  // ^           +------+-----------+                            |
   278  // |                  |                           +-------+    |
   279  // |                  | RPC failure               |backoff+<---+
   280  // | leader becomes   v                           +---+---+
   281  // | reachable  +-----+-----+ all proxies are tried   ^
   282  // +------------+tryNewProxy+-------------------------+
   283  //              +-----------+
   284  type selectorState interface {
   285  	next(*retry.Backoffer, *replicaSelector) (*RPCContext, error)
   286  	onSendSuccess(*replicaSelector)
   287  	onSendFailure(*retry.Backoffer, *replicaSelector, error)
   288  	onNoLeader(*replicaSelector)
   289  }
   290  
   291  type stateChanged struct{}
   292  
   293  func (c stateChanged) Error() string {
   294  	return "replicaSelector state changed"
   295  }
   296  
   297  type stateBase struct{}
   298  
   299  func (s stateBase) next(bo *retry.Backoffer, selector *replicaSelector) (*RPCContext, error) {
   300  	return nil, nil
   301  }
   302  
   303  func (s stateBase) onSendSuccess(selector *replicaSelector) {
   304  }
   305  
   306  func (s stateBase) onSendFailure(backoffer *retry.Backoffer, selector *replicaSelector, err error) {
   307  }
   308  
   309  func (s stateBase) onNoLeader(selector *replicaSelector) {
   310  }
   311  
    312  // accessKnownLeader is the state where we are sending requests
    313  // to the replica we suppose to be the leader.
    314  //
    315  // After attempting maxReplicaAttempt times without success
    316  // and without receiving a new leader from the error responses,
    317  // we should switch to the tryFollower state.
   318  type accessKnownLeader struct {
   319  	stateBase
   320  	leaderIdx AccessIndex
   321  }
   322  
   323  func (state *accessKnownLeader) next(bo *retry.Backoffer, selector *replicaSelector) (*RPCContext, error) {
   324  	leader := selector.replicas[state.leaderIdx]
   325  	if leader.isExhausted(maxReplicaAttempt) {
   326  		selector.state = &tryFollower{leaderIdx: state.leaderIdx, lastIdx: state.leaderIdx}
   327  		return nil, stateChanged{}
   328  	}
   329  	selector.targetIdx = state.leaderIdx
   330  	return selector.buildRPCContext(bo)
   331  }
   332  
   333  func (state *accessKnownLeader) onSendFailure(bo *retry.Backoffer, selector *replicaSelector, cause error) {
   334  	liveness := selector.checkLiveness(bo, selector.targetReplica())
   335  	if liveness != reachable && len(selector.replicas) > 1 && selector.regionCache.enableForwarding {
   336  		selector.state = &accessByKnownProxy{leaderIdx: state.leaderIdx}
   337  		return
   338  	}
   339  	if liveness != reachable || selector.targetReplica().isExhausted(maxReplicaAttempt) {
   340  		selector.state = &tryFollower{leaderIdx: state.leaderIdx, lastIdx: state.leaderIdx}
   341  	}
   342  	if liveness != reachable {
   343  		selector.invalidateReplicaStore(selector.targetReplica(), cause)
   344  	}
   345  }
   346  
   347  func (state *accessKnownLeader) onNoLeader(selector *replicaSelector) {
   348  	selector.state = &tryFollower{leaderIdx: state.leaderIdx, lastIdx: state.leaderIdx}
   349  }
   350  
   351  // tryFollower is the state where we cannot access the known leader
   352  // but still try other replicas in case they have become the leader.
   353  //
    354  // In this state, a follower that has not been tried yet will be used. If all
    355  // followers have been tried, we consider the replicas exhausted.
    356  // On sending failure in this state, if leader info is returned,
    357  // the leader will be updated to replicas[0] and given another chance.
   358  type tryFollower struct {
   359  	stateBase
   360  	leaderIdx AccessIndex
   361  	lastIdx   AccessIndex
   362  }
   363  
   364  func (state *tryFollower) next(bo *retry.Backoffer, selector *replicaSelector) (*RPCContext, error) {
   365  	var targetReplica *replica
    366  	// Search for a replica that has not been attempted, starting from the last accessed replica
   367  	for i := 1; i < len(selector.replicas); i++ {
   368  		idx := AccessIndex((int(state.lastIdx) + i) % len(selector.replicas))
   369  		if idx == state.leaderIdx {
   370  			continue
   371  		}
   372  		targetReplica = selector.replicas[idx]
   373  		// Each follower is only tried once
   374  		if !targetReplica.isExhausted(1) {
   375  			state.lastIdx = idx
   376  			selector.targetIdx = idx
   377  			break
   378  		}
   379  	}
   380  	// If all followers are tried and fail, backoff and retry.
   381  	if selector.targetIdx < 0 {
   382  		metrics.TiKVReplicaSelectorFailureCounter.WithLabelValues("exhausted").Inc()
   383  		selector.invalidateRegion()
   384  		return nil, nil
   385  	}
   386  	return selector.buildRPCContext(bo)
   387  }
   388  
   389  func (state *tryFollower) onSendSuccess(selector *replicaSelector) {
   390  	if !selector.regionCache.switchWorkLeaderToPeer(selector.region, selector.targetReplica().peer) {
   391  		panic("the store must exist")
   392  	}
   393  }
   394  
   395  func (state *tryFollower) onSendFailure(bo *retry.Backoffer, selector *replicaSelector, cause error) {
   396  	if selector.checkLiveness(bo, selector.targetReplica()) != reachable {
   397  		selector.invalidateReplicaStore(selector.targetReplica(), cause)
   398  	}
   399  }
   400  
   401  // accessByKnownProxy is the state where we are sending requests through
   402  // regionStore.proxyTiKVIdx as a proxy.
   403  type accessByKnownProxy struct {
   404  	stateBase
   405  	leaderIdx AccessIndex
   406  }
   407  
   408  func (state *accessByKnownProxy) next(bo *retry.Backoffer, selector *replicaSelector) (*RPCContext, error) {
   409  	leader := selector.replicas[state.leaderIdx]
   410  	if atomic.LoadInt32(&leader.store.unreachable) == 0 {
   411  		selector.regionStore.unsetProxyStoreIfNeeded(selector.region)
   412  		selector.state = &accessKnownLeader{leaderIdx: state.leaderIdx}
   413  		return nil, stateChanged{}
   414  	}
   415  
   416  	if selector.regionStore.proxyTiKVIdx >= 0 {
   417  		selector.targetIdx = state.leaderIdx
   418  		selector.proxyIdx = selector.regionStore.proxyTiKVIdx
   419  		return selector.buildRPCContext(bo)
   420  	}
   421  
   422  	selector.state = &tryNewProxy{leaderIdx: state.leaderIdx}
   423  	return nil, stateChanged{}
   424  }
   425  
   426  func (state *accessByKnownProxy) onSendFailure(bo *retry.Backoffer, selector *replicaSelector, cause error) {
   427  	selector.state = &tryNewProxy{leaderIdx: state.leaderIdx}
   428  	if selector.checkLiveness(bo, selector.proxyReplica()) != reachable {
   429  		selector.invalidateReplicaStore(selector.proxyReplica(), cause)
   430  	}
   431  }
   432  
   433  func (state *accessByKnownProxy) onNoLeader(selector *replicaSelector) {
   434  	selector.state = &invalidLeader{}
   435  }
   436  
   437  // tryNewProxy is the state where we try to find a node from followers as proxy.
   438  type tryNewProxy struct {
   439  	stateBase
   440  	leaderIdx AccessIndex
   441  }
   442  
   443  func (state *tryNewProxy) next(bo *retry.Backoffer, selector *replicaSelector) (*RPCContext, error) {
   444  	leader := selector.replicas[state.leaderIdx]
   445  	if atomic.LoadInt32(&leader.store.unreachable) == 0 {
   446  		selector.regionStore.unsetProxyStoreIfNeeded(selector.region)
   447  		selector.state = &accessKnownLeader{leaderIdx: state.leaderIdx}
   448  		return nil, stateChanged{}
   449  	}
   450  
   451  	candidateNum := 0
   452  	for idx, replica := range selector.replicas {
   453  		if state.isCandidate(AccessIndex(idx), replica) {
   454  			candidateNum++
   455  		}
   456  	}
   457  
   458  	// If all followers are tried as a proxy and fail, mark the leader store invalid, then backoff and retry.
   459  	if candidateNum == 0 {
   460  		metrics.TiKVReplicaSelectorFailureCounter.WithLabelValues("exhausted").Inc()
   461  		selector.invalidateReplicaStore(leader, errors.Errorf("all followers are tried as proxy but fail"))
   462  		selector.region.scheduleReload()
   463  		return nil, nil
   464  	}
   465  
   466  	// Skip advanceCnt valid candidates to find a proxy peer randomly
   467  	advanceCnt := rand.Intn(candidateNum)
   468  	for idx, replica := range selector.replicas {
   469  		if !state.isCandidate(AccessIndex(idx), replica) {
   470  			continue
   471  		}
   472  		if advanceCnt == 0 {
   473  			selector.targetIdx = state.leaderIdx
   474  			selector.proxyIdx = AccessIndex(idx)
   475  			break
   476  		}
   477  		advanceCnt--
   478  	}
   479  	return selector.buildRPCContext(bo)
   480  }
   481  
   482  func (state *tryNewProxy) isCandidate(idx AccessIndex, replica *replica) bool {
   483  	// Try each peer only once
   484  	return idx != state.leaderIdx && !replica.isExhausted(1)
   485  }
   486  
   487  func (state *tryNewProxy) onSendSuccess(selector *replicaSelector) {
   488  	selector.regionStore.setProxyStoreIdx(selector.region, selector.proxyIdx)
   489  }
   490  
   491  func (state *tryNewProxy) onSendFailure(bo *retry.Backoffer, selector *replicaSelector, cause error) {
   492  	if selector.checkLiveness(bo, selector.proxyReplica()) != reachable {
   493  		selector.invalidateReplicaStore(selector.proxyReplica(), cause)
   494  	}
   495  }
   496  
   497  func (state *tryNewProxy) onNoLeader(selector *replicaSelector) {
   498  	selector.state = &invalidLeader{}
   499  }
   500  
   501  // accessFollower is the state where we are sending requests to TiKV followers.
   502  // If there is no suitable follower, requests will be sent to the leader as a fallback.
   503  type accessFollower struct {
   504  	stateBase
   505  	// If tryLeader is true, the request can also be sent to the leader.
   506  	tryLeader         bool
   507  	isGlobalStaleRead bool
   508  	option            storeSelectorOp
   509  	leaderIdx         AccessIndex
   510  	lastIdx           AccessIndex
   511  }
   512  
   513  func (state *accessFollower) next(bo *retry.Backoffer, selector *replicaSelector) (*RPCContext, error) {
   514  	if state.lastIdx < 0 {
   515  		if state.tryLeader {
   516  			state.lastIdx = AccessIndex(rand.Intn(len(selector.replicas)))
   517  		} else {
   518  			if len(selector.replicas) <= 1 {
   519  				state.lastIdx = state.leaderIdx
   520  			} else {
   521  				// Randomly select a non-leader peer
   522  				state.lastIdx = AccessIndex(rand.Intn(len(selector.replicas) - 1))
   523  				if state.lastIdx >= state.leaderIdx {
   524  					state.lastIdx++
   525  				}
   526  			}
   527  		}
   528  	} else {
    529  		// A stale read request will retry the leader or the next peer on error.
    530  		// If txnScope is global, we will only retry the leader by using the WithLeaderOnly option;
    531  		// if txnScope is local, we will retry both other peers and the leader following the strategy of replicaSelector.
   532  		if state.isGlobalStaleRead {
   533  			WithLeaderOnly()(&state.option)
   534  		}
   535  		state.lastIdx++
   536  	}
   537  
   538  	for i := 0; i < len(selector.replicas) && !state.option.leaderOnly; i++ {
   539  		idx := AccessIndex((int(state.lastIdx) + i) % len(selector.replicas))
   540  		if state.isCandidate(idx, selector.replicas[idx]) {
   541  			state.lastIdx = idx
   542  			selector.targetIdx = idx
   543  			break
   544  		}
   545  	}
   546  	// If there is no candidate, fallback to the leader.
   547  	if selector.targetIdx < 0 {
   548  		if len(state.option.labels) > 0 {
   549  			logutil.BgLogger().Warn("unable to find stores with given labels")
   550  		}
   551  		leader := selector.replicas[state.leaderIdx]
   552  		if leader.isEpochStale() || leader.isExhausted(1) {
   553  			metrics.TiKVReplicaSelectorFailureCounter.WithLabelValues("exhausted").Inc()
   554  			selector.invalidateRegion()
   555  			return nil, nil
   556  		}
   557  		state.lastIdx = state.leaderIdx
   558  		selector.targetIdx = state.leaderIdx
   559  	}
   560  	return selector.buildRPCContext(bo)
   561  }
   562  
   563  func (state *accessFollower) onSendFailure(bo *retry.Backoffer, selector *replicaSelector, cause error) {
   564  	if selector.checkLiveness(bo, selector.targetReplica()) != reachable {
   565  		selector.invalidateReplicaStore(selector.targetReplica(), cause)
   566  	}
   567  }
   568  
   569  func (state *accessFollower) isCandidate(idx AccessIndex, replica *replica) bool {
   570  	return !replica.isEpochStale() && !replica.isExhausted(1) &&
   571  		// The request can only be sent to the leader.
   572  		((state.option.leaderOnly && idx == state.leaderIdx) ||
   573  			// Choose a replica with matched labels.
   574  			(!state.option.leaderOnly && (state.tryLeader || idx != state.leaderIdx) && replica.store.IsLabelsMatch(state.option.labels)))
   575  }
   576  
   577  type invalidStore struct {
   578  	stateBase
   579  }
   580  
   581  func (state *invalidStore) next(_ *retry.Backoffer, _ *replicaSelector) (*RPCContext, error) {
   582  	metrics.TiKVReplicaSelectorFailureCounter.WithLabelValues("invalidStore").Inc()
   583  	return nil, nil
   584  }
   585  
   586  // TODO(sticnarf): If using request forwarding and the leader is unknown, try other followers
   587  // instead of just switching to this state to backoff and retry.
   588  type invalidLeader struct {
   589  	stateBase
   590  }
   591  
   592  func (state *invalidLeader) next(_ *retry.Backoffer, _ *replicaSelector) (*RPCContext, error) {
   593  	metrics.TiKVReplicaSelectorFailureCounter.WithLabelValues("invalidLeader").Inc()
   594  	return nil, nil
   595  }
   596  
   597  // newReplicaSelector creates a replicaSelector which selects replicas according to reqType and opts.
   598  // opts is currently only effective for follower read.
   599  func newReplicaSelector(regionCache *RegionCache, regionID RegionVerID, req *tikvrpc.Request, opts ...StoreSelectorOption) (*replicaSelector, error) {
   600  	cachedRegion := regionCache.GetCachedRegionWithRLock(regionID)
   601  	if cachedRegion == nil || !cachedRegion.isValid() {
   602  		return nil, nil
   603  	}
   604  	regionStore := cachedRegion.getStore()
   605  	replicas := make([]*replica, 0, regionStore.accessStoreNum(tiKVOnly))
   606  	for _, storeIdx := range regionStore.accessIndex[tiKVOnly] {
   607  		replicas = append(replicas, &replica{
   608  			store:    regionStore.stores[storeIdx],
   609  			peer:     cachedRegion.meta.Peers[storeIdx],
   610  			epoch:    regionStore.storeEpochs[storeIdx],
   611  			attempts: 0,
   612  		})
   613  	}
   614  	var state selectorState
   615  	if !req.ReplicaReadType.IsFollowerRead() {
   616  		if regionCache.enableForwarding && regionStore.proxyTiKVIdx >= 0 {
   617  			state = &accessByKnownProxy{leaderIdx: regionStore.workTiKVIdx}
   618  		} else {
   619  			state = &accessKnownLeader{leaderIdx: regionStore.workTiKVIdx}
   620  		}
   621  	} else {
   622  		option := storeSelectorOp{}
   623  		for _, op := range opts {
   624  			op(&option)
   625  		}
   626  		state = &accessFollower{
   627  			tryLeader:         req.ReplicaReadType == kv.ReplicaReadMixed,
   628  			isGlobalStaleRead: req.IsGlobalStaleRead(),
   629  			option:            option,
   630  			leaderIdx:         regionStore.workTiKVIdx,
   631  			lastIdx:           -1,
   632  		}
   633  	}
   634  
   635  	return &replicaSelector{
   636  		regionCache,
   637  		cachedRegion,
   638  		regionStore,
   639  		replicas,
   640  		state,
   641  		-1,
   642  		-1,
   643  	}, nil
   644  }
   645  
   646  const maxReplicaAttempt = 10
   647  
   648  // next creates the RPCContext of the current candidate replica.
    649  // It returns a SendError if it runs out of all replicas or the cached region is invalidated.
   650  func (s *replicaSelector) next(bo *retry.Backoffer) (rpcCtx *RPCContext, err error) {
   651  	if !s.region.isValid() {
   652  		metrics.TiKVReplicaSelectorFailureCounter.WithLabelValues("invalid").Inc()
   653  		return nil, nil
   654  	}
   655  
   656  	s.targetIdx = -1
   657  	s.proxyIdx = -1
   658  	s.refreshRegionStore()
   659  	for {
   660  		rpcCtx, err = s.state.next(bo, s)
   661  		if _, isStateChanged := err.(stateChanged); !isStateChanged {
   662  			return
   663  		}
   664  	}
   665  }
   666  
   667  func (s *replicaSelector) targetReplica() *replica {
   668  	if s.targetIdx >= 0 && int(s.targetIdx) < len(s.replicas) {
   669  		return s.replicas[s.targetIdx]
   670  	}
   671  	return nil
   672  }
   673  
   674  func (s *replicaSelector) proxyReplica() *replica {
   675  	if s.proxyIdx >= 0 && int(s.proxyIdx) < len(s.replicas) {
   676  		return s.replicas[s.proxyIdx]
   677  	}
   678  	return nil
   679  }
   680  
   681  func (s *replicaSelector) refreshRegionStore() {
   682  	oldRegionStore := s.regionStore
   683  	newRegionStore := s.region.getStore()
   684  	if oldRegionStore == newRegionStore {
   685  		return
   686  	}
   687  	s.regionStore = newRegionStore
   688  
    689  	// In the current implementation, if the stores change, their addresses must change,
    690  	// so we just compare the addresses here.
    691  	// When the stores change, we mark this replicaSelector as invalid to let the caller
    692  	// recreate a new replicaSelector.
   693  	if &oldRegionStore.stores != &newRegionStore.stores {
   694  		s.state = &invalidStore{}
   695  		return
   696  	}
   697  
    698  	// If the leader has changed, it means a recent request succeeded in an RPC
    699  	// to the new leader.
   700  	if oldRegionStore.workTiKVIdx != newRegionStore.workTiKVIdx {
   701  		switch state := s.state.(type) {
   702  		case *accessFollower:
   703  			state.leaderIdx = newRegionStore.workTiKVIdx
   704  		default:
    705  			// Try the new leader and give it an additional chance if the
    706  			// request is sent to the leader.
   707  			newLeaderIdx := newRegionStore.workTiKVIdx
   708  			s.state = &accessKnownLeader{leaderIdx: newLeaderIdx}
   709  			if s.replicas[newLeaderIdx].attempts == maxReplicaAttempt {
   710  				s.replicas[newLeaderIdx].attempts--
   711  			}
   712  		}
   713  	}
   714  }
   715  
   716  func (s *replicaSelector) buildRPCContext(bo *retry.Backoffer) (*RPCContext, error) {
   717  	targetReplica, proxyReplica := s.targetReplica(), s.proxyReplica()
   718  
   719  	// Backoff and retry if no replica is selected or the selected replica is stale
   720  	if targetReplica == nil || targetReplica.isEpochStale() ||
   721  		(proxyReplica != nil && proxyReplica.isEpochStale()) {
   722  		// TODO(youjiali1995): Is it necessary to invalidate the region?
   723  		metrics.TiKVReplicaSelectorFailureCounter.WithLabelValues("stale_store").Inc()
   724  		s.invalidateRegion()
   725  		return nil, nil
   726  	}
   727  
   728  	rpcCtx := &RPCContext{
   729  		Region:     s.region.VerID(),
   730  		Meta:       s.region.meta,
   731  		Peer:       targetReplica.peer,
   732  		Store:      targetReplica.store,
   733  		AccessMode: tiKVOnly,
   734  		TiKVNum:    len(s.replicas),
   735  	}
   736  
   737  	// Set leader addr
   738  	addr, err := s.regionCache.getStoreAddr(bo, s.region, targetReplica.store)
   739  	if err != nil {
   740  		return nil, err
   741  	}
   742  	if len(addr) == 0 {
   743  		return nil, nil
   744  	}
   745  	rpcCtx.Addr = addr
   746  	targetReplica.attempts++
   747  
   748  	// Set proxy addr
   749  	if proxyReplica != nil {
   750  		addr, err = s.regionCache.getStoreAddr(bo, s.region, proxyReplica.store)
   751  		if err != nil {
   752  			return nil, err
   753  		}
   754  		if len(addr) == 0 {
   755  			return nil, nil
   756  		}
   757  		rpcCtx.ProxyStore = proxyReplica.store
   758  		rpcCtx.ProxyAddr = addr
   759  		proxyReplica.attempts++
   760  	}
   761  
   762  	return rpcCtx, nil
   763  }
   764  
   765  func (s *replicaSelector) onSendFailure(bo *retry.Backoffer, err error) {
   766  	metrics.RegionCacheCounterWithSendFail.Inc()
   767  	s.state.onSendFailure(bo, s, err)
   768  }
   769  
   770  func (s *replicaSelector) checkLiveness(bo *retry.Backoffer, accessReplica *replica) livenessState {
   771  	store := accessReplica.store
   772  	liveness := store.requestLiveness(bo, s.regionCache)
    773  	// We only check health in a loop if forwarding is enabled now.
    774  	// The restriction might be relaxed if necessary, but the implementation
    775  	// should be reviewed carefully again.
   776  	if liveness != reachable && s.regionCache.enableForwarding {
   777  		store.startHealthCheckLoopIfNeeded(s.regionCache)
   778  	}
   779  	return liveness
   780  }
   781  
   782  func (s *replicaSelector) invalidateReplicaStore(replica *replica, cause error) {
   783  	store := replica.store
   784  	if atomic.CompareAndSwapUint32(&store.epoch, replica.epoch, replica.epoch+1) {
   785  		logutil.BgLogger().Info("mark store's regions need be refill", zap.Uint64("id", store.storeID), zap.String("addr", store.addr), zap.Error(cause))
   786  		metrics.RegionCacheCounterWithInvalidateStoreRegionsOK.Inc()
   787  		// schedule a store addr resolve.
   788  		store.markNeedCheck(s.regionCache.notifyCheckCh)
   789  	}
   790  }
   791  
   792  func (s *replicaSelector) onSendSuccess() {
   793  	s.state.onSendSuccess(s)
   794  }
   795  
   796  func (s *replicaSelector) onNotLeader(bo *retry.Backoffer, ctx *RPCContext, notLeader *errorpb.NotLeader) (shouldRetry bool, err error) {
   797  	leader := notLeader.GetLeader()
   798  	if leader == nil {
    799  		// The region may be in the middle of a leader transfer.
   800  		s.state.onNoLeader(s)
   801  		if err = bo.Backoff(retry.BoRegionScheduling, errors.Errorf("no leader, ctx: %v", ctx)); err != nil {
   802  			return false, errors.Trace(err)
   803  		}
   804  	} else {
   805  		s.updateLeader(notLeader.GetLeader())
   806  	}
   807  	return true, nil
   808  }
   809  
   810  // updateLeader updates the leader of the cached region.
   811  // If the leader peer isn't found in the region, the region will be invalidated.
   812  func (s *replicaSelector) updateLeader(leader *metapb.Peer) {
   813  	if leader == nil {
   814  		return
   815  	}
   816  	for i, replica := range s.replicas {
   817  		if isSamePeer(replica.peer, leader) {
   818  			if replica.isExhausted(maxReplicaAttempt) {
   819  				// Give the replica one more chance and because each follower is tried only once,
   820  				// it won't result in infinite retry.
   821  				replica.attempts = maxReplicaAttempt - 1
   822  			}
   823  			s.state = &accessKnownLeader{leaderIdx: AccessIndex(i)}
   824  			// Update the workTiKVIdx so that following requests can be sent to the leader immediately.
   825  			if !s.regionCache.switchWorkLeaderToPeer(s.region, leader) {
   826  				panic("the store must exist")
   827  			}
   828  			logutil.BgLogger().Debug("switch region leader to specific leader due to kv return NotLeader",
   829  				zap.Uint64("regionID", s.region.GetID()),
   830  				zap.Uint64("leaderStoreID", leader.GetStoreId()))
   831  			return
   832  		}
   833  	}
   834  	// Invalidate the region since the new leader is not in the cached version.
   835  	s.region.invalidate(StoreNotFound)
   836  }
   837  
   838  func (s *replicaSelector) invalidateRegion() {
   839  	if s.region != nil {
   840  		s.region.invalidate(Other)
   841  	}
   842  }
   843  
   844  func (s *RegionRequestSender) getRPCContext(
   845  	bo *retry.Backoffer,
   846  	req *tikvrpc.Request,
   847  	regionID RegionVerID,
   848  	et tikvrpc.EndpointType,
   849  	opts ...StoreSelectorOption,
   850  ) (*RPCContext, error) {
   851  	switch et {
   852  	case tikvrpc.TiKV:
   853  		if s.replicaSelector == nil {
   854  			selector, err := newReplicaSelector(s.regionCache, regionID, req, opts...)
   855  			if selector == nil || err != nil {
   856  				return nil, err
   857  			}
   858  			s.replicaSelector = selector
   859  		}
   860  		return s.replicaSelector.next(bo)
   861  	case tikvrpc.TiFlash:
   862  		return s.regionCache.GetTiFlashRPCContext(bo, regionID, true)
   863  	case tikvrpc.TiDB:
   864  		return &RPCContext{Addr: s.storeAddr}, nil
   865  	default:
   866  		return nil, errors.Errorf("unsupported storage type: %v", et)
   867  	}
   868  }
   869  
   870  func (s *RegionRequestSender) reset() {
   871  	s.replicaSelector = nil
   872  	s.failStoreIDs = nil
   873  	s.failProxyStoreIDs = nil
   874  }
   875  
    876  // IsFakeRegionError returns true if err is a fake region error.
   877  func IsFakeRegionError(err *errorpb.Error) bool {
   878  	return err != nil && err.GetEpochNotMatch() != nil && len(err.GetEpochNotMatch().CurrentRegions) == 0
   879  }
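
         // Example (illustrative sketch, not part of the original source): after a send,
         // a caller can use IsFakeRegionError to tell "all replicas exhausted, re-locate
         // and retry" apart from a concrete region error returned by TiKV.
         //
         //	regionErr, err := resp.GetRegionError()
         //	if err == nil && regionErr != nil {
         //		if IsFakeRegionError(regionErr) {
         //			// The cached region was invalidated; re-locate the key and retry.
         //		} else {
         //			// Handle the concrete error, e.g. re-split the request on EpochNotMatch.
         //		}
         //	}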
   880  
    881  // SendReqCtx sends a request to the tikv server and returns the response and the RPCContext of this RPC.
   882  func (s *RegionRequestSender) SendReqCtx(
   883  	bo *retry.Backoffer,
   884  	req *tikvrpc.Request,
   885  	regionID RegionVerID,
   886  	timeout time.Duration,
   887  	et tikvrpc.EndpointType,
   888  	opts ...StoreSelectorOption,
   889  ) (
   890  	resp *tikvrpc.Response,
   891  	rpcCtx *RPCContext,
   892  	err error,
   893  ) {
   894  	if span := opentracing.SpanFromContext(bo.GetCtx()); span != nil && span.Tracer() != nil {
   895  		span1 := span.Tracer().StartSpan("regionRequest.SendReqCtx", opentracing.ChildOf(span.Context()))
   896  		defer span1.Finish()
   897  		bo.SetCtx(opentracing.ContextWithSpan(bo.GetCtx(), span1))
   898  	}
   899  
   900  	if val, err := util.EvalFailpoint("tikvStoreSendReqResult"); err == nil {
   901  		switch val.(string) {
   902  		case "timeout":
   903  			return nil, nil, errors.New("timeout")
   904  		case "GCNotLeader":
   905  			if req.Type == tikvrpc.CmdGC {
   906  				return &tikvrpc.Response{
   907  					Resp: &kvrpcpb.GCResponse{RegionError: &errorpb.Error{NotLeader: &errorpb.NotLeader{}}},
   908  				}, nil, nil
   909  			}
   910  		case "GCServerIsBusy":
   911  			if req.Type == tikvrpc.CmdGC {
   912  				return &tikvrpc.Response{
   913  					Resp: &kvrpcpb.GCResponse{RegionError: &errorpb.Error{ServerIsBusy: &errorpb.ServerIsBusy{}}},
   914  				}, nil, nil
   915  			}
   916  		case "busy":
   917  			return &tikvrpc.Response{
   918  				Resp: &kvrpcpb.GCResponse{RegionError: &errorpb.Error{ServerIsBusy: &errorpb.ServerIsBusy{}}},
   919  			}, nil, nil
   920  		case "requestTiDBStoreError":
   921  			if et == tikvrpc.TiDB {
   922  				return nil, nil, tikverr.ErrTiKVServerTimeout
   923  			}
   924  		case "requestTiFlashError":
   925  			if et == tikvrpc.TiFlash {
   926  				return nil, nil, tikverr.ErrTiFlashServerTimeout
   927  			}
   928  		}
   929  	}
   930  
   931  	// If the MaxExecutionDurationMs is not set yet, we set it to be the RPC timeout duration
    932  	// so TiKV can give up requests whose responses TiDB cannot receive due to the timeout.
   933  	if req.Context.MaxExecutionDurationMs == 0 {
   934  		req.Context.MaxExecutionDurationMs = uint64(timeout.Milliseconds())
   935  	}
   936  
   937  	s.reset()
   938  	tryTimes := 0
   939  	defer func() {
   940  		if tryTimes > 0 {
   941  			metrics.TiKVRequestRetryTimesHistogram.Observe(float64(tryTimes))
   942  		}
   943  	}()
   944  	for {
   945  		if tryTimes > 0 {
   946  			req.IsRetryRequest = true
   947  			if tryTimes%100 == 0 {
   948  				logutil.Logger(bo.GetCtx()).Warn("retry", zap.Uint64("region", regionID.GetID()), zap.Int("times", tryTimes))
   949  			}
   950  		}
   951  
   952  		rpcCtx, err = s.getRPCContext(bo, req, regionID, et, opts...)
   953  		if err != nil {
   954  			return nil, nil, err
   955  		}
   956  
   957  		if _, err := util.EvalFailpoint("invalidCacheAndRetry"); err == nil {
   958  			// cooperate with tikvclient/setGcResolveMaxBackoff
   959  			if c := bo.GetCtx().Value("injectedBackoff"); c != nil {
   960  				resp, err = tikvrpc.GenRegionErrorResp(req, &errorpb.Error{EpochNotMatch: &errorpb.EpochNotMatch{}})
   961  				return resp, nil, err
   962  			}
   963  		}
   964  		if rpcCtx == nil {
   965  			// TODO(youjiali1995): remove it when using the replica selector for all requests.
   966  			// If the region is not found in cache, it must be out
   967  			// of date and already be cleaned up. We can skip the
   968  			// RPC by returning RegionError directly.
   969  
   970  			// TODO: Change the returned error to something like "region missing in cache",
   971  			// and handle this error like EpochNotMatch, which means to re-split the request and retry.
   972  			logutil.Logger(bo.GetCtx()).Debug("throwing pseudo region error due to region not found in cache", zap.Stringer("region", &regionID))
   973  			resp, err = tikvrpc.GenRegionErrorResp(req, &errorpb.Error{EpochNotMatch: &errorpb.EpochNotMatch{}})
   974  			return resp, nil, err
   975  		}
   976  
   977  		logutil.Eventf(bo.GetCtx(), "send %s request to region %d at %s", req.Type, regionID.id, rpcCtx.Addr)
   978  		s.storeAddr = rpcCtx.Addr
   979  		var retry bool
   980  		resp, retry, err = s.sendReqToRegion(bo, rpcCtx, req, timeout)
   981  		if err != nil {
   982  			return nil, nil, errors.Trace(err)
   983  		}
   984  
   985  		// recheck whether the session/query is killed during the Next()
   986  		boVars := bo.GetVars()
   987  		if boVars != nil && boVars.Killed != nil && atomic.LoadUint32(boVars.Killed) == 1 {
   988  			return nil, nil, tikverr.ErrQueryInterrupted
   989  		}
   990  		if val, err := util.EvalFailpoint("mockRetrySendReqToRegion"); err == nil {
   991  			if val.(bool) {
   992  				retry = true
   993  			}
   994  		}
   995  		if retry {
   996  			tryTimes++
   997  			continue
   998  		}
   999  
  1000  		var regionErr *errorpb.Error
  1001  		regionErr, err = resp.GetRegionError()
  1002  		if err != nil {
  1003  			return nil, nil, errors.Trace(err)
  1004  		}
  1005  		if regionErr != nil {
  1006  			retry, err = s.onRegionError(bo, rpcCtx, req, regionErr)
  1007  			if err != nil {
  1008  				return nil, nil, errors.Trace(err)
  1009  			}
  1010  			if retry {
  1011  				tryTimes++
  1012  				continue
  1013  			}
  1014  		} else {
  1015  			if s.replicaSelector != nil {
  1016  				s.replicaSelector.onSendSuccess()
  1017  			}
  1018  		}
  1019  		return resp, rpcCtx, nil
  1020  	}
  1021  }
  1022  
   1023  // RPCCancellerCtxKey is the context key used to attach the RPC send cancelFunc collector to a ctx.
  1024  type RPCCancellerCtxKey struct{}
  1025  
   1026  // RPCCanceller is the RPC send cancelFunc collector.
  1027  type RPCCanceller struct {
  1028  	sync.Mutex
  1029  	allocID   int
  1030  	cancels   map[int]func()
  1031  	cancelled bool
  1032  }
  1033  
   1034  // NewRPCanceller creates an RPCCanceller in its initial state.
  1035  func NewRPCanceller() *RPCCanceller {
  1036  	return &RPCCanceller{cancels: make(map[int]func())}
  1037  }
  1038  
   1039  // WithCancel generates a new context with a cancel func.
  1040  func (h *RPCCanceller) WithCancel(ctx context.Context) (context.Context, func()) {
  1041  	nctx, cancel := context.WithCancel(ctx)
  1042  	h.Lock()
  1043  	if h.cancelled {
  1044  		h.Unlock()
  1045  		cancel()
  1046  		return nctx, func() {}
  1047  	}
  1048  	id := h.allocID
  1049  	h.allocID++
  1050  	h.cancels[id] = cancel
  1051  	h.Unlock()
  1052  	return nctx, func() {
  1053  		cancel()
  1054  		h.Lock()
  1055  		delete(h.cancels, id)
  1056  		h.Unlock()
  1057  	}
  1058  }
  1059  
   1060  // CancelAll cancels all in-flight RPC contexts.
  1061  func (h *RPCCanceller) CancelAll() {
  1062  	h.Lock()
  1063  	for _, c := range h.cancels {
  1064  		c()
  1065  	}
  1066  	h.cancelled = true
  1067  	h.Unlock()
  1068  }
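
         // Example (illustrative sketch, not part of the original source): attaching an
         // RPCCanceller to the context so that in-flight region requests can be aborted
         // as a group; sendReqToRegion picks it up via RPCCancellerCtxKey.
         //
         //	canceller := NewRPCanceller()
         //	ctx := context.WithValue(context.Background(), RPCCancellerCtxKey{}, canceller)
         //	bo := retry.NewBackoffer(ctx, 20000)
         //	// ... issue requests with senders using bo ...
         //	canceller.CancelAll() // cancel everything still in flight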
  1069  
  1070  func (s *RegionRequestSender) sendReqToRegion(bo *retry.Backoffer, rpcCtx *RPCContext, req *tikvrpc.Request, timeout time.Duration) (resp *tikvrpc.Response, retry bool, err error) {
  1071  	if e := tikvrpc.SetContext(req, rpcCtx.Meta, rpcCtx.Peer); e != nil {
  1072  		return nil, false, errors.Trace(e)
  1073  	}
   1074  	// Check whether the store limit switch is enabled.
  1075  	if limit := kv.StoreLimit.Load(); limit > 0 {
  1076  		if err := s.getStoreToken(rpcCtx.Store, limit); err != nil {
  1077  			return nil, false, err
  1078  		}
  1079  		defer s.releaseStoreToken(rpcCtx.Store)
  1080  	}
  1081  
  1082  	ctx := bo.GetCtx()
  1083  	if rawHook := ctx.Value(RPCCancellerCtxKey{}); rawHook != nil {
  1084  		var cancel context.CancelFunc
  1085  		ctx, cancel = rawHook.(*RPCCanceller).WithCancel(ctx)
  1086  		defer cancel()
  1087  	}
  1088  
  1089  	// sendToAddr is the first target address that will receive the request. If proxy is used, sendToAddr will point to
  1090  	// the proxy that will forward the request to the final target.
  1091  	sendToAddr := rpcCtx.Addr
  1092  	if rpcCtx.ProxyStore == nil {
  1093  		req.ForwardedHost = ""
  1094  	} else {
  1095  		req.ForwardedHost = rpcCtx.Addr
  1096  		sendToAddr = rpcCtx.ProxyAddr
  1097  	}
  1098  
  1099  	var sessionID uint64
  1100  	if v := bo.GetCtx().Value(util.SessionID); v != nil {
  1101  		sessionID = v.(uint64)
  1102  	}
  1103  
  1104  	injectFailOnSend := false
  1105  	if val, e := util.EvalFailpoint("rpcFailOnSend"); e == nil {
  1106  		inject := true
  1107  		// Optional filters
  1108  		if s, ok := val.(string); ok {
  1109  			if s == "greengc" && !req.IsGreenGCRequest() {
  1110  				inject = false
  1111  			} else if s == "write" && !req.IsTxnWriteRequest() {
  1112  				inject = false
  1113  			}
  1114  		} else if sessionID == 0 {
  1115  			inject = false
  1116  		}
  1117  
  1118  		if inject {
  1119  			logutil.Logger(ctx).Info("[failpoint] injected RPC error on send", zap.Stringer("type", req.Type),
  1120  				zap.Stringer("req", req.Req.(fmt.Stringer)), zap.Stringer("ctx", &req.Context))
  1121  			injectFailOnSend = true
  1122  			err = errors.New("injected RPC error on send")
  1123  		}
  1124  	}
  1125  
  1126  	if !injectFailOnSend {
  1127  		start := time.Now()
  1128  		resp, err = s.client.SendRequest(ctx, sendToAddr, req, timeout)
  1129  		if s.Stats != nil {
  1130  			RecordRegionRequestRuntimeStats(s.Stats, req.Type, time.Since(start))
  1131  			if val, err := util.EvalFailpoint("tikvStoreRespResult"); err == nil {
  1132  				if val.(bool) {
  1133  					if req.Type == tikvrpc.CmdCop && bo.GetTotalSleep() == 0 {
  1134  						return &tikvrpc.Response{
  1135  							Resp: &coprocessor.Response{RegionError: &errorpb.Error{EpochNotMatch: &errorpb.EpochNotMatch{}}},
  1136  						}, false, nil
  1137  					}
  1138  				}
  1139  			}
  1140  		}
  1141  
  1142  		if val, e := util.EvalFailpoint("rpcFailOnRecv"); e == nil {
  1143  			inject := true
  1144  			// Optional filters
  1145  			if s, ok := val.(string); ok {
  1146  				if s == "greengc" && !req.IsGreenGCRequest() {
  1147  					inject = false
  1148  				} else if s == "write" && !req.IsTxnWriteRequest() {
  1149  					inject = false
  1150  				}
  1151  			} else if sessionID == 0 {
  1152  				inject = false
  1153  			}
  1154  
  1155  			if inject {
  1156  				logutil.Logger(ctx).Info("[failpoint] injected RPC error on recv", zap.Stringer("type", req.Type),
  1157  					zap.Stringer("req", req.Req.(fmt.Stringer)), zap.Stringer("ctx", &req.Context))
  1158  				err = errors.New("injected RPC error on recv")
  1159  				resp = nil
  1160  			}
  1161  		}
  1162  
  1163  		if val, e := util.EvalFailpoint("rpcContextCancelErr"); e == nil {
  1164  			if val.(bool) {
  1165  				ctx1, cancel := context.WithCancel(context.Background())
  1166  				cancel()
  1167  				<-ctx1.Done()
  1168  				ctx = ctx1
  1169  				err = ctx.Err()
  1170  				resp = nil
  1171  			}
  1172  		}
  1173  	}
  1174  
  1175  	if rpcCtx.ProxyStore != nil {
  1176  		fromStore := strconv.FormatUint(rpcCtx.ProxyStore.storeID, 10)
  1177  		toStore := strconv.FormatUint(rpcCtx.Store.storeID, 10)
  1178  		result := "ok"
  1179  		if err != nil {
  1180  			result = "fail"
  1181  		}
  1182  		metrics.TiKVForwardRequestCounter.WithLabelValues(fromStore, toStore, req.Type.String(), result).Inc()
  1183  	}
  1184  
  1185  	if err != nil {
  1186  		s.rpcError = err
  1187  
   1188  		// In the RPC logic, context.Cancel() is translated into a rpcContext.Cancel error, in which case
   1189  		// we need to retry the request. But if the context is cancelled actively, for example, when limitExec has got the required rows,
   1190  		// we shouldn't retry the request, otherwise it will go to backoff and hang in the retry logic.
  1191  		if ctx.Err() != nil && errors.Cause(ctx.Err()) == context.Canceled {
  1192  			return nil, false, errors.Trace(ctx.Err())
  1193  		}
  1194  
  1195  		if val, e := util.EvalFailpoint("noRetryOnRpcError"); e == nil {
  1196  			if val.(bool) {
  1197  				return nil, false, err
  1198  			}
  1199  		}
  1200  		if e := s.onSendFail(bo, rpcCtx, err); e != nil {
  1201  			return nil, false, errors.Trace(e)
  1202  		}
  1203  		return nil, true, nil
  1204  	}
  1205  	return
  1206  }
  1207  
  1208  func (s *RegionRequestSender) getStoreToken(st *Store, limit int64) error {
   1209  	// Checking the limit is not thread safe; we prefer this to avoid loading it in a loop.
  1210  	count := st.tokenCount.Load()
  1211  	if count < limit {
   1212  		// Adding to tokenCount is not thread safe; we prefer this to avoid checking it in a loop.
  1213  		st.tokenCount.Add(1)
  1214  		return nil
  1215  	}
  1216  	metrics.TiKVStoreLimitErrorCounter.WithLabelValues(st.addr, strconv.FormatUint(st.storeID, 10)).Inc()
  1217  	return &tikverr.ErrTokenLimit{StoreID: st.storeID}
  1218  }
  1219  
  1220  func (s *RegionRequestSender) releaseStoreToken(st *Store) {
  1221  	count := st.tokenCount.Load()
   1222  	// Decreasing tokenCount is not thread safe; we prefer this to avoid checking it in a loop.
  1223  	if count > 0 {
  1224  		st.tokenCount.Sub(1)
  1225  		return
  1226  	}
  1227  	logutil.BgLogger().Warn("release store token failed, count equals to 0")
  1228  }
  1229  
  1230  func (s *RegionRequestSender) onSendFail(bo *retry.Backoffer, ctx *RPCContext, err error) error {
  1231  	if span := opentracing.SpanFromContext(bo.GetCtx()); span != nil && span.Tracer() != nil {
  1232  		span1 := span.Tracer().StartSpan("regionRequest.onSendFail", opentracing.ChildOf(span.Context()))
  1233  		defer span1.Finish()
  1234  		bo.SetCtx(opentracing.ContextWithSpan(bo.GetCtx(), span1))
  1235  	}
   1236  	// If it failed because the context was cancelled by ourselves, don't retry.
  1237  	if errors.Cause(err) == context.Canceled {
  1238  		return errors.Trace(err)
  1239  	} else if LoadShuttingDown() > 0 {
  1240  		return tikverr.ErrTiDBShuttingDown
  1241  	}
  1242  	if status.Code(errors.Cause(err)) == codes.Canceled {
  1243  		select {
  1244  		case <-bo.GetCtx().Done():
  1245  			return errors.Trace(err)
  1246  		default:
  1247  			// If we don't cancel, but the error code is Canceled, it must be from grpc remote.
  1248  			// This may happen when tikv is killed and exiting.
  1249  			// Backoff and retry in this case.
  1250  			logutil.BgLogger().Warn("receive a grpc cancel signal from remote", zap.Error(err))
  1251  		}
  1252  	}
  1253  
  1254  	if ctx.Meta != nil {
  1255  		if s.replicaSelector != nil {
  1256  			s.replicaSelector.onSendFailure(bo, err)
  1257  		} else {
  1258  			s.regionCache.OnSendFail(bo, ctx, s.NeedReloadRegion(ctx), err)
  1259  		}
  1260  	}
  1261  
  1262  	// Retry on send request failure when it's not canceled.
   1263  	// When a store is not available, the leader of the related region should be elected quickly.
   1264  	// TODO: the number of retries should be limited, since the region may be unavailable
   1265  	// when some unrecoverable disaster happens.
  1266  	if ctx.Store != nil && ctx.Store.storeType == tikvrpc.TiFlash {
  1267  		err = bo.Backoff(retry.BoTiFlashRPC, errors.Errorf("send tiflash request error: %v, ctx: %v, try next peer later", err, ctx))
  1268  	} else {
  1269  		err = bo.Backoff(retry.BoTiKVRPC, errors.Errorf("send tikv request error: %v, ctx: %v, try next peer later", err, ctx))
  1270  	}
  1271  	return errors.Trace(err)
  1272  }
  1273  
   1274  // NeedReloadRegion checks whether sending has failed on all peers; if so, the region needs to be reloaded.
  1275  func (s *RegionRequestSender) NeedReloadRegion(ctx *RPCContext) (need bool) {
  1276  	if s.failStoreIDs == nil {
  1277  		s.failStoreIDs = make(map[uint64]struct{})
  1278  	}
  1279  	if s.failProxyStoreIDs == nil {
  1280  		s.failProxyStoreIDs = make(map[uint64]struct{})
  1281  	}
  1282  	s.failStoreIDs[ctx.Store.storeID] = struct{}{}
  1283  	if ctx.ProxyStore != nil {
  1284  		s.failProxyStoreIDs[ctx.ProxyStore.storeID] = struct{}{}
  1285  	}
  1286  
  1287  	if ctx.AccessMode == tiKVOnly && len(s.failStoreIDs)+len(s.failProxyStoreIDs) >= ctx.TiKVNum {
  1288  		need = true
  1289  	} else if ctx.AccessMode == tiFlashOnly && len(s.failStoreIDs) >= len(ctx.Meta.Peers)-ctx.TiKVNum {
  1290  		need = true
  1291  	} else if len(s.failStoreIDs)+len(s.failProxyStoreIDs) >= len(ctx.Meta.Peers) {
  1292  		need = true
  1293  	}
  1294  
  1295  	if need {
  1296  		s.failStoreIDs = nil
  1297  		s.failProxyStoreIDs = nil
  1298  	}
  1299  	return
  1300  }
  1301  
  1302  func regionErrorToLabel(e *errorpb.Error) string {
  1303  	if e.GetNotLeader() != nil {
  1304  		return "not_leader"
  1305  	} else if e.GetRegionNotFound() != nil {
  1306  		return "region_not_found"
  1307  	} else if e.GetKeyNotInRegion() != nil {
  1308  		return "key_not_in_region"
  1309  	} else if e.GetEpochNotMatch() != nil {
  1310  		return "epoch_not_match"
  1311  	} else if e.GetServerIsBusy() != nil {
  1312  		return "server_is_busy"
  1313  	} else if e.GetStaleCommand() != nil {
  1314  		return "stale_command"
  1315  	} else if e.GetStoreNotMatch() != nil {
  1316  		return "store_not_match"
  1317  	} else if e.GetRaftEntryTooLarge() != nil {
  1318  		return "raft_entry_too_large"
  1319  	} else if e.GetMaxTimestampNotSynced() != nil {
  1320  		return "max_timestamp_not_synced"
  1321  	} else if e.GetReadIndexNotReady() != nil {
  1322  		return "read_index_not_ready"
  1323  	} else if e.GetProposalInMergingMode() != nil {
  1324  		return "proposal_in_merging_mode"
  1325  	} else if e.GetDataIsNotReady() != nil {
  1326  		return "data_is_not_ready"
  1327  	} else if e.GetRegionNotInitialized() != nil {
  1328  		return "region_not_initialized"
  1329  	} else if e.GetDiskFull() != nil {
  1330  		return "disk_full"
  1331  	}
  1332  	return "unknown"
  1333  }
  1334  
  1335  func (s *RegionRequestSender) onRegionError(bo *retry.Backoffer, ctx *RPCContext, req *tikvrpc.Request, regionErr *errorpb.Error) (shouldRetry bool, err error) {
  1336  	if span := opentracing.SpanFromContext(bo.GetCtx()); span != nil && span.Tracer() != nil {
  1337  		span1 := span.Tracer().StartSpan("tikv.onRegionError", opentracing.ChildOf(span.Context()))
  1338  		defer span1.Finish()
  1339  		bo.SetCtx(opentracing.ContextWithSpan(bo.GetCtx(), span1))
  1340  	}
  1341  
   1342  	// NOTE: Please add region error handlers in the same order as errorpb.Error.
  1343  	metrics.TiKVRegionErrorCounter.WithLabelValues(regionErrorToLabel(regionErr)).Inc()
  1344  
  1345  	if notLeader := regionErr.GetNotLeader(); notLeader != nil {
  1346  		// Retry if error is `NotLeader`.
  1347  		logutil.BgLogger().Debug("tikv reports `NotLeader` retry later",
  1348  			zap.String("notLeader", notLeader.String()),
  1349  			zap.String("ctx", ctx.String()))
  1350  
  1351  		if s.replicaSelector != nil {
  1352  			return s.replicaSelector.onNotLeader(bo, ctx, notLeader)
  1353  		} else if notLeader.GetLeader() == nil {
  1354  			// The peer doesn't know who is the current leader. Generally it's because
  1355  			// the Raft group is in an election, but it's possible that the peer is
  1356  			// isolated and removed from the Raft group. So it's necessary to reload
  1357  			// the region from PD.
  1358  			s.regionCache.InvalidateCachedRegionWithReason(ctx.Region, NoLeader)
  1359  			if err = bo.Backoff(retry.BoRegionScheduling, errors.Errorf("not leader: %v, ctx: %v", notLeader, ctx)); err != nil {
  1360  				return false, errors.Trace(err)
  1361  			}
  1362  			return false, nil
  1363  		} else {
  1364  			// don't backoff if a new leader is returned.
  1365  			s.regionCache.UpdateLeader(ctx.Region, notLeader.GetLeader(), ctx.AccessIdx)
  1366  			return true, nil
  1367  		}
  1368  	}
  1369  
  1370  	// Retry it when tikv disk full happens.
  1371  	if diskFull := regionErr.GetDiskFull(); diskFull != nil {
  1372  		if err = bo.Backoff(retry.BoTiKVDiskFull, errors.Errorf("tikv disk full: %v ctx: %v", diskFull.String(), ctx.String())); err != nil {
  1373  			retry.BoTiKVDiskFull.SetErrors(errors.Errorf("tikv disk full: %v", diskFull.String()))
  1374  			return false, nil
  1375  		}
  1376  		return true, nil
  1377  	}
  1378  
  1379  	// This peer is removed from the region. Invalidate the region since it's too stale.
  1380  	if regionErr.GetRegionNotFound() != nil {
  1381  		s.regionCache.InvalidateCachedRegion(ctx.Region)
  1382  		return false, nil
  1383  	}
  1384  
  1385  	if regionErr.GetKeyNotInRegion() != nil {
  1386  		logutil.BgLogger().Debug("tikv reports `KeyNotInRegion`", zap.Stringer("ctx", ctx))
  1387  		s.regionCache.InvalidateCachedRegion(ctx.Region)
  1388  		return false, nil
  1389  	}
  1390  
  1391  	if epochNotMatch := regionErr.GetEpochNotMatch(); epochNotMatch != nil {
  1392  		logutil.BgLogger().Debug("tikv reports `EpochNotMatch` retry later",
  1393  			zap.Stringer("EpochNotMatch", epochNotMatch),
  1394  			zap.Stringer("ctx", ctx))
  1395  		retry, err := s.regionCache.OnRegionEpochNotMatch(bo, ctx, epochNotMatch.CurrentRegions)
  1396  		if !retry && s.replicaSelector != nil {
  1397  			s.replicaSelector.invalidateRegion()
  1398  		}
  1399  		return retry, errors.Trace(err)
  1400  	}
  1401  
  1402  	if regionErr.GetServerIsBusy() != nil {
  1403  		logutil.BgLogger().Warn("tikv reports `ServerIsBusy` retry later",
  1404  			zap.String("reason", regionErr.GetServerIsBusy().GetReason()),
  1405  			zap.Stringer("ctx", ctx))
  1406  		if ctx != nil && ctx.Store != nil && ctx.Store.storeType == tikvrpc.TiFlash {
  1407  			err = bo.Backoff(retry.BoTiFlashServerBusy, errors.Errorf("server is busy, ctx: %v", ctx))
  1408  		} else {
  1409  			err = bo.Backoff(retry.BoTiKVServerBusy, errors.Errorf("server is busy, ctx: %v", ctx))
  1410  		}
  1411  		if err != nil {
  1412  			return false, errors.Trace(err)
  1413  		}
  1414  		return true, nil
  1415  	}
  1416  
   1417  	// StaleCommand error indicates the request was sent to the old leader and its term has changed.
  1418  	// We can't know whether the request is committed or not, so it's an undetermined error too,
  1419  	// but we don't handle it now.
  1420  	if regionErr.GetStaleCommand() != nil {
  1421  		logutil.BgLogger().Debug("tikv reports `StaleCommand`", zap.Stringer("ctx", ctx))
  1422  		if s.replicaSelector != nil {
  1423  			// Needn't backoff because the new leader should be elected soon
  1424  			// and the replicaSelector will try the next peer.
  1425  		} else {
  1426  			err = bo.Backoff(retry.BoStaleCmd, errors.Errorf("stale command, ctx: %v", ctx))
  1427  			if err != nil {
  1428  				return false, errors.Trace(err)
  1429  			}
  1430  		}
  1431  		return true, nil
  1432  	}
  1433  
  1434  	if storeNotMatch := regionErr.GetStoreNotMatch(); storeNotMatch != nil {
  1435  		// store not match
  1436  		logutil.BgLogger().Debug("tikv reports `StoreNotMatch` retry later",
  1437  			zap.Stringer("storeNotMatch", storeNotMatch),
  1438  			zap.Stringer("ctx", ctx))
  1439  		ctx.Store.markNeedCheck(s.regionCache.notifyCheckCh)
  1440  		s.regionCache.InvalidateCachedRegion(ctx.Region)
  1441  		return false, nil
  1442  	}
  1443  
  1444  	if regionErr.GetRaftEntryTooLarge() != nil {
  1445  		logutil.BgLogger().Warn("tikv reports `RaftEntryTooLarge`", zap.Stringer("ctx", ctx))
  1446  		return false, errors.New(regionErr.String())
  1447  	}
  1448  
  1449  	if regionErr.GetMaxTimestampNotSynced() != nil {
  1450  		logutil.BgLogger().Debug("tikv reports `MaxTimestampNotSynced`", zap.Stringer("ctx", ctx))
  1451  		err = bo.Backoff(retry.BoMaxTsNotSynced, errors.Errorf("max timestamp not synced, ctx: %v", ctx))
  1452  		if err != nil {
  1453  			return false, errors.Trace(err)
  1454  		}
  1455  		return true, nil
  1456  	}
  1457  
   1458  	// A read request may be sent to a peer which has not been initialized yet; we should retry in this case.
  1459  	if regionErr.GetRegionNotInitialized() != nil {
  1460  		logutil.BgLogger().Debug("tikv reports `RegionNotInitialized` retry later",
  1461  			zap.Uint64("store-id", ctx.Store.storeID),
  1462  			zap.Uint64("region-id", regionErr.GetRegionNotInitialized().GetRegionId()),
  1463  			zap.Stringer("ctx", ctx))
  1464  		err = bo.Backoff(retry.BoMaxRegionNotInitialized, errors.Errorf("region not initialized"))
  1465  		if err != nil {
  1466  			return false, errors.Trace(err)
  1467  		}
  1468  		return true, nil
  1469  	}
  1470  
  1471  	// The read-index can't be handled timely because the region is splitting or merging.
  1472  	if regionErr.GetReadIndexNotReady() != nil {
  1473  		logutil.BgLogger().Debug("tikv reports `ReadIndexNotReady` retry later",
  1474  			zap.Uint64("store-id", ctx.Store.storeID),
   1475  			zap.Uint64("region-id", regionErr.GetReadIndexNotReady().GetRegionId()),
  1476  			zap.Stringer("ctx", ctx))
  1477  		// The region can't provide service until split or merge finished, so backoff.
  1478  		err = bo.Backoff(retry.BoRegionScheduling, errors.Errorf("read index not ready, ctx: %v", ctx))
  1479  		if err != nil {
  1480  			return false, errors.Trace(err)
  1481  		}
  1482  		return true, nil
  1483  	}
  1484  
  1485  	if regionErr.GetProposalInMergingMode() != nil {
  1486  		logutil.BgLogger().Debug("tikv reports `ProposalInMergingMode`", zap.Stringer("ctx", ctx))
  1487  		// The region is merging and it can't provide service until merge finished, so backoff.
  1488  		err = bo.Backoff(retry.BoRegionScheduling, errors.Errorf("region is merging, ctx: %v", ctx))
  1489  		if err != nil {
  1490  			return false, errors.Trace(err)
  1491  		}
  1492  		return true, nil
  1493  	}
  1494  
   1495  	// A stale read request may be sent to a peer whose data is not ready yet; we should retry in this case.
  1496  	// This error is specific to stale read and the target replica is randomly selected. If the request is sent
  1497  	// to the leader, the data must be ready, so we don't backoff here.
  1498  	if regionErr.GetDataIsNotReady() != nil {
  1499  		logutil.BgLogger().Warn("tikv reports `DataIsNotReady` retry later",
  1500  			zap.Uint64("store-id", ctx.Store.storeID),
  1501  			zap.Uint64("peer-id", regionErr.GetDataIsNotReady().GetPeerId()),
  1502  			zap.Uint64("region-id", regionErr.GetDataIsNotReady().GetRegionId()),
  1503  			zap.Uint64("safe-ts", regionErr.GetDataIsNotReady().GetSafeTs()),
  1504  			zap.Stringer("ctx", ctx))
  1505  		err = bo.Backoff(retry.BoMaxDataNotReady, errors.Errorf("data is not ready"))
  1506  		if err != nil {
  1507  			return false, errors.Trace(err)
  1508  		}
  1509  		return true, nil
  1510  	}
  1511  
  1512  	logutil.BgLogger().Debug("tikv reports region failed",
  1513  		zap.Stringer("regionErr", regionErr),
  1514  		zap.Stringer("ctx", ctx))
  1515  
  1516  	if s.replicaSelector != nil {
  1517  		// Try the next replica.
  1518  		return true, nil
  1519  	}
  1520  
   1521  	// When the request is sent to TiDB, there is no region in the request, so the region ID will be 0.
   1522  	// When the region ID is 0, the region cache is not involved.
  1523  	if ctx.Region.id != 0 {
  1524  		s.regionCache.InvalidateCachedRegion(ctx.Region)
  1525  	}
  1526  	// For other errors, we only drop cache here.
  1527  	// Because caller may need to re-split the request.
  1528  	return false, nil
  1529  }