github.com/pingcap/br@v5.3.0-alpha.0.20220125034240-ec59c7b6ce30+incompatible/pkg/conn/conn.go

// Copyright 2020 PingCAP, Inc. Licensed under Apache-2.0.

package conn

import (
	"context"
	"crypto/tls"
	"os"
	"sync"
	"time"

	"github.com/opentracing/opentracing-go"
	"github.com/pingcap/errors"
	"github.com/pingcap/failpoint"
	"github.com/pingcap/log"
	"github.com/pingcap/tidb/domain"
	"github.com/pingcap/tidb/kv"
	"github.com/tikv/client-go/v2/tikv"
	"github.com/tikv/client-go/v2/txnkv/txnlock"
	pd "github.com/tikv/pd/client"
	"go.uber.org/zap"
	"google.golang.org/grpc"
	"google.golang.org/grpc/backoff"
	"google.golang.org/grpc/credentials"
	"google.golang.org/grpc/keepalive"

	backuppb "github.com/pingcap/kvproto/pkg/backup"
	"github.com/pingcap/kvproto/pkg/metapb"

	berrors "github.com/pingcap/br/pkg/errors"
	"github.com/pingcap/br/pkg/glue"
	"github.com/pingcap/br/pkg/logutil"
	"github.com/pingcap/br/pkg/pdutil"
	"github.com/pingcap/br/pkg/version"
)

const (
	dialTimeout = 30 * time.Second

	resetRetryTimes = 3
)

// Pool is a lazy pool of gRPC channels.
// When `Get` is called, it lazily allocates a new connection if the pool is not full.
// Once the pool is full, it returns the allocated channels in round-robin order.
type Pool struct {
	mu sync.Mutex

	conns   []*grpc.ClientConn
	next    int
	cap     int
	newConn func(ctx context.Context) (*grpc.ClientConn, error)
}

func (p *Pool) takeConns() (conns []*grpc.ClientConn) {
	p.mu.Lock()
	defer p.mu.Unlock()
	p.conns, conns = nil, p.conns
	p.next = 0
	return conns
}

// Close closes the conn pool.
func (p *Pool) Close() {
	for _, c := range p.takeConns() {
		if err := c.Close(); err != nil {
			log.Warn("failed to close clientConn", zap.String("target", c.Target()), zap.Error(err))
		}
	}
}

// Get tries to get an existing connection from the pool, or makes a new one if the pool is not full.
func (p *Pool) Get(ctx context.Context) (*grpc.ClientConn, error) {
	p.mu.Lock()
	defer p.mu.Unlock()
	if len(p.conns) < p.cap {
		c, err := p.newConn(ctx)
		if err != nil {
			return nil, err
		}
		p.conns = append(p.conns, c)
		return c, nil
	}

	conn := p.conns[p.next]
	p.next = (p.next + 1) % p.cap
	return conn, nil
}

// NewConnPool creates a new Pool with the specified capacity and conn factory function.
func NewConnPool(cap int, newConn func(ctx context.Context) (*grpc.ClientConn, error)) *Pool {
	return &Pool{
		cap:     cap,
		conns:   make([]*grpc.ClientConn, 0, cap),
		newConn: newConn,

		mu: sync.Mutex{},
	}
}
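
// Editorial usage sketch (not part of the original file): one assumed way a
// caller could drive Pool. The pool size and store address below are
// hypothetical; real callers supply their own dialer, credentials, and options.
func examplePoolUsage(ctx context.Context) error {
	pool := NewConnPool(4, func(ctx context.Context) (*grpc.ClientConn, error) {
		// Dial a hypothetical store address without TLS.
		return grpc.DialContext(ctx, "127.0.0.1:20160", grpc.WithInsecure())
	})
	defer pool.Close()

	// The first four Get calls allocate new connections; later calls reuse
	// them in round-robin order.
	conn, err := pool.Get(ctx)
	if err != nil {
		return err
	}
	_ = conn // build a gRPC client from conn here
	return nil
}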

// Mgr manages connections to a TiDB cluster.
type Mgr struct {
	*pdutil.PdController
	tlsConf   *tls.Config
	dom       *domain.Domain
	storage   kv.Storage   // Used to access SQL related interfaces.
	tikvStore tikv.Storage // Used to access TiKV specific interfaces.
	grpcClis  struct {
		mu   sync.Mutex
		clis map[uint64]*grpc.ClientConn
	}
	keepalive   keepalive.ClientParameters
	ownsStorage bool
}

// StoreBehavior is the action to take in GetAllTiKVStores when a non-TiKV
// store (e.g. a TiFlash store) is found.
type StoreBehavior uint8

const (
	// ErrorOnTiFlash causes GetAllTiKVStores to return an error when a store is
	// found to be a TiFlash node.
	ErrorOnTiFlash StoreBehavior = 0
	// SkipTiFlash causes GetAllTiKVStores to skip a store when it is found to
	// be a TiFlash node.
	SkipTiFlash StoreBehavior = 1
	// TiFlashOnly causes GetAllTiKVStores to skip every store that is not a
	// TiFlash node.
	TiFlashOnly StoreBehavior = 2
)

// GetAllTiKVStores returns all TiKV stores registered to the PD client. The
// returned stores are never tombstones; stores labeled `engine=tiflash` are
// rejected, skipped, or exclusively selected according to storeBehavior.
func GetAllTiKVStores(
	ctx context.Context,
	pdClient pd.Client,
	storeBehavior StoreBehavior,
) ([]*metapb.Store, error) {
	// Get all live stores.
	stores, err := pdClient.GetAllStores(ctx, pd.WithExcludeTombstone())
	if err != nil {
		return nil, errors.Trace(err)
	}

	// Filter the stores in place according to storeBehavior.
	j := 0
	for _, store := range stores {
		isTiFlash := false
		if version.IsTiFlash(store) {
			if storeBehavior == SkipTiFlash {
				continue
			} else if storeBehavior == ErrorOnTiFlash {
				return nil, errors.Annotatef(berrors.ErrPDInvalidResponse,
					"cannot restore to a cluster with active TiFlash stores (store %d at %s)", store.Id, store.Address)
			}
			isTiFlash = true
		}
		if !isTiFlash && storeBehavior == TiFlashOnly {
			continue
		}
		stores[j] = store
		j++
	}
	return stores[:j], nil
}
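
// Editorial sketch (not part of the original file): one assumed way to use the
// filter above — count the stores that are both non-TiFlash and in the Up
// state, mirroring the liveness check NewMgr performs below.
func exampleCountLiveTiKVStores(ctx context.Context, pdClient pd.Client) (int, error) {
	stores, err := GetAllTiKVStores(ctx, pdClient, SkipTiFlash)
	if err != nil {
		return 0, errors.Trace(err)
	}
	live := 0
	for _, s := range stores {
		if s.GetState() == metapb.StoreState_Up {
			live++
		}
	}
	return live, nil
}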

// NewMgr creates a new Mgr.
//
// Domain is optional for backup; set `needDomain` to false to skip
// initializing the Domain.
func NewMgr(
	ctx context.Context,
	g glue.Glue,
	pdAddrs string,
	storage kv.Storage,
	tlsConf *tls.Config,
	securityOption pd.SecurityOption,
	keepalive keepalive.ClientParameters,
	storeBehavior StoreBehavior,
	checkRequirements bool,
	needDomain bool,
) (*Mgr, error) {
	if span := opentracing.SpanFromContext(ctx); span != nil && span.Tracer() != nil {
		span1 := span.Tracer().StartSpan("conn.NewMgr", opentracing.ChildOf(span.Context()))
		defer span1.Finish()
		ctx = opentracing.ContextWithSpan(ctx, span1)
	}

	tikvStorage, ok := storage.(tikv.Storage)
	if !ok {
		return nil, berrors.ErrKVNotTiKV
	}

	controller, err := pdutil.NewPdController(ctx, pdAddrs, tlsConf, securityOption)
	if err != nil {
		log.Error("fail to create pd controller", zap.Error(err))
		return nil, errors.Trace(err)
	}
	if checkRequirements {
		err = version.CheckClusterVersion(ctx, controller.GetPDClient(), version.CheckVersionForBR)
		if err != nil {
			return nil, errors.Annotate(err, "running BR in incompatible version of cluster, "+
				"if you believe it's OK, use --check-requirements=false to skip.")
		}
	}
	log.Info("new mgr", zap.String("pdAddrs", pdAddrs))

	// Check for live TiKV stores.
	stores, err := GetAllTiKVStores(ctx, controller.GetPDClient(), storeBehavior)
	if err != nil {
		log.Error("fail to get store", zap.Error(err))
		return nil, errors.Trace(err)
	}
	liveStoreCount := 0
	for _, s := range stores {
		if s.GetState() != metapb.StoreState_Up {
			continue
		}
		liveStoreCount++
	}

	var dom *domain.Domain
	if needDomain {
		dom, err = g.GetDomain(storage)
		if err != nil {
			return nil, errors.Trace(err)
		}
	}

	mgr := &Mgr{
		PdController: controller,
		storage:      storage,
		tikvStore:    tikvStorage,
		dom:          dom,
		tlsConf:      tlsConf,
		ownsStorage:  g.OwnsStorage(),
	}
	mgr.grpcClis.clis = make(map[uint64]*grpc.ClientConn)
	mgr.keepalive = keepalive
	return mgr, nil
}

func (mgr *Mgr) getGrpcConnLocked(ctx context.Context, storeID uint64) (*grpc.ClientConn, error) {
	failpoint.Inject("hint-get-backup-client", func(v failpoint.Value) {
		log.Info("failpoint hint-get-backup-client injected, "+
			"process will notify the shell.", zap.Uint64("store", storeID))
		if sigFile, ok := v.(string); ok {
			file, err := os.Create(sigFile)
			if err != nil {
				log.Warn("failed to create file for notifying, skipping notify", zap.Error(err))
			}
			if file != nil {
				file.Close()
			}
		}
		time.Sleep(3 * time.Second)
	})
	store, err := mgr.GetPDClient().GetStore(ctx, storeID)
	if err != nil {
		return nil, errors.Trace(err)
	}
	opt := grpc.WithInsecure()
	if mgr.tlsConf != nil {
		opt = grpc.WithTransportCredentials(credentials.NewTLS(mgr.tlsConf))
	}
	ctx, cancel := context.WithTimeout(ctx, dialTimeout)
	bfConf := backoff.DefaultConfig
	bfConf.MaxDelay = time.Second * 3
	addr := store.GetPeerAddress()
	if addr == "" {
		addr = store.GetAddress()
	}
	conn, err := grpc.DialContext(
		ctx,
		addr,
		opt,
		grpc.WithBlock(),
		grpc.WithConnectParams(grpc.ConnectParams{Backoff: bfConf}),
		grpc.WithKeepaliveParams(mgr.keepalive),
	)
	cancel()
	if err != nil {
		return nil, berrors.ErrFailedToConnect.Wrap(err).GenWithStack("failed to make connection to store %d", storeID)
	}
	return conn, nil
}

// GetBackupClient gets or creates a backup client for the given store.
func (mgr *Mgr) GetBackupClient(ctx context.Context, storeID uint64) (backuppb.BackupClient, error) {
	if ctx.Err() != nil {
		return nil, errors.Trace(ctx.Err())
	}

	mgr.grpcClis.mu.Lock()
	defer mgr.grpcClis.mu.Unlock()

	if conn, ok := mgr.grpcClis.clis[storeID]; ok {
		// Found a cached backup client.
		return backuppb.NewBackupClient(conn), nil
	}

	conn, err := mgr.getGrpcConnLocked(ctx, storeID)
	if err != nil {
		return nil, errors.Trace(err)
	}
	// Cache the conn.
	mgr.grpcClis.clis[storeID] = conn
	return backuppb.NewBackupClient(conn), nil
}

// ResetBackupClient resets the connection for the backup client of the given store.
func (mgr *Mgr) ResetBackupClient(ctx context.Context, storeID uint64) (backuppb.BackupClient, error) {
	if ctx.Err() != nil {
		return nil, errors.Trace(ctx.Err())
	}

	mgr.grpcClis.mu.Lock()
	defer mgr.grpcClis.mu.Unlock()

	if conn, ok := mgr.grpcClis.clis[storeID]; ok {
		// Found a cached backup client; close it before rebuilding the connection.
		log.Info("Reset backup client", zap.Uint64("storeID", storeID))
		err := conn.Close()
		if err != nil {
			log.Warn("close backup connection failed, ignore it", zap.Uint64("storeID", storeID))
		}
		delete(mgr.grpcClis.clis, storeID)
	}
	var (
		conn *grpc.ClientConn
		err  error
	)
	for retry := 0; retry < resetRetryTimes; retry++ {
		conn, err = mgr.getGrpcConnLocked(ctx, storeID)
		if err != nil {
			log.Warn("failed to reset grpc connection, retry it",
				zap.Int("retry time", retry), logutil.ShortError(err))
			time.Sleep(time.Duration(retry+3) * time.Second)
			continue
		}
		mgr.grpcClis.clis[storeID] = conn
		break
	}
	if err != nil {
		return nil, errors.Trace(err)
	}
	return backuppb.NewBackupClient(conn), nil
}
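
// Editorial sketch (not part of the original file): an assumed call pattern
// around the two methods above — reuse the cached client, and fall back to
// ResetBackupClient when the caller has decided the cached connection has gone
// bad (for example, after a transport-level error).
func exampleBackupClientWithReset(ctx context.Context, mgr *Mgr, storeID uint64, connBroken bool) (backuppb.BackupClient, error) {
	if connBroken {
		// Drop the cached connection and dial a fresh one with retries.
		return mgr.ResetBackupClient(ctx, storeID)
	}
	// Reuse the cached connection, dialing lazily on the first call.
	cli, err := mgr.GetBackupClient(ctx, storeID)
	if err != nil {
		return nil, errors.Trace(err)
	}
	return cli, nil
}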

// GetStorage returns the kv storage.
func (mgr *Mgr) GetStorage() kv.Storage {
	return mgr.storage
}

// GetTLSConfig returns the tls config.
func (mgr *Mgr) GetTLSConfig() *tls.Config {
	return mgr.tlsConf
}

// GetLockResolver gets the LockResolver.
func (mgr *Mgr) GetLockResolver() *txnlock.LockResolver {
	return mgr.tikvStore.GetLockResolver()
}

// GetDomain returns the TiDB domain.
func (mgr *Mgr) GetDomain() *domain.Domain {
	return mgr.dom
}

// Close closes all clients in Mgr.
func (mgr *Mgr) Close() {
	mgr.grpcClis.mu.Lock()
	for _, cli := range mgr.grpcClis.clis {
		err := cli.Close()
		if err != nil {
			log.Error("fail to close Mgr", zap.Error(err))
		}
	}
	mgr.grpcClis.mu.Unlock()

	// Gracefully shut down the domain so it does not affect other TiDB DDL.
	// The domain must be closed before the storage, otherwise it gets stuck forever.
	if mgr.ownsStorage {
		if mgr.dom != nil {
			mgr.dom.Close()
		}
		tikv.StoreShuttingDown(1)
		mgr.storage.Close()
	}

	mgr.PdController.Close()
}
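
// Editorial lifecycle sketch (not part of the original file): the usual shape
// of Mgr usage — construct it, drive a task with it, and always Close it so
// cached gRPC connections, the Domain, and the PD controller are released.
// All arguments are assumed to come from the caller's configuration; the
// options chosen here are only illustrative.
func exampleMgrLifecycle(
	ctx context.Context,
	g glue.Glue,
	pdAddrs string,
	storage kv.Storage,
	keepaliveConf keepalive.ClientParameters,
) error {
	mgr, err := NewMgr(ctx, g, pdAddrs, storage,
		nil,                 // tlsConf: plaintext connections in this sketch
		pd.SecurityOption{}, // no extra PD security options
		keepaliveConf,
		SkipTiFlash, // ignore TiFlash stores when listing TiKV
		true,        // checkRequirements
		true,        // needDomain
	)
	if err != nil {
		return errors.Trace(err)
	}
	defer mgr.Close()

	// ... drive a backup or restore task with mgr here ...
	return nil
}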