github.com/pingcap/br@v5.3.0-alpha.0.20220125034240-ec59c7b6ce30+incompatible/pkg/conn/conn.go

// Copyright 2020 PingCAP, Inc. Licensed under Apache-2.0.

package conn

import (
	"context"
	"crypto/tls"
	"os"
	"sync"
	"time"

	"github.com/opentracing/opentracing-go"
	"github.com/pingcap/errors"
	"github.com/pingcap/failpoint"
	"github.com/pingcap/log"
	"github.com/pingcap/tidb/domain"
	"github.com/pingcap/tidb/kv"
	"github.com/tikv/client-go/v2/tikv"
	"github.com/tikv/client-go/v2/txnkv/txnlock"
	pd "github.com/tikv/pd/client"
	"go.uber.org/zap"
	"google.golang.org/grpc"
	"google.golang.org/grpc/backoff"
	"google.golang.org/grpc/credentials"
	"google.golang.org/grpc/keepalive"

	backuppb "github.com/pingcap/kvproto/pkg/backup"
	"github.com/pingcap/kvproto/pkg/metapb"

	berrors "github.com/pingcap/br/pkg/errors"
	"github.com/pingcap/br/pkg/glue"
	"github.com/pingcap/br/pkg/logutil"
	"github.com/pingcap/br/pkg/pdutil"
	"github.com/pingcap/br/pkg/version"
)

const (
	dialTimeout = 30 * time.Second

	resetRetryTimes = 3
)

// Pool is a lazy pool of gRPC channels.
// When `Get` is called, it lazily allocates a new connection if the pool is not full.
// If the pool is full, it returns the already-allocated channels in round-robin order.
type Pool struct {
	mu sync.Mutex

	conns   []*grpc.ClientConn
	next    int
	cap     int
	newConn func(ctx context.Context) (*grpc.ClientConn, error)
}

func (p *Pool) takeConns() (conns []*grpc.ClientConn) {
	p.mu.Lock()
	defer p.mu.Unlock()
	p.conns, conns = nil, p.conns
	p.next = 0
	return conns
}

// Close closes the conn pool.
func (p *Pool) Close() {
	for _, c := range p.takeConns() {
		if err := c.Close(); err != nil {
			log.Warn("failed to close clientConn", zap.String("target", c.Target()), zap.Error(err))
		}
	}
}

// Get tries to get an existing connection from the pool, or makes a new one if the pool is not full.
func (p *Pool) Get(ctx context.Context) (*grpc.ClientConn, error) {
	p.mu.Lock()
	defer p.mu.Unlock()
	if len(p.conns) < p.cap {
		c, err := p.newConn(ctx)
		if err != nil {
			return nil, err
		}
		p.conns = append(p.conns, c)
		return c, nil
	}

	conn := p.conns[p.next]
	p.next = (p.next + 1) % p.cap
	return conn, nil
}

// NewConnPool creates a new Pool with the specified conn factory function and capacity.
func NewConnPool(cap int, newConn func(ctx context.Context) (*grpc.ClientConn, error)) *Pool {
	return &Pool{
		cap:     cap,
		conns:   make([]*grpc.ClientConn, 0, cap),
		newConn: newConn,

		mu: sync.Mutex{},
	}
}
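
// A minimal usage sketch for Pool (illustrative only, not part of the
// original file; the dial target below is a hypothetical TiKV address).
// Once `cap` connections exist, Get hands out the same channels round-robin,
// so callers must not close a returned conn themselves; Pool.Close owns the
// lifecycle:
//
//	pool := NewConnPool(4, func(ctx context.Context) (*grpc.ClientConn, error) {
//		// Hypothetical address; a real caller would resolve it from PD.
//		return grpc.DialContext(ctx, "127.0.0.1:20160", grpc.WithInsecure(), grpc.WithBlock())
//	})
//	defer pool.Close()
//	conn, err := pool.Get(ctx)
//	if err != nil {
//		return err
//	}
//	cli := backuppb.NewBackupClient(conn)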

// Mgr manages connections to a TiDB cluster.
type Mgr struct {
	*pdutil.PdController
	tlsConf   *tls.Config
	dom       *domain.Domain
	storage   kv.Storage   // Used to access SQL related interfaces.
	tikvStore tikv.Storage // Used to access TiKV specific interfaces.
	grpcClis  struct {
		mu   sync.Mutex
		clis map[uint64]*grpc.ClientConn
	}
	keepalive   keepalive.ClientParameters
	ownsStorage bool
}

// StoreBehavior is the action to take in GetAllTiKVStores when a non-TiKV
// store (e.g. a TiFlash store) is found.
type StoreBehavior uint8

const (
	// ErrorOnTiFlash causes GetAllTiKVStores to return an error when the store
	// is found to be a TiFlash node.
	ErrorOnTiFlash StoreBehavior = 0
	// SkipTiFlash causes GetAllTiKVStores to skip the store when it is found to
	// be a TiFlash node.
	SkipTiFlash StoreBehavior = 1
	// TiFlashOnly causes GetAllTiKVStores to skip every store which is not a
	// TiFlash node.
	TiFlashOnly StoreBehavior = 2
)

// GetAllTiKVStores returns all TiKV stores registered to the PD client.
// Tombstone stores are always excluded; stores carrying the label
// `engine=tiflash` are skipped, rejected, or exclusively returned, depending
// on storeBehavior.
func GetAllTiKVStores(
	ctx context.Context,
	pdClient pd.Client,
	storeBehavior StoreBehavior,
) ([]*metapb.Store, error) {
	// Get all stores that are not tombstones.
	stores, err := pdClient.GetAllStores(ctx, pd.WithExcludeTombstone())
	if err != nil {
		return nil, errors.Trace(err)
	}

	// Filter the slice in place according to storeBehavior.
	j := 0
	for _, store := range stores {
		isTiFlash := false
		if version.IsTiFlash(store) {
			if storeBehavior == SkipTiFlash {
				continue
			} else if storeBehavior == ErrorOnTiFlash {
				return nil, errors.Annotatef(berrors.ErrPDInvalidResponse,
					"cannot restore to a cluster with active TiFlash stores (store %d at %s)", store.Id, store.Address)
			}
			isTiFlash = true
		}
		if !isTiFlash && storeBehavior == TiFlashOnly {
			continue
		}
		stores[j] = store
		j++
	}
	return stores[:j], nil
}
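
// An illustrative sketch of how the StoreBehavior variants change the result
// of GetAllTiKVStores (pdClient stands for any pd.Client; the counts assume a
// hypothetical cluster with three TiKV stores and one TiFlash store):
//
//	tikvStores, _ := GetAllTiKVStores(ctx, pdClient, SkipTiFlash)   // 3 TiKV stores
//	flashStores, _ := GetAllTiKVStores(ctx, pdClient, TiFlashOnly)  // 1 TiFlash store
//	_, err := GetAllTiKVStores(ctx, pdClient, ErrorOnTiFlash)       // ErrPDInvalidResponse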

// NewMgr creates a new Mgr.
//
// The Domain is optional for backup; set `needDomain` to false to skip
// initializing it.
func NewMgr(
	ctx context.Context,
	g glue.Glue,
	pdAddrs string,
	storage kv.Storage,
	tlsConf *tls.Config,
	securityOption pd.SecurityOption,
	keepalive keepalive.ClientParameters,
	storeBehavior StoreBehavior,
	checkRequirements bool,
	needDomain bool,
) (*Mgr, error) {
	if span := opentracing.SpanFromContext(ctx); span != nil && span.Tracer() != nil {
		span1 := span.Tracer().StartSpan("conn.NewMgr", opentracing.ChildOf(span.Context()))
		defer span1.Finish()
		ctx = opentracing.ContextWithSpan(ctx, span1)
	}

	tikvStorage, ok := storage.(tikv.Storage)
	if !ok {
		return nil, berrors.ErrKVNotTiKV
	}

	controller, err := pdutil.NewPdController(ctx, pdAddrs, tlsConf, securityOption)
	if err != nil {
		log.Error("fail to create pd controller", zap.Error(err))
		return nil, errors.Trace(err)
	}
	if checkRequirements {
		err = version.CheckClusterVersion(ctx, controller.GetPDClient(), version.CheckVersionForBR)
		if err != nil {
			return nil, errors.Annotate(err, "running BR in incompatible version of cluster, "+
				"if you believe it's OK, use --check-requirements=false to skip.")
		}
	}
	log.Info("new mgr", zap.String("pdAddrs", pdAddrs))

	// Check for live TiKV stores.
	stores, err := GetAllTiKVStores(ctx, controller.GetPDClient(), storeBehavior)
	if err != nil {
		log.Error("fail to get store", zap.Error(err))
		return nil, errors.Trace(err)
	}
	liveStoreCount := 0
	for _, s := range stores {
		if s.GetState() != metapb.StoreState_Up {
			continue
		}
		liveStoreCount++
	}

	var dom *domain.Domain
	if needDomain {
		dom, err = g.GetDomain(storage)
		if err != nil {
			return nil, errors.Trace(err)
		}
	}

	mgr := &Mgr{
		PdController: controller,
		storage:      storage,
		tikvStore:    tikvStorage,
		dom:          dom,
		tlsConf:      tlsConf,
		ownsStorage:  g.OwnsStorage(),
	}
	mgr.grpcClis.clis = make(map[uint64]*grpc.ClientConn)
	mgr.keepalive = keepalive
	return mgr, nil
}

func (mgr *Mgr) getGrpcConnLocked(ctx context.Context, storeID uint64) (*grpc.ClientConn, error) {
	failpoint.Inject("hint-get-backup-client", func(v failpoint.Value) {
		log.Info("failpoint hint-get-backup-client injected, "+
			"process will notify the shell.", zap.Uint64("store", storeID))
		if sigFile, ok := v.(string); ok {
			file, err := os.Create(sigFile)
			if err != nil {
				log.Warn("failed to create file for notifying, skipping notify", zap.Error(err))
			}
			if file != nil {
				file.Close()
			}
		}
		time.Sleep(3 * time.Second)
	})
	store, err := mgr.GetPDClient().GetStore(ctx, storeID)
	if err != nil {
		return nil, errors.Trace(err)
	}
	opt := grpc.WithInsecure()
	if mgr.tlsConf != nil {
		opt = grpc.WithTransportCredentials(credentials.NewTLS(mgr.tlsConf))
	}
	ctx, cancel := context.WithTimeout(ctx, dialTimeout)
	bfConf := backoff.DefaultConfig
	bfConf.MaxDelay = time.Second * 3
	addr := store.GetPeerAddress()
	if addr == "" {
		addr = store.GetAddress()
	}
	conn, err := grpc.DialContext(
		ctx,
		addr,
		opt,
		grpc.WithBlock(),
		grpc.WithConnectParams(grpc.ConnectParams{Backoff: bfConf}),
		grpc.WithKeepaliveParams(mgr.keepalive),
	)
	cancel()
	if err != nil {
		return nil, berrors.ErrFailedToConnect.Wrap(err).GenWithStack("failed to make connection to store %d", storeID)
	}
	return conn, nil
}

// GetBackupClient gets or creates a backup client.
func (mgr *Mgr) GetBackupClient(ctx context.Context, storeID uint64) (backuppb.BackupClient, error) {
	if ctx.Err() != nil {
		return nil, errors.Trace(ctx.Err())
	}

	mgr.grpcClis.mu.Lock()
	defer mgr.grpcClis.mu.Unlock()

	if conn, ok := mgr.grpcClis.clis[storeID]; ok {
		// Found a cached backup client.
		return backuppb.NewBackupClient(conn), nil
	}

	conn, err := mgr.getGrpcConnLocked(ctx, storeID)
	if err != nil {
		return nil, errors.Trace(err)
	}
	// Cache the conn.
	mgr.grpcClis.clis[storeID] = conn
	return backuppb.NewBackupClient(conn), nil
}
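
// A sketch of the intended get/reset pattern, inferred from the API shape
// rather than prescribed by this file (req is a hypothetical
// *backuppb.BackupRequest): hold a client from GetBackupClient and, when the
// stream breaks, swap the cached conn via ResetBackupClient before retrying:
//
//	cli, err := mgr.GetBackupClient(ctx, storeID)
//	if err != nil {
//		return err
//	}
//	if _, err = cli.Backup(ctx, req); err != nil {
//		// The cached conn may be broken; drop it and redial.
//		cli, err = mgr.ResetBackupClient(ctx, storeID)
//	}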

// ResetBackupClient resets the connection for the backup client.
func (mgr *Mgr) ResetBackupClient(ctx context.Context, storeID uint64) (backuppb.BackupClient, error) {
	if ctx.Err() != nil {
		return nil, errors.Trace(ctx.Err())
	}

	mgr.grpcClis.mu.Lock()
	defer mgr.grpcClis.mu.Unlock()

	if conn, ok := mgr.grpcClis.clis[storeID]; ok {
		// Found a cached backup client; close it and drop it from the cache.
		log.Info("Reset backup client", zap.Uint64("storeID", storeID))
		err := conn.Close()
		if err != nil {
			log.Warn("close backup connection failed, ignore it", zap.Uint64("storeID", storeID))
		}
		delete(mgr.grpcClis.clis, storeID)
	}
	var (
		conn *grpc.ClientConn
		err  error
	)
	for retry := 0; retry < resetRetryTimes; retry++ {
		conn, err = mgr.getGrpcConnLocked(ctx, storeID)
		if err != nil {
			log.Warn("failed to reset grpc connection, retry it",
				zap.Int("retry time", retry), logutil.ShortError(err))
			time.Sleep(time.Duration(retry+3) * time.Second)
			continue
		}
		mgr.grpcClis.clis[storeID] = conn
		break
	}
	if err != nil {
		return nil, errors.Trace(err)
	}
	return backuppb.NewBackupClient(conn), nil
}

// GetStorage returns the kv storage.
func (mgr *Mgr) GetStorage() kv.Storage {
	return mgr.storage
}

// GetTLSConfig returns the TLS config.
func (mgr *Mgr) GetTLSConfig() *tls.Config {
	return mgr.tlsConf
}

// GetLockResolver gets the LockResolver.
func (mgr *Mgr) GetLockResolver() *txnlock.LockResolver {
	return mgr.tikvStore.GetLockResolver()
}

// GetDomain returns the Domain.
func (mgr *Mgr) GetDomain() *domain.Domain {
	return mgr.dom
}

// Close closes all clients in Mgr.
func (mgr *Mgr) Close() {
	mgr.grpcClis.mu.Lock()
	for _, cli := range mgr.grpcClis.clis {
		err := cli.Close()
		if err != nil {
			log.Error("fail to close Mgr", zap.Error(err))
		}
	}
	mgr.grpcClis.mu.Unlock()

	// Gracefully shut down the domain so it does not affect other TiDB DDL.
	// Must close the domain before closing the storage; otherwise it gets stuck forever.
	if mgr.ownsStorage {
		if mgr.dom != nil {
			mgr.dom.Close()
		}
		tikv.StoreShuttingDown(1)
		mgr.storage.Close()
	}

	mgr.PdController.Close()
}
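
// An end-to-end lifecycle sketch (illustrative; the PD address and keepalive
// values are assumptions, not defaults from this file). Mgr embeds
// pdutil.PdController, so PD helpers are available on mgr directly, and Close
// tears down the gRPC conns, the Domain and storage (if owned), and the PD
// controller, in that order:
//
//	mgr, err := NewMgr(ctx, g, "127.0.0.1:2379", storage, nil /* tlsConf */,
//		pd.SecurityOption{},
//		keepalive.ClientParameters{Time: 10 * time.Second, Timeout: 3 * time.Second},
//		SkipTiFlash, true /* checkRequirements */, true /* needDomain */)
//	if err != nil {
//		return err
//	}
//	defer mgr.Close()
//	cli, err := mgr.GetBackupClient(ctx, storeID)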