github.com/pingcap/br@v5.3.0-alpha.0.20220125034240-ec59c7b6ce30+incompatible/pkg/restore/ingester.go (about) 1 // Copyright 2020 PingCAP, Inc. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // See the License for the specific language governing permissions and 12 // limitations under the License. 13 14 package restore 15 16 import ( 17 "bytes" 18 "context" 19 "crypto/tls" 20 "strings" 21 "sync" 22 "time" 23 24 "github.com/google/uuid" 25 "github.com/pingcap/errors" 26 "github.com/pingcap/failpoint" 27 "github.com/pingcap/kvproto/pkg/errorpb" 28 sst "github.com/pingcap/kvproto/pkg/import_sstpb" 29 "github.com/pingcap/kvproto/pkg/kvrpcpb" 30 "github.com/pingcap/kvproto/pkg/metapb" 31 "github.com/pingcap/log" 32 "github.com/tikv/pd/pkg/codec" 33 "go.uber.org/zap" 34 "google.golang.org/grpc" 35 "google.golang.org/grpc/backoff" 36 "google.golang.org/grpc/credentials" 37 "google.golang.org/grpc/keepalive" 38 39 "github.com/pingcap/br/pkg/conn" 40 berrors "github.com/pingcap/br/pkg/errors" 41 "github.com/pingcap/br/pkg/kv" 42 "github.com/pingcap/br/pkg/logutil" 43 "github.com/pingcap/br/pkg/membuf" 44 "github.com/pingcap/br/pkg/utils" 45 ) 46 47 const ( 48 dialTimeout = 5 * time.Second 49 50 gRPCKeepAliveTime = 10 * time.Second 51 gRPCKeepAliveTimeout = 3 * time.Second 52 53 // See: https://github.com/tikv/tikv/blob/e030a0aae9622f3774df89c62f21b2171a72a69e/etc/config-template.toml#L360 54 regionMaxKeyCount = 1440000 55 56 defaultSplitSize = 96 * 1024 * 1024 57 ) 58 59 type retryType int 60 61 const ( 62 retryNone retryType = iota 63 retryWrite 64 retryIngest 65 ) 66 67 type gRPCConns struct { 68 mu sync.Mutex 69 conns map[uint64]*conn.Pool 70 tcpConcurrency int 71 } 72 73 func (conns *gRPCConns) Close() { 74 conns.mu.Lock() 75 defer conns.mu.Unlock() 76 77 for _, cp := range conns.conns { 78 cp.Close() 79 } 80 } 81 82 // Ingester writes and ingests kv to TiKV. 83 // which used for both BR log restore and Lightning local backend. 84 type Ingester struct { 85 // commit ts appends to key in tikv 86 TS uint64 87 88 tlsConf *tls.Config 89 conns gRPCConns 90 91 splitCli SplitClient 92 WorkerPool *utils.WorkerPool 93 94 batchWriteKVPairs int 95 regionSplitSize int64 96 } 97 98 // NewIngester creates Ingester. 99 func NewIngester( 100 splitCli SplitClient, cfg concurrencyCfg, commitTS uint64, tlsConf *tls.Config, 101 ) *Ingester { 102 workerPool := utils.NewWorkerPool(cfg.IngestConcurrency, "ingest worker") 103 return &Ingester{ 104 tlsConf: tlsConf, 105 conns: gRPCConns{ 106 tcpConcurrency: cfg.TCPConcurrency, 107 conns: make(map[uint64]*conn.Pool), 108 }, 109 splitCli: splitCli, 110 WorkerPool: workerPool, 111 batchWriteKVPairs: cfg.BatchWriteKVPairs, 112 regionSplitSize: defaultSplitSize, 113 TS: commitTS, 114 } 115 } 116 117 func (i *Ingester) makeConn(ctx context.Context, storeID uint64) (*grpc.ClientConn, error) { 118 store, err := i.splitCli.GetStore(ctx, storeID) 119 if err != nil { 120 return nil, errors.Trace(err) 121 } 122 opt := grpc.WithInsecure() 123 if i.tlsConf != nil { 124 opt = grpc.WithTransportCredentials(credentials.NewTLS(i.tlsConf)) 125 } 126 ctx, cancel := context.WithTimeout(ctx, dialTimeout) 127 128 bfConf := backoff.DefaultConfig 129 bfConf.MaxDelay = gRPCBackOffMaxDelay 130 // we should use peer address for tiflash. for tikv, peer address is empty 131 addr := store.GetPeerAddress() 132 if addr == "" { 133 addr = store.GetAddress() 134 } 135 grpcConn, err := grpc.DialContext( 136 ctx, 137 addr, 138 opt, 139 grpc.WithConnectParams(grpc.ConnectParams{Backoff: bfConf}), 140 grpc.WithKeepaliveParams(keepalive.ClientParameters{ 141 Time: gRPCKeepAliveTime, 142 Timeout: gRPCKeepAliveTimeout, 143 PermitWithoutStream: true, 144 }), 145 ) 146 cancel() 147 if err != nil { 148 return nil, errors.Trace(err) 149 } 150 return grpcConn, nil 151 } 152 153 // write [start, end) kv in to tikv. 154 func (i *Ingester) writeAndIngestByRange( 155 ctxt context.Context, 156 iterProducer kv.IterProducer, 157 start []byte, 158 end []byte, 159 remainRanges *syncdRanges, 160 ) error { 161 select { 162 case <-ctxt.Done(): 163 return ctxt.Err() 164 default: 165 } 166 iter := iterProducer.Produce(start, end) 167 iter.First() 168 pairStart := append([]byte{}, iter.Key()...) 169 iter.Last() 170 pairEnd := append([]byte{}, iter.Key()...) 171 if bytes.Compare(pairStart, pairEnd) > 0 { 172 log.Debug("There is no pairs in iterator", logutil.Key("start", start), 173 logutil.Key("end", end), logutil.Key("pairStart", pairStart), logutil.Key("pairEnd", pairEnd)) 174 return nil 175 } 176 var regions []*RegionInfo 177 var err error 178 ctx, cancel := context.WithCancel(ctxt) 179 defer cancel() 180 181 WriteAndIngest: 182 for retry := 0; retry < maxRetryTimes; { 183 if retry != 0 { 184 select { 185 case <-time.After(time.Second): 186 case <-ctx.Done(): 187 return ctx.Err() 188 } 189 } 190 startKey := codec.EncodeBytes(pairStart) 191 endKey := codec.EncodeBytes(kv.NextKey(pairEnd)) 192 regions, err = PaginateScanRegion(ctx, i.splitCli, startKey, endKey, 128) 193 if err != nil || len(regions) == 0 { 194 log.Warn("scan region failed", zap.Error(err), zap.Int("region_len", len(regions)), 195 logutil.Key("startKey", startKey), logutil.Key("endKey", endKey), zap.Int("retry", retry)) 196 retry++ 197 continue 198 } 199 200 for _, region := range regions { 201 log.Debug("get region", zap.Int("retry", retry), logutil.Key("startKey", startKey), 202 logutil.Key("endKey", endKey), logutil.Region(region.Region)) 203 w := i.WorkerPool.ApplyWorker() 204 var rg *Range 205 rg, err = i.writeAndIngestPairs(ctx, iter, region, pairStart, pairEnd) 206 i.WorkerPool.RecycleWorker(w) 207 if err != nil { 208 _, regionStart, _ := codec.DecodeBytes(region.Region.StartKey) 209 // if we have at least succeeded one region, retry without increasing the retry count 210 if bytes.Compare(regionStart, pairStart) > 0 { 211 pairStart = regionStart 212 } else { 213 retry++ 214 } 215 log.Info("retry write and ingest kv pairs", logutil.Key("startKey", pairStart), 216 logutil.Key("endKey", pairEnd), zap.Error(err), zap.Int("retry", retry)) 217 continue WriteAndIngest 218 } 219 if rg != nil { 220 remainRanges.add(*rg) 221 } 222 } 223 break 224 } 225 return err 226 } 227 228 func (i *Ingester) writeAndIngestPairs( 229 ctx context.Context, 230 iter kv.Iter, 231 region *RegionInfo, 232 start, end []byte, 233 ) (*Range, error) { 234 var err error 235 var remainRange *Range 236 loopWrite: 237 for retry := 0; retry < maxRetryTimes; retry++ { 238 select { 239 case <-ctx.Done(): 240 return remainRange, ctx.Err() 241 default: 242 } 243 var metas []*sst.SSTMeta 244 metas, remainRange, err = i.writeToTiKV(ctx, iter, region, start, end) 245 if err != nil { 246 log.Warn("write to tikv failed", zap.Error(err)) 247 return nil, err 248 } 249 250 for _, meta := range metas { 251 errCnt := 0 252 for errCnt < maxRetryTimes { 253 log.Debug("ingest meta", zap.Reflect("meta", meta)) 254 var resp *sst.IngestResponse 255 resp, err = i.ingest(ctx, meta, region) 256 if err != nil { 257 log.Warn("ingest failed", zap.Error(err), logutil.SSTMeta(meta), 258 zap.Reflect("region", region)) 259 errCnt++ 260 continue 261 } 262 failpoint.Inject("FailIngestMeta", func(val failpoint.Value) { 263 switch val.(string) { 264 case "notleader": 265 resp.Error.NotLeader = &errorpb.NotLeader{ 266 RegionId: region.Region.Id, Leader: region.Leader, 267 } 268 case "epochnotmatch": 269 resp.Error.EpochNotMatch = &errorpb.EpochNotMatch{ 270 CurrentRegions: []*metapb.Region{region.Region}, 271 } 272 } 273 }) 274 var retryTy retryType 275 var newRegion *RegionInfo 276 retryTy, newRegion, err = i.isIngestRetryable(ctx, resp, region, meta) 277 if err == nil { 278 // ingest next meta 279 break 280 } 281 switch retryTy { 282 case retryNone: 283 log.Warn("ingest failed and do not retry", zap.Error(err), logutil.SSTMeta(meta), 284 zap.Reflect("region", region)) 285 // met non-retryable error retry whole Write procedure 286 return remainRange, err 287 case retryWrite: 288 region = newRegion 289 continue loopWrite 290 case retryIngest: 291 region = newRegion 292 continue 293 } 294 } 295 } 296 297 if err != nil { 298 log.Warn("write and ingest region, will retry import full range", zap.Error(err), 299 logutil.Region(region.Region), logutil.Key("start", start), logutil.Key("end", end)) 300 } 301 return remainRange, errors.Trace(err) 302 } 303 304 return remainRange, errors.Trace(err) 305 } 306 307 // writeToTiKV writer engine key-value pairs to tikv and return the sst meta generated by tikv. 308 // we don't need to do cleanup for the pairs written to tikv if encounters an error, 309 // tikv will takes the responsibility to do so. 310 func (i *Ingester) writeToTiKV( 311 ctx context.Context, 312 iter kv.Iter, 313 region *RegionInfo, 314 start, end []byte, 315 ) ([]*sst.SSTMeta, *Range, error) { 316 begin := time.Now() 317 regionRange := intersectRange(region.Region, Range{Start: start, End: end}) 318 319 iter.Seek(regionRange.Start) 320 firstKey := codec.EncodeBytes(iter.Key()) 321 var lastKey []byte 322 if iter.Seek(regionRange.End) { 323 lastKey = codec.EncodeBytes(iter.Key()) 324 } else { 325 iter.Last() 326 log.Info("region range's end key not in iter, shouldn't happen", 327 zap.Any("region range", regionRange), logutil.Key("iter last", iter.Key())) 328 lastKey = codec.EncodeBytes(kv.NextKey(iter.Key())) 329 } 330 331 if bytes.Compare(firstKey, lastKey) > 0 { 332 log.Info("keys within region is empty, skip ingest", logutil.Key("start", start), 333 logutil.Key("regionStart", region.Region.StartKey), logutil.Key("end", end), 334 logutil.Key("regionEnd", region.Region.EndKey)) 335 return nil, nil, nil 336 } 337 338 u := uuid.New() 339 meta := &sst.SSTMeta{ 340 Uuid: u[:], 341 RegionId: region.Region.GetId(), 342 RegionEpoch: region.Region.GetRegionEpoch(), 343 Range: &sst.Range{ 344 Start: firstKey, 345 End: lastKey, 346 }, 347 } 348 349 leaderID := region.Leader.GetId() 350 clients := make([]sst.ImportSST_WriteClient, 0, len(region.Region.GetPeers())) 351 requests := make([]*sst.WriteRequest, 0, len(region.Region.GetPeers())) 352 for _, peer := range region.Region.GetPeers() { 353 cli, err := i.getImportClient(ctx, peer) 354 if err != nil { 355 return nil, nil, err 356 } 357 358 wstream, err := cli.Write(ctx) 359 if err != nil { 360 return nil, nil, errors.Trace(err) 361 } 362 363 // Bind uuid for this write request 364 req := &sst.WriteRequest{ 365 Chunk: &sst.WriteRequest_Meta{ 366 Meta: meta, 367 }, 368 } 369 if err = wstream.Send(req); err != nil { 370 return nil, nil, errors.Trace(err) 371 } 372 req.Chunk = &sst.WriteRequest_Batch{ 373 Batch: &sst.WriteBatch{ 374 CommitTs: i.TS, 375 }, 376 } 377 clients = append(clients, wstream) 378 requests = append(requests, req) 379 } 380 381 bytesBuf := membuf.NewBuffer() 382 defer bytesBuf.Destroy() 383 pairs := make([]*sst.Pair, 0, i.batchWriteKVPairs) 384 count := 0 385 size := int64(0) 386 totalCount := 0 387 firstLoop := true 388 regionMaxSize := i.regionSplitSize * 4 / 3 389 390 for iter.Seek(regionRange.Start); iter.Valid() && bytes.Compare(iter.Key(), regionRange.End) <= 0; iter.Next() { 391 size += int64(len(iter.Key()) + len(iter.Value())) 392 // here we reuse the `*sst.Pair`s to optimize object allocation 393 if firstLoop { 394 pair := &sst.Pair{ 395 Key: bytesBuf.AddBytes(iter.Key()), 396 Value: bytesBuf.AddBytes(iter.Value()), 397 Op: iter.OpType(), 398 } 399 pairs = append(pairs, pair) 400 } else { 401 pairs[count].Key = bytesBuf.AddBytes(iter.Key()) 402 pairs[count].Value = bytesBuf.AddBytes(iter.Value()) 403 } 404 count++ 405 totalCount++ 406 407 if count >= i.batchWriteKVPairs { 408 for i := range clients { 409 requests[i].Chunk.(*sst.WriteRequest_Batch).Batch.Pairs = pairs[:count] 410 if err := clients[i].Send(requests[i]); err != nil { 411 return nil, nil, errors.Trace(err) 412 } 413 } 414 count = 0 415 bytesBuf.Reset() 416 firstLoop = false 417 } 418 if size >= regionMaxSize || totalCount >= regionMaxKeyCount { 419 break 420 } 421 } 422 423 if count > 0 { 424 for i := range clients { 425 requests[i].Chunk.(*sst.WriteRequest_Batch).Batch.Pairs = pairs[:count] 426 if err := clients[i].Send(requests[i]); err != nil { 427 return nil, nil, errors.Trace(err) 428 } 429 } 430 } 431 432 if iter.Error() != nil { 433 return nil, nil, errors.Trace(iter.Error()) 434 } 435 436 var leaderPeerMetas []*sst.SSTMeta 437 for i, wStream := range clients { 438 if resp, closeErr := wStream.CloseAndRecv(); closeErr != nil { 439 return nil, nil, errors.Trace(closeErr) 440 } else if leaderID == region.Region.Peers[i].GetId() { 441 leaderPeerMetas = resp.Metas 442 log.Debug("get metas after write kv stream to tikv", zap.Reflect("metas", leaderPeerMetas)) 443 } 444 } 445 446 // if there is not leader currently, we should directly return an error 447 if leaderPeerMetas == nil { 448 log.Warn("write to tikv no leader", zap.Reflect("region", region), 449 zap.Uint64("leader_id", leaderID), zap.Reflect("meta", meta), 450 zap.Int("kv_pairs", totalCount), zap.Int64("total_bytes", size)) 451 return nil, nil, errors.Annotatef(berrors.ErrPDLeaderNotFound, "write to tikv with no leader returned, region '%d', leader: %d", 452 region.Region.Id, leaderID) 453 } 454 455 log.Debug("write to kv", zap.Reflect("region", region), zap.Uint64("leader", leaderID), 456 zap.Reflect("meta", meta), zap.Reflect("return metas", leaderPeerMetas), 457 zap.Int("kv_pairs", totalCount), zap.Int64("total_bytes", size), 458 zap.Int64("buf_size", bytesBuf.TotalSize()), 459 zap.Stringer("takeTime", time.Since(begin))) 460 461 var remainRange *Range 462 if iter.Valid() && iter.Next() { 463 firstKey := append([]byte{}, iter.Key()...) 464 remainRange = &Range{Start: firstKey, End: regionRange.End} 465 log.Info("write to tikv partial finish", zap.Int("count", totalCount), 466 zap.Int64("size", size), zap.Binary("startKey", regionRange.Start), zap.Binary("endKey", regionRange.End), 467 zap.Binary("remainStart", remainRange.Start), zap.Binary("remainEnd", remainRange.End), 468 zap.Reflect("region", region)) 469 } 470 471 return leaderPeerMetas, remainRange, nil 472 } 473 474 func (i *Ingester) ingest(ctx context.Context, meta *sst.SSTMeta, region *RegionInfo) (*sst.IngestResponse, error) { 475 leader := region.Leader 476 if leader == nil { 477 leader = region.Region.GetPeers()[0] 478 } 479 480 cli, err := i.getImportClient(ctx, leader) 481 if err != nil { 482 return nil, err 483 } 484 reqCtx := &kvrpcpb.Context{ 485 RegionId: region.Region.GetId(), 486 RegionEpoch: region.Region.GetRegionEpoch(), 487 Peer: leader, 488 } 489 490 req := &sst.IngestRequest{ 491 Context: reqCtx, 492 Sst: meta, 493 } 494 resp, err := cli.Ingest(ctx, req) 495 if err != nil { 496 return nil, errors.Trace(err) 497 } 498 return resp, nil 499 } 500 501 func (i *Ingester) getImportClient(ctx context.Context, peer *metapb.Peer) (sst.ImportSSTClient, error) { 502 i.conns.mu.Lock() 503 defer i.conns.mu.Unlock() 504 505 conn, err := i.getGrpcConnLocked(ctx, peer.GetStoreId()) 506 if err != nil { 507 return nil, errors.Trace(err) 508 } 509 return sst.NewImportSSTClient(conn), nil 510 } 511 512 func (i *Ingester) getGrpcConnLocked(ctx context.Context, storeID uint64) (*grpc.ClientConn, error) { 513 if _, ok := i.conns.conns[storeID]; !ok { 514 i.conns.conns[storeID] = conn.NewConnPool(i.conns.tcpConcurrency, func(ctx context.Context) (*grpc.ClientConn, error) { 515 return i.makeConn(ctx, storeID) 516 }) 517 } 518 return i.conns.conns[storeID].Get(ctx) 519 } 520 521 func (i *Ingester) isIngestRetryable( 522 ctx context.Context, 523 resp *sst.IngestResponse, 524 region *RegionInfo, 525 meta *sst.SSTMeta, 526 ) (retryType, *RegionInfo, error) { 527 if resp.GetError() == nil { 528 return retryNone, nil, nil 529 } 530 531 getRegion := func() (*RegionInfo, error) { 532 for retry := 0; ; retry++ { 533 newRegion, err := i.splitCli.GetRegion(ctx, region.Region.GetStartKey()) 534 if err != nil { 535 return nil, errors.Trace(err) 536 } 537 if newRegion != nil { 538 return newRegion, nil 539 } 540 log.Warn("get region by key return nil, will retry", zap.Reflect("region", region), 541 zap.Int("retry", retry)) 542 select { 543 case <-ctx.Done(): 544 return nil, ctx.Err() 545 case <-time.After(time.Second): 546 } 547 } 548 } 549 550 var newRegion *RegionInfo 551 var err error 552 switch errPb := resp.GetError(); { 553 case errPb.NotLeader != nil: 554 if newLeader := errPb.GetNotLeader().GetLeader(); newLeader != nil { 555 newRegion = &RegionInfo{ 556 Leader: newLeader, 557 Region: region.Region, 558 } 559 } else { 560 newRegion, err = getRegion() 561 if err != nil { 562 return retryNone, nil, errors.Trace(err) 563 } 564 } 565 return retryIngest, newRegion, errors.Annotatef(berrors.ErrKVNotLeader, "not leader: %s", errPb.GetMessage()) 566 case errPb.EpochNotMatch != nil: 567 if currentRegions := errPb.GetEpochNotMatch().GetCurrentRegions(); currentRegions != nil { 568 var currentRegion *metapb.Region 569 for _, r := range currentRegions { 570 if insideRegion(r, meta) { 571 currentRegion = r 572 break 573 } 574 } 575 if currentRegion != nil { 576 var newLeader *metapb.Peer 577 for _, p := range currentRegion.Peers { 578 if p.GetStoreId() == region.Leader.GetStoreId() { 579 newLeader = p 580 break 581 } 582 } 583 if newLeader != nil { 584 newRegion = &RegionInfo{ 585 Leader: newLeader, 586 Region: currentRegion, 587 } 588 } 589 } 590 } 591 retryTy := retryNone 592 if newRegion != nil { 593 retryTy = retryWrite 594 } 595 return retryTy, newRegion, errors.Annotatef(berrors.ErrKVEpochNotMatch, "epoch not match: %s", errPb.GetMessage()) 596 case strings.Contains(errPb.Message, "raft: proposal dropped"): 597 // TODO: we should change 'Raft raft: proposal dropped' to a error type like 'NotLeader' 598 newRegion, err = getRegion() 599 if err != nil { 600 return retryNone, nil, errors.Trace(err) 601 } 602 return retryIngest, newRegion, errors.Annotate(berrors.ErrKVUnknown, errPb.GetMessage()) 603 } 604 return retryNone, nil, errors.Annotatef(berrors.ErrKVUnknown, "non-retryable error: %s", resp.GetError().GetMessage()) 605 }