github.com/m3db/m3@v1.5.1-0.20231129193456-75a402aa583b/src/dbnode/storage/repair.go (about) 1 // Copyright (c) 2016 Uber Technologies, Inc. 2 // 3 // Permission is hereby granted, free of charge, to any person obtaining a copy 4 // of this software and associated documentation files (the "Software"), to deal 5 // in the Software without restriction, including without limitation the rights 6 // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 // copies of the Software, and to permit persons to whom the Software is 8 // furnished to do so, subject to the following conditions: 9 // 10 // The above copyright notice and this permission notice shall be included in 11 // all copies or substantial portions of the Software. 12 // 13 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 // THE SOFTWARE. 
package storage

import (
	"bytes"
	"errors"
	"fmt"
	"math"
	"strconv"
	"sync"
	"sync/atomic"
	"time"

	"github.com/m3db/m3/src/dbnode/client"
	"github.com/m3db/m3/src/dbnode/namespace"
	"github.com/m3db/m3/src/dbnode/retention"
	"github.com/m3db/m3/src/dbnode/storage/block"
	"github.com/m3db/m3/src/dbnode/storage/bootstrap/result"
	"github.com/m3db/m3/src/dbnode/storage/repair"
	"github.com/m3db/m3/src/dbnode/topology"
	"github.com/m3db/m3/src/dbnode/x/xio"
	"github.com/m3db/m3/src/x/clock"
	"github.com/m3db/m3/src/x/context"
	xerrors "github.com/m3db/m3/src/x/errors"
	"github.com/m3db/m3/src/x/ident"
	"github.com/m3db/m3/src/x/instrument"
	xtime "github.com/m3db/m3/src/x/time"

	"github.com/jhump/protoreflect/dynamic"
	"github.com/uber-go/tally"
	"go.uber.org/zap"
)

var (
	// errNoRepairOptions is returned by newDatabaseRepairer when the database
	// options do not carry any repair options.
	errNoRepairOptions = errors.New("no repair options")
	// errRepairInProgress is returned by dbRepairer.Repair when another repair
	// run is already marked as running.
	errRepairInProgress = errors.New("repair already in progress")
)

// recordFn is the callback used to report the outcome of a metadata comparison
// for a single namespace/shard. shardRepairer stores one in its record field,
// which newShardRepairer points at recordDifferences.
type recordFn func(
	origin topology.Host,
	namespace ident.ID,
	shard databaseShard,
	diffRes repair.MetadataComparisonResult,
)

// TODO(rartoul): See if we can find a way to guard against too much metadata.
66 type shardRepairer struct { 67 opts Options 68 rpopts repair.Options 69 clients []client.AdminClient 70 record recordFn 71 nowFn clock.NowFn 72 logger *zap.Logger 73 scope tally.Scope 74 metrics shardRepairerMetrics 75 } 76 77 type shardRepairerMetrics struct { 78 runDefault tally.Counter 79 runOnlyCompare tally.Counter 80 } 81 82 func newShardRepairerMetrics(scope tally.Scope) shardRepairerMetrics { 83 return shardRepairerMetrics{ 84 runDefault: scope.Tagged(map[string]string{ 85 "repair_type": "default", 86 }).Counter("run"), 87 runOnlyCompare: scope.Tagged(map[string]string{ 88 "repair_type": "only_compare", 89 }).Counter("run"), 90 } 91 } 92 93 func newShardRepairer(opts Options, rpopts repair.Options) databaseShardRepairer { 94 iopts := opts.InstrumentOptions() 95 scope := iopts.MetricsScope().SubScope("repair") 96 97 r := shardRepairer{ 98 opts: opts, 99 rpopts: rpopts, 100 clients: rpopts.AdminClients(), 101 nowFn: opts.ClockOptions().NowFn(), 102 logger: iopts.Logger(), 103 scope: scope, 104 metrics: newShardRepairerMetrics(scope), 105 } 106 r.record = r.recordDifferences 107 108 return r 109 } 110 111 func (r shardRepairer) Options() repair.Options { 112 return r.rpopts 113 } 114 115 func (r shardRepairer) Repair( 116 ctx context.Context, 117 nsCtx namespace.Context, 118 nsMeta namespace.Metadata, 119 tr xtime.Range, 120 shard databaseShard, 121 ) (repair.MetadataComparisonResult, error) { 122 repairType := r.rpopts.Type() 123 switch repairType { 124 case repair.DefaultRepair: 125 defer r.metrics.runDefault.Inc(1) 126 case repair.OnlyCompareRepair: 127 defer r.metrics.runOnlyCompare.Inc(1) 128 default: 129 // Unknown repair type. 
130 err := fmt.Errorf("unknown repair type: %v", repairType) 131 return repair.MetadataComparisonResult{}, err 132 } 133 134 var sessions []sessionAndTopo 135 for _, c := range r.clients { 136 session, err := c.DefaultAdminSession() 137 if err != nil { 138 fmtErr := fmt.Errorf("error obtaining default admin session: %v", err) 139 return repair.MetadataComparisonResult{}, fmtErr 140 } 141 142 topo, err := session.TopologyMap() 143 if err != nil { 144 fmtErr := fmt.Errorf("error obtaining topology map: %v", err) 145 return repair.MetadataComparisonResult{}, fmtErr 146 } 147 148 sessions = append(sessions, sessionAndTopo{ 149 session: session, 150 topo: topo, 151 }) 152 } 153 154 var ( 155 start = tr.Start 156 end = tr.End 157 // Guaranteed to have at least one session and all should have an identical 158 // origin (both assumptions guaranteed by options validation). 159 origin = sessions[0].session.Origin() 160 ) 161 162 metadata := repair.NewReplicaMetadataComparer(origin, r.rpopts) 163 ctx.RegisterFinalizer(metadata) 164 165 // Add local metadata. 166 opts := block.FetchBlocksMetadataOptions{ 167 IncludeSizes: true, 168 IncludeChecksums: true, 169 } 170 var ( 171 accumLocalMetadata = block.NewFetchBlocksMetadataResults() 172 pageToken PageToken 173 err error 174 ) 175 // Safe to register since by the time this function completes we won't be using the metadata 176 // for anything anymore. 177 ctx.RegisterCloser(accumLocalMetadata) 178 179 for { 180 // It's possible for FetchBlocksMetadataV2 to not return all the metadata at once even if 181 // math.MaxInt64 is passed as the limit due to its implementation and the different phases 182 // of the page token. As a result, the only way to ensure that all the metadata has been 183 // fetched is to continue looping until a nil pageToken is returned. 
184 var currLocalMetadata block.FetchBlocksMetadataResults 185 currLocalMetadata, pageToken, err = shard.FetchBlocksMetadataV2(ctx, start, end, math.MaxInt64, pageToken, opts) 186 if err != nil { 187 return repair.MetadataComparisonResult{}, err 188 } 189 190 // Merge. 191 if currLocalMetadata != nil { 192 for _, result := range currLocalMetadata.Results() { 193 accumLocalMetadata.Add(result) 194 } 195 } 196 197 if pageToken == nil { 198 break 199 } 200 } 201 202 if r.rpopts.DebugShadowComparisonsEnabled() { 203 for _, sesTopo := range sessions { 204 // Shadow comparison is mostly a debug feature that can be used to test new builds and diagnose 205 // issues with the repair feature. It should not be enabled for production use-cases. 206 err := r.shadowCompare(start, end, accumLocalMetadata, sesTopo.session, shard, nsCtx) 207 if err != nil { 208 r.logger.Error( 209 "Shadow compare failed", 210 zap.Error(err)) 211 } 212 } 213 } 214 215 localIter := block.NewFilteredBlocksMetadataIter(accumLocalMetadata) 216 err = metadata.AddLocalMetadata(localIter) 217 if err != nil { 218 return repair.MetadataComparisonResult{}, err 219 } 220 221 var ( 222 rsOpts = r.opts.RepairOptions().ResultOptions() 223 level = r.rpopts.RepairConsistencyLevel() 224 ) 225 for _, sesTopo := range sessions { 226 // Add peer metadata. 227 peerIter, err := sesTopo.session.FetchBlocksMetadataFromPeers(nsCtx.ID, shard.ID(), start, end, 228 level, rsOpts) 229 if err != nil { 230 return repair.MetadataComparisonResult{}, err 231 } 232 if err := metadata.AddPeerMetadata(peerIter); err != nil { 233 return repair.MetadataComparisonResult{}, err 234 } 235 } 236 237 var ( 238 // TODO(rartoul): Pool these slices. 
239 metadatasToFetchBlocksForPerSession = make([][]block.ReplicaMetadata, len(sessions)) 240 metadataRes = metadata.Compare() 241 seriesWithChecksumMismatches = metadataRes.ChecksumDifferences.Series() 242 ) 243 244 // Shard repair can fail due to transient network errors due to the significant amount of data fetched from peers. 245 // So collect and emit metadata comparison metrics before fetching blocks from peer to repair. 246 r.record(origin, nsCtx.ID, shard, metadataRes) 247 if repairType == repair.OnlyCompareRepair { 248 // Early return if repair type doesn't require executing repairing the data step. 249 return metadataRes, nil 250 } 251 252 originID := origin.ID() 253 for _, e := range seriesWithChecksumMismatches.Iter() { 254 for blockStart, replicaMetadataBlocks := range e.Value().Metadata.Blocks() { 255 blStartRange := xtime.Range{Start: blockStart, End: blockStart} 256 if !tr.Contains(blStartRange) { 257 instrument.EmitAndLogInvariantViolation(r.opts.InstrumentOptions(), func(l *zap.Logger) { 258 l.With( 259 zap.Time("blockStart", blockStart.ToTime()), 260 zap.String("namespace", nsMeta.ID().String()), 261 zap.Uint32("shard", shard.ID()), 262 ).Error("repair received replica metadata for unrequested blockStart") 263 }) 264 continue 265 } 266 267 for _, replicaMetadata := range replicaMetadataBlocks.Metadata() { 268 metadataHostID := replicaMetadata.Host.ID() 269 if metadataHostID == originID { 270 // Don't request blocks for self metadata. 271 continue 272 } 273 274 if len(sessions) == 1 { 275 // Optimized path for single session case. 276 metadatasToFetchBlocksForPerSession[0] = append(metadatasToFetchBlocksForPerSession[0], replicaMetadata) 277 continue 278 } 279 280 // If there is more than one session then we need to match up all of the metadata to the 281 // session it belongs to so that we can fetch the corresponding blocks of data. 
282 foundSessionForMetadata := false 283 for i, sesTopo := range sessions { 284 _, ok := sesTopo.topo.LookupHostShardSet(metadataHostID) 285 if !ok { 286 // The host this metadata came from is not part of the cluster this session is connected to. 287 continue 288 } 289 metadatasToFetchBlocksForPerSession[i] = append(metadatasToFetchBlocksForPerSession[i], replicaMetadata) 290 foundSessionForMetadata = true 291 break 292 } 293 294 if !foundSessionForMetadata { 295 // Could happen during topology changes (I.E node is kicked out of the cluster in-between 296 // fetching its metadata and this step). 297 r.logger.Debug( 298 "could not identify which session mismatched metadata belong to", 299 zap.String("hostID", metadataHostID), 300 zap.Time("blockStart", blockStart.ToTime()), 301 ) 302 } 303 } 304 } 305 } 306 307 // TODO(rartoul): Copying the IDs for the purposes of the map key is wasteful. Considering using 308 // SetUnsafe or marking as NoFinalize() and making the map check IsNoFinalize(). 309 results := result.NewShardResult(rsOpts) 310 for i, metadatasToFetchBlocksFor := range metadatasToFetchBlocksForPerSession { 311 if len(metadatasToFetchBlocksFor) == 0 { 312 continue 313 } 314 315 session := sessions[i].session 316 perSeriesReplicaIter, err := session.FetchBlocksFromPeers(nsMeta, shard.ID(), level, metadatasToFetchBlocksFor, rsOpts) 317 if err != nil { 318 return repair.MetadataComparisonResult{}, err 319 } 320 321 for perSeriesReplicaIter.Next() { 322 _, id, tags, block := perSeriesReplicaIter.Current() 323 if existing, ok := results.BlockAt(id, block.StartTime()); ok { 324 // Merge contents with existing block. 325 if err := existing.Merge(block); err != nil { 326 return repair.MetadataComparisonResult{}, err 327 } 328 continue 329 } 330 331 // Add block for first time to results. 
332 results.AddBlock(id, tags, block) 333 } 334 } 335 336 if err := r.loadDataIntoShard(shard, results); err != nil { 337 return repair.MetadataComparisonResult{}, err 338 } 339 340 return metadataRes, nil 341 } 342 343 // TODO(rartoul): Currently throttling via the MemoryTracker can only occur at the level of an entire 344 // block for a given namespace/shard/blockStart. For almost all practical use-cases this is fine, but 345 // this could be improved and made more granular by breaking data that is being loaded into the shard 346 // into smaller batches (less than one complete block). This would improve the granularity of throttling 347 // for clusters where the number of shards is low. 348 func (r shardRepairer) loadDataIntoShard(shard databaseShard, data result.ShardResult) error { 349 var ( 350 waitingGauge = r.scope.Gauge("waiting-for-limit") 351 waitedCounter = r.scope.Counter("waited-for-limit") 352 doneCh = make(chan struct{}) 353 waiting bool 354 waitingLock sync.Mutex 355 ) 356 defer close(doneCh) 357 358 // Emit a gauge constantly that indicates whether or not the repair process is blocked waiting. 359 go func() { 360 for { 361 select { 362 case <-doneCh: 363 waitingGauge.Update(0) 364 return 365 default: 366 waitingLock.Lock() 367 currWaiting := waiting 368 waitingLock.Unlock() 369 if currWaiting { 370 waitingGauge.Update(1) 371 } else { 372 waitingGauge.Update(0) 373 } 374 time.Sleep(5 * time.Second) 375 } 376 } 377 }() 378 379 for { 380 err := shard.LoadBlocks(data.AllSeries()) 381 if err == ErrDatabaseLoadLimitHit { 382 waitedCounter.Inc(1) 383 waitingLock.Lock() 384 waiting = true 385 waitingLock.Unlock() 386 // Wait for some of the outstanding data to be flushed before trying again. 
387 r.logger.Info("repair throttled due to memory load limits, waiting for data to be flushed before continuing") 388 r.opts.MemoryTracker().WaitForDec() 389 continue 390 } 391 if err != nil { 392 return err 393 } 394 return nil 395 } 396 } 397 398 func (r shardRepairer) recordDifferences( 399 origin topology.Host, 400 namespace ident.ID, 401 shard databaseShard, 402 diffRes repair.MetadataComparisonResult, 403 ) { 404 var ( 405 shardScope = r.scope.Tagged(map[string]string{ 406 "namespace": namespace.String(), 407 "shard": strconv.Itoa(int(shard.ID())), 408 }) 409 totalScope = shardScope.Tagged(map[string]string{"resultType": "total"}) 410 sizeDiffScope = shardScope.Tagged(map[string]string{"resultType": "sizeDiff"}) 411 checksumDiffScope = shardScope.Tagged(map[string]string{"resultType": "checksumDiff"}) 412 ) 413 414 // Record total number of series and total number of blocks. 415 totalScope.Counter("series").Inc(diffRes.NumSeries) 416 totalScope.Counter("blocks").Inc(diffRes.NumBlocks) 417 418 // Record size differences. 419 sizeDiffScope.Counter("series").Inc(diffRes.SizeDifferences.NumSeries()) 420 sizeDiffScope.Counter("blocks").Inc(diffRes.SizeDifferences.NumBlocks()) 421 422 absoluteBlockSizeDiff, blockSizeDiffAsPercentage := r.computeMaximumBlockSizeDifference(origin, diffRes) 423 sizeDiffScope.Gauge("max-block-size-diff").Update(float64(absoluteBlockSizeDiff)) 424 sizeDiffScope.Gauge("max-block-size-diff-as-percentage").Update(blockSizeDiffAsPercentage) 425 426 // Record checksum differences. 427 checksumDiffScope.Counter("series").Inc(diffRes.ChecksumDifferences.NumSeries()) 428 checksumDiffScope.Counter("blocks").Inc(diffRes.ChecksumDifferences.NumBlocks()) 429 } 430 431 // computeMaximumBlockSizeDifferenceAsPercentage returns a metric which represents maximum divergence of a shard with 432 // any of its peers. 
A positive divergence means that origin shard has more data than its peer and a negative 433 // divergence means that origin shard has lesser data than its peer. Since sizes for all the blocks in rentention 434 // window are not readily available, exact divergence of a shard from its peer cannot be calculated. So this method 435 // settles for returning maximum divergence of a block/shard with any of its peers. Divergence(as percentage) of shard 436 // is upper bounded by divergence of block/shard so this metric can be used to monitor severity of divergence. 437 func (r shardRepairer) computeMaximumBlockSizeDifference( 438 origin topology.Host, 439 diffRes repair.MetadataComparisonResult, 440 ) (int64, float64) { 441 var ( 442 maxBlockSizeDiffAsRatio float64 443 maxBlockSizeDiff int64 444 ) 445 // Iterate over all the series which differ in size between origin and a peer. 446 for _, entry := range diffRes.SizeDifferences.Series().Iter() { 447 series := entry.Value() 448 replicaBlocksMetadata := diffRes.SizeDifferences.GetOrAdd(series.ID) 449 // Iterate over all the time ranges which had a mismatched series between origin and a peer. 450 for _, replicasMetadata := range replicaBlocksMetadata.Blocks() { 451 var ( 452 // Setting minimum origin block size to 1 so that percetages off of origin block size can be calculated 453 // without worrying about divide by zero errors. Exact percentages are not required so setting a 454 // non-zero size for an empty block is acceptable. 455 originBlockSize int64 = 1 456 // Represents maximum size difference of a block with one of its peers. 457 maxPeerBlockSizeDiff int64 458 ) 459 // Record the block size on the origin. 460 for _, replicaMetadata := range replicasMetadata.Metadata() { 461 if replicaMetadata.Host.ID() == origin.ID() && replicaMetadata.Size > 0 { 462 originBlockSize = replicaMetadata.Size 463 break 464 } 465 } 466 // Fetch the maximum block size difference of origin with any of its peers. 
467 for _, replicaMetadata := range replicasMetadata.Metadata() { 468 if replicaMetadata.Host.ID() != origin.ID() { 469 blockSizeDiff := originBlockSize - replicaMetadata.Size 470 if math.Abs(float64(blockSizeDiff)) > math.Abs(float64(maxPeerBlockSizeDiff)) { 471 maxPeerBlockSizeDiff = blockSizeDiff 472 } 473 } 474 } 475 // Record divergence as percentage for origin block which has diverged the most from its peers. 476 if math.Abs(float64(maxPeerBlockSizeDiff)) > math.Abs(float64(maxBlockSizeDiff)) { 477 maxBlockSizeDiff = maxPeerBlockSizeDiff 478 maxBlockSizeDiffAsRatio = float64(maxPeerBlockSizeDiff) / float64(originBlockSize) 479 } 480 } 481 } 482 return maxBlockSizeDiff, maxBlockSizeDiffAsRatio * 100 483 } 484 485 type repairFn func() error 486 487 type sleepFn func(d time.Duration) 488 489 type repairStatus int 490 491 const ( 492 repairNotStarted repairStatus = iota 493 repairSuccess 494 repairFailed 495 ) 496 497 type repairState struct { 498 LastAttempt xtime.UnixNano 499 Status repairStatus 500 } 501 502 type namespaceRepairStateByTime map[xtime.UnixNano]repairState 503 504 // NB(r): This uses a map[string]element instead of a generated map for 505 // native ident.ID keys, this was because the call frequency is very low 506 // it's not in the hot path so casting ident.ID to string isn't too expensive 507 // and this data structure may very well change soon with a refactor of the 508 // background repair in the works. 
509 type repairStatesByNs map[string]namespaceRepairStateByTime 510 511 func newRepairStates() repairStatesByNs { 512 return make(repairStatesByNs) 513 } 514 515 func (r repairStatesByNs) repairStates( 516 namespace ident.ID, 517 t xtime.UnixNano, 518 ) (repairState, bool) { 519 var rs repairState 520 521 nsRepairState, ok := r[namespace.String()] 522 if !ok { 523 return rs, false 524 } 525 526 rs, ok = nsRepairState[t] 527 return rs, ok 528 } 529 530 func (r repairStatesByNs) setRepairState( 531 namespace ident.ID, 532 t xtime.UnixNano, 533 state repairState, 534 ) { 535 nsRepairState, ok := r[namespace.String()] 536 if !ok { 537 nsRepairState = make(namespaceRepairStateByTime) 538 r[namespace.String()] = nsRepairState 539 } 540 nsRepairState[t] = state 541 } 542 543 // NB(prateek): dbRepairer.Repair(...) guarantees atomicity of execution, so all other 544 // state does not need to be thread safe. One exception - `dbRepairer.closed` is used 545 // for early termination if `dbRepairer.Stop()` is called during a repair, so we guard 546 // it with a mutex. 
547 type dbRepairer struct { 548 database database 549 opts Options 550 ropts repair.Options 551 shardRepairer databaseShardRepairer 552 repairStatesByNs repairStatesByNs 553 554 repairFn repairFn 555 sleepFn sleepFn 556 nowFn clock.NowFn 557 logger *zap.Logger 558 repairCheckInterval time.Duration 559 scope tally.Scope 560 status tally.Gauge 561 562 closedLock sync.Mutex 563 running int32 564 closed bool 565 } 566 567 func newDatabaseRepairer(database database, opts Options) (databaseRepairer, error) { 568 var ( 569 nowFn = opts.ClockOptions().NowFn() 570 scope = opts.InstrumentOptions().MetricsScope().SubScope("repair") 571 ropts = opts.RepairOptions() 572 ) 573 if ropts == nil { 574 return nil, errNoRepairOptions 575 } 576 if err := ropts.Validate(); err != nil { 577 return nil, err 578 } 579 580 shardRepairer := newShardRepairer(opts, ropts) 581 582 r := &dbRepairer{ 583 database: database, 584 opts: opts, 585 ropts: ropts, 586 shardRepairer: shardRepairer, 587 repairStatesByNs: newRepairStates(), 588 sleepFn: time.Sleep, 589 nowFn: nowFn, 590 logger: opts.InstrumentOptions().Logger(), 591 repairCheckInterval: ropts.RepairCheckInterval(), 592 scope: scope, 593 status: scope.Gauge("repair"), 594 } 595 r.repairFn = r.Repair 596 597 return r, nil 598 } 599 600 func (r *dbRepairer) run() { 601 for { 602 r.closedLock.Lock() 603 closed := r.closed 604 r.closedLock.Unlock() 605 606 if closed { 607 break 608 } 609 610 r.sleepFn(r.repairCheckInterval) 611 612 if err := r.repairFn(); err != nil { 613 r.logger.Error("error repairing database", zap.Error(err)) 614 } 615 } 616 } 617 618 func (r *dbRepairer) namespaceRepairTimeRange(ns databaseNamespace) xtime.Range { 619 var ( 620 now = xtime.ToUnixNano(r.nowFn()) 621 rtopts = ns.Options().RetentionOptions() 622 ) 623 return xtime.Range{ 624 Start: retention.FlushTimeStart(rtopts, now), 625 End: retention.FlushTimeEnd(rtopts, now)} 626 } 627 628 func (r *dbRepairer) Start() { 629 go r.run() 630 } 631 632 func (r 
*dbRepairer) Stop() { 633 r.closedLock.Lock() 634 r.closed = true 635 r.closedLock.Unlock() 636 } 637 638 // Repair will analyze the current repair state for each namespace/blockStart combination and pick one blockStart 639 // per namespace to repair. It will prioritize blocks that have never been repaired over those that have been 640 // repaired before, and it will prioritize more recent blocks over older ones. If all blocks have been repaired 641 // before then it will prioritize the least recently repaired block. 642 // 643 // The Repair function only attempts to repair one block at a time because this allows the background repair process 644 // to run its prioritization logic more frequently. For example, if we attempted to repair all blocks in one pass, 645 // even with appropriate backpressure, this could lead to situations where recent blocks are not repaired for a 646 // substantial amount of time whereas with the current approach the longest delay between running the prioritization 647 // logic is the amount of time it takes to repair one block for all shards. 648 // 649 // Long term we will want to move to a model that actually tracks state for individual shard/blockStart combinations, 650 // not just blockStarts. 
651 func (r *dbRepairer) Repair() error { 652 // Don't attempt a repair if the database is not bootstrapped yet 653 if !r.database.IsBootstrapped() { 654 return nil 655 } 656 657 if !atomic.CompareAndSwapInt32(&r.running, 0, 1) { 658 return errRepairInProgress 659 } 660 661 defer func() { 662 atomic.StoreInt32(&r.running, 0) 663 }() 664 665 multiErr := xerrors.NewMultiError() 666 namespaces, err := r.database.OwnedNamespaces() 667 if err != nil { 668 return err 669 } 670 671 var ( 672 strategy = r.ropts.Strategy() 673 repairBlockStartShortCircuitRepair bool 674 ) 675 switch strategy { 676 case repair.DefaultStrategy: 677 repairBlockStartShortCircuitRepair = true 678 case repair.FullSweepStrategy: 679 repairBlockStartShortCircuitRepair = false 680 default: 681 // Unrecognized strategy. 682 return fmt.Errorf("unknown repair strategy: %v", strategy) 683 } 684 685 for _, n := range namespaces { 686 repairRange := r.namespaceRepairTimeRange(n) 687 blockSize := n.Options().RetentionOptions().BlockSize() 688 689 // Iterating backwards will be exclusive on the start, but we want to be inclusive on the 690 // start so subtract a blocksize. 691 repairRange.Start = repairRange.Start.Add(-blockSize) 692 693 var ( 694 numUnrepairedBlocks = 0 695 hasRepairedABlockStart = false 696 leastRecentlyRepairedBlockStart xtime.UnixNano 697 leastRecentlyRepairedBlockStartLastRepairTime xtime.UnixNano 698 namespaceScope = r.scope.Tagged(map[string]string{ 699 "namespace": n.ID().String(), 700 }) 701 ) 702 repairRange.IterateBackward(blockSize, func(blockStart xtime.UnixNano) bool { 703 // Update metrics around progress of repair. 704 blockStartUnixSeconds := blockStart.ToTime().Unix() 705 namespaceScope.Gauge("timestamp-current-block-repair").Update(float64(blockStartUnixSeconds)) 706 707 // Update state for later reporting of least recently repaired block. 
708 repairState, ok := r.repairStatesByNs.repairStates(n.ID(), blockStart) 709 if ok && (leastRecentlyRepairedBlockStart.IsZero() || 710 repairState.LastAttempt.Before(leastRecentlyRepairedBlockStartLastRepairTime)) { 711 leastRecentlyRepairedBlockStart = blockStart 712 leastRecentlyRepairedBlockStartLastRepairTime = repairState.LastAttempt 713 } 714 715 if ok && repairState.Status == repairSuccess { 716 return true 717 } 718 719 // Failed or unrepair block from this point onwards. 720 numUnrepairedBlocks++ 721 if hasRepairedABlockStart && repairBlockStartShortCircuitRepair { 722 // Only want to repair one namespace/blockStart per call to Repair() 723 // so once we've repaired a single blockStart we don't perform any 724 // more actual repairs although we do keep iterating so that we can 725 // emit an accurate value for the "num-unrepaired-blocks" gauge. 726 return true 727 } 728 729 if err := r.repairNamespaceBlockstart(n, blockStart); err != nil { 730 multiErr = multiErr.Add(err) 731 } else { 732 hasRepairedABlockStart = true 733 } 734 735 return true 736 }) 737 738 // Update metrics with statistics about repair status. 739 namespaceScope.Gauge("num-unrepaired-blocks").Update(float64(numUnrepairedBlocks)) 740 741 secondsSinceLastRepair := xtime.ToUnixNano(r.nowFn()). 742 Sub(leastRecentlyRepairedBlockStartLastRepairTime).Seconds() 743 namespaceScope.Gauge("max-seconds-since-last-block-repair").Update(secondsSinceLastRepair) 744 745 if hasRepairedABlockStart { 746 // Previous loop performed a repair which means we've hit our limit of repairing 747 // one block per namespace per call to Repair() so we can skip the logic below. 748 continue 749 } 750 751 // If we've made it this far that means that there were no unrepaired blocks which means we should 752 // repair the least recently repaired block instead. 
753 if leastRecentlyRepairedBlockStart.IsZero() { 754 continue 755 } 756 if err := r.repairNamespaceBlockstart(n, leastRecentlyRepairedBlockStart); err != nil { 757 multiErr = multiErr.Add(err) 758 } 759 } 760 761 return multiErr.FinalError() 762 } 763 764 func (r *dbRepairer) Report() { 765 if atomic.LoadInt32(&r.running) == 1 { 766 r.status.Update(1) 767 } else { 768 r.status.Update(0) 769 } 770 } 771 772 func (r *dbRepairer) repairNamespaceBlockstart(n databaseNamespace, blockStart xtime.UnixNano) error { 773 var ( 774 blockSize = n.Options().RetentionOptions().BlockSize() 775 repairRange = xtime.Range{Start: blockStart, End: blockStart.Add(blockSize)} 776 repairTime = xtime.ToUnixNano(r.nowFn()) 777 ) 778 if err := r.repairNamespaceWithTimeRange(n, repairRange); err != nil { 779 r.markRepairAttempt(n.ID(), blockStart, repairTime, repairFailed) 780 return err 781 } 782 783 r.markRepairAttempt(n.ID(), blockStart, repairTime, repairSuccess) 784 return nil 785 } 786 787 func (r *dbRepairer) repairNamespaceWithTimeRange(n databaseNamespace, tr xtime.Range) error { 788 if err := n.Repair(r.shardRepairer, tr, NamespaceRepairOptions{ 789 Force: r.ropts.Force(), 790 }); err != nil { 791 return fmt.Errorf("namespace %s failed to repair time range %v: %v", n.ID().String(), tr, err) 792 } 793 return nil 794 } 795 796 func (r *dbRepairer) markRepairAttempt( 797 namespace ident.ID, 798 blockStart xtime.UnixNano, 799 repairTime xtime.UnixNano, 800 repairStatus repairStatus) { 801 repairState, _ := r.repairStatesByNs.repairStates(namespace, blockStart) 802 repairState.Status = repairStatus 803 repairState.LastAttempt = repairTime 804 r.repairStatesByNs.setRepairState(namespace, blockStart, repairState) 805 } 806 807 var noOpRepairer databaseRepairer = repairerNoOp{} 808 809 type repairerNoOp struct{} 810 811 func newNoopDatabaseRepairer() databaseRepairer { return noOpRepairer } 812 813 func (r repairerNoOp) Start() {} 814 func (r repairerNoOp) Stop() {} 815 func (r 
repairerNoOp) Repair() error { return nil } 816 func (r repairerNoOp) Report() {} 817 818 func (r shardRepairer) shadowCompare( 819 start xtime.UnixNano, 820 end xtime.UnixNano, 821 localMetadataBlocks block.FetchBlocksMetadataResults, 822 session client.AdminSession, 823 shard databaseShard, 824 nsCtx namespace.Context, 825 ) error { 826 dice, err := newDice(r.rpopts.DebugShadowComparisonsPercentage()) 827 if err != nil { 828 return fmt.Errorf("err creating shadow comparison dice: %v", err) 829 } 830 831 var localM, peerM *dynamic.Message 832 if nsCtx.Schema != nil { 833 // Only required if a schema (proto feature) is present. Reset between uses. 834 localM = dynamic.NewMessage(nsCtx.Schema.Get().MessageDescriptor) 835 peerM = dynamic.NewMessage(nsCtx.Schema.Get().MessageDescriptor) 836 } 837 838 readCtx := r.opts.ContextPool().Get() 839 compareResultFunc := func(result block.FetchBlocksMetadataResult) error { 840 seriesID := result.ID 841 peerSeriesIter, err := session.Fetch(nsCtx.ID, seriesID, start, end) 842 if err != nil { 843 return err 844 } 845 defer peerSeriesIter.Close() 846 847 readCtx.Reset() 848 defer readCtx.BlockingCloseReset() 849 850 iter, err := shard.ReadEncoded(readCtx, seriesID, start, end, nsCtx) 851 if err != nil { 852 return err 853 } 854 unfilteredLocalSeriesDataBlocks, err := iter.ToSlices(readCtx) 855 if err != nil { 856 return err 857 } 858 localSeriesDataBlocks, err := xio.FilterEmptyBlockReadersSliceOfSlicesInPlace(unfilteredLocalSeriesDataBlocks) 859 if err != nil { 860 return err 861 } 862 863 localSeriesSliceOfSlices := xio.NewReaderSliceOfSlicesFromBlockReadersIterator(localSeriesDataBlocks) 864 localSeriesIter := r.opts.MultiReaderIteratorPool().Get() 865 localSeriesIter.ResetSliceOfSlices(localSeriesSliceOfSlices, nsCtx.Schema) 866 867 var ( 868 i = 0 869 foundMismatch = false 870 ) 871 for localSeriesIter.Next() { 872 if !peerSeriesIter.Next() { 873 r.logger.Error( 874 "series had next locally, but not from peers", 875 
zap.String("namespace", nsCtx.ID.String()), 876 zap.Time("start", start.ToTime()), 877 zap.Time("end", end.ToTime()), 878 zap.String("series", seriesID.String()), 879 zap.Error(peerSeriesIter.Err()), 880 ) 881 foundMismatch = true 882 break 883 } 884 885 localDP, localUnit, localAnnotation := localSeriesIter.Current() 886 peerDP, peerUnit, peerAnnotation := peerSeriesIter.Current() 887 888 if !localDP.Equal(peerDP) { 889 r.logger.Error( 890 "datapoints did not match", 891 zap.Int("index", i), 892 zap.Any("local", localDP), 893 zap.Any("peer", peerDP), 894 ) 895 foundMismatch = true 896 break 897 } 898 899 if localUnit != peerUnit { 900 r.logger.Error( 901 "units did not match", 902 zap.Int("index", i), 903 zap.Int("local", int(localUnit)), 904 zap.Int("peer", int(peerUnit)), 905 ) 906 foundMismatch = true 907 break 908 } 909 910 if nsCtx.Schema == nil { 911 // Remaining shadow logic is proto-specific. 912 continue 913 } 914 915 err = localM.Unmarshal(localAnnotation) 916 if err != nil { 917 r.logger.Error( 918 "Unable to unmarshal local annotation", 919 zap.Int("index", i), 920 zap.Error(err), 921 ) 922 foundMismatch = true 923 break 924 } 925 926 err = peerM.Unmarshal(peerAnnotation) 927 if err != nil { 928 r.logger.Error( 929 "Unable to unmarshal peer annotation", 930 zap.Int("index", i), 931 zap.Error(err), 932 ) 933 foundMismatch = true 934 break 935 } 936 937 if !dynamic.Equal(localM, peerM) { 938 r.logger.Error( 939 "Local message does not equal peer message", 940 zap.Int("index", i), 941 zap.String("local", localM.String()), 942 zap.String("peer", peerM.String()), 943 ) 944 foundMismatch = true 945 break 946 } 947 948 if !bytes.Equal(localAnnotation, peerAnnotation) { 949 r.logger.Error( 950 "Local message equals peer message, but annotations do not match", 951 zap.Int("index", i), 952 zap.String("local", string(localAnnotation)), 953 zap.String("peer", string(peerAnnotation)), 954 ) 955 foundMismatch = true 956 break 957 } 958 959 i++ 960 } 961 962 if 
localSeriesIter.Err() != nil { 963 r.logger.Error( 964 "Local series iterator experienced an error", 965 zap.String("namespace", nsCtx.ID.String()), 966 zap.Time("start", start.ToTime()), 967 zap.Time("end", end.ToTime()), 968 zap.String("series", seriesID.String()), 969 zap.Int("numDPs", i), 970 zap.Error(localSeriesIter.Err()), 971 ) 972 } else if foundMismatch { 973 r.logger.Error( 974 "Found mismatch between series", 975 zap.String("namespace", nsCtx.ID.String()), 976 zap.Time("start", start.ToTime()), 977 zap.Time("end", end.ToTime()), 978 zap.String("series", seriesID.String()), 979 zap.Int("numDPs", i), 980 ) 981 } else { 982 r.logger.Debug( 983 "All values for series match", 984 zap.String("namespace", nsCtx.ID.String()), 985 zap.Time("start", start.ToTime()), 986 zap.Time("end", end.ToTime()), 987 zap.String("series", seriesID.String()), 988 zap.Int("numDPs", i), 989 ) 990 } 991 992 return nil 993 } 994 995 for _, result := range localMetadataBlocks.Results() { 996 if !dice.Roll() { 997 continue 998 } 999 1000 if err := compareResultFunc(result); err != nil { 1001 return err 1002 } 1003 } 1004 1005 return nil 1006 } 1007 1008 type sessionAndTopo struct { 1009 session client.AdminSession 1010 topo topology.Map 1011 }