github.com/m3db/m3@v1.5.0/src/dbnode/storage/bootstrap/bootstrapper/fs/source.go

// Copyright (c) 2016 Uber Technologies, Inc.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.

package fs

import (
	"errors"
	"fmt"
	"sync"
	"time"

	"github.com/m3db/m3/src/dbnode/namespace"
	"github.com/m3db/m3/src/dbnode/persist"
	"github.com/m3db/m3/src/dbnode/persist/fs"
	"github.com/m3db/m3/src/dbnode/persist/fs/migration"
	"github.com/m3db/m3/src/dbnode/retention"
	"github.com/m3db/m3/src/dbnode/storage/block"
	"github.com/m3db/m3/src/dbnode/storage/bootstrap"
	"github.com/m3db/m3/src/dbnode/storage/bootstrap/bootstrapper"
	"github.com/m3db/m3/src/dbnode/storage/bootstrap/bootstrapper/fs/migrator"
	"github.com/m3db/m3/src/dbnode/storage/bootstrap/result"
	"github.com/m3db/m3/src/dbnode/storage/index"
	"github.com/m3db/m3/src/dbnode/storage/index/compaction"
	"github.com/m3db/m3/src/dbnode/storage/index/convert"
	"github.com/m3db/m3/src/dbnode/storage/series"
	"github.com/m3db/m3/src/dbnode/ts"
	"github.com/m3db/m3/src/m3ninx/doc"
	"github.com/m3db/m3/src/m3ninx/index/segment/fst"
	idxpersist "github.com/m3db/m3/src/m3ninx/persist"
	"github.com/m3db/m3/src/x/checked"
	"github.com/m3db/m3/src/x/clock"
	"github.com/m3db/m3/src/x/context"
	"github.com/m3db/m3/src/x/ident"
	"github.com/m3db/m3/src/x/instrument"
	"github.com/m3db/m3/src/x/pool"
	xtime "github.com/m3db/m3/src/x/time"

	"github.com/opentracing/opentracing-go"
	opentracinglog "github.com/opentracing/opentracing-go/log"
	"github.com/uber-go/tally"
	"go.uber.org/zap"
	"go.uber.org/zap/zapcore"
)

type runType int

const (
	bootstrapDataRunType runType = iota
	bootstrapIndexRunType
)

type newDataFileSetReaderFn func(
	bytesPool pool.CheckedBytesPool,
	opts fs.Options,
) (fs.DataFileSetReader, error)

type fileSystemSource struct {
	opts              Options
	fsopts            fs.Options
	log               *zap.Logger
	nowFn             clock.NowFn
	idPool            ident.Pool
	newReaderFn       newDataFileSetReaderFn
	newReaderPoolOpts bootstrapper.NewReaderPoolOptions
	metrics           fileSystemSourceMetrics
	instrumentation   *instrumentation
}

type fileSystemSourceMetrics struct {
	persistedIndexBlocksRead           tally.Counter
	persistedIndexBlocksWrite          tally.Counter
	persistedIndexBlocksOutOfRetention tally.Counter
}

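// newFileSystemSource constructs a filesystem bootstrap source, scoping its
// metrics and logging under the "fs-bootstrapper" sub-scope.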
func newFileSystemSource(opts Options) (bootstrap.Source, error) {
	if err := opts.Validate(); err != nil {
		return nil, err
	}

	var (
		scope = opts.InstrumentOptions().MetricsScope().SubScope("fs-bootstrapper")
		iopts = opts.InstrumentOptions().SetMetricsScope(scope)
	)
	opts = opts.SetInstrumentOptions(iopts)

	s := &fileSystemSource{
		opts:        opts,
		fsopts:      opts.FilesystemOptions(),
		log:         iopts.Logger().With(zap.String("bootstrapper", "filesystem")),
		nowFn:       opts.ResultOptions().ClockOptions().NowFn(),
		idPool:      opts.IdentifierPool(),
		newReaderFn: fs.NewReader,
		metrics: fileSystemSourceMetrics{
			persistedIndexBlocksRead:           scope.Counter("persist-index-blocks-read"),
			persistedIndexBlocksWrite:          scope.Counter("persist-index-blocks-write"),
			persistedIndexBlocksOutOfRetention: scope.Counter("persist-index-blocks-out-of-retention"),
		},
		instrumentation: newInstrumentation(opts, scope, iopts),
	}
	s.newReaderPoolOpts.Alloc = s.newReader

	return s, nil
}

func (s *fileSystemSource) AvailableData(
	md namespace.Metadata,
	shardTimeRanges result.ShardTimeRanges,
	cache bootstrap.Cache,
	_ bootstrap.RunOptions,
) (result.ShardTimeRanges, error) {
	return s.availability(md, shardTimeRanges, cache)
}

func (s *fileSystemSource) AvailableIndex(
	md namespace.Metadata,
	shardTimeRanges result.ShardTimeRanges,
	cache bootstrap.Cache,
	_ bootstrap.RunOptions,
) (result.ShardTimeRanges, error) {
	return s.availability(md, shardTimeRanges, cache)
}

func (s *fileSystemSource) Read(
	ctx context.Context,
	namespaces bootstrap.Namespaces,
	cache bootstrap.Cache,
) (bootstrap.NamespaceResults, error) {
	instrCtx := s.instrumentation.fsBootstrapperSourceReadStarted(ctx)
	defer instrCtx.finish()

	results := bootstrap.NamespaceResults{
		Results: bootstrap.NewNamespaceResultsMap(bootstrap.NamespaceResultsMapOptions{}),
	}

	// Perform any necessary migrations but don't block the bootstrap process on
	// failure. Info file in-memory structures are updated in place if migrations
	// have written new files to disk, which saves us from having to re-read
	// migrated info files.
	infoFilesByNamespace := cache.ReadInfoFiles()
	s.runMigrations(ctx, infoFilesByNamespace)

	// NB(r): Perform all data bootstrapping first then index bootstrapping
	// to more clearly delineate which process is slower than the other.
	instrCtx.bootstrapDataStarted()
	for _, elem := range namespaces.Namespaces.Iter() {
		namespace := elem.Value()
		md := namespace.Metadata

		r, err := s.read(bootstrapDataRunType, md, namespace.DataAccumulator,
			namespace.DataRunOptions.ShardTimeRanges,
			namespace.DataRunOptions.RunOptions, instrCtx.span, cache)
		if err != nil {
			return bootstrap.NamespaceResults{}, err
		}

		results.Results.Set(md.ID(), bootstrap.NamespaceResult{
			Metadata:   md,
			Shards:     namespace.Shards,
			DataResult: r.data,
		})
	}
	instrCtx.bootstrapDataCompleted()

	instrCtx.bootstrapIndexStarted()
	for _, elem := range namespaces.Namespaces.Iter() {
		namespace := elem.Value()
		md := namespace.Metadata
		if !md.Options().IndexOptions().Enabled() {
			// Not bootstrapping for index.
			s.log.Info("bootstrapping for namespace disabled by options",
				zap.String("ns", md.ID().String()))
			continue
		}

		r, err := s.read(bootstrapIndexRunType, md, namespace.DataAccumulator,
			namespace.IndexRunOptions.ShardTimeRanges,
			namespace.IndexRunOptions.RunOptions, instrCtx.span, cache)
		if err != nil {
			return bootstrap.NamespaceResults{}, err
		}

		result, ok := results.Results.Get(md.ID())
		if !ok {
			err = fmt.Errorf("missing expected result for namespace: %s",
				md.ID().String())
			return bootstrap.NamespaceResults{}, err
		}

		result.IndexResult = r.index
		results.Results.Set(md.ID(), result)
	}
	instrCtx.bootstrapIndexCompleted()
	return results, nil
}

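// runMigrations performs any enabled filesystem migrations before data is
// read. Migration errors are logged and otherwise ignored so that the
// bootstrap can always proceed.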
func (s *fileSystemSource) runMigrations(ctx context.Context, infoFilesByNamespace bootstrap.InfoFilesByNamespace) {
	// Only one migration for now, so just short-circuit entirely if not enabled.
	if s.opts.MigrationOptions().TargetMigrationVersion() != migration.MigrationVersion_1_1 {
		return
	}

	migrator, err := migrator.NewMigrator(migrator.NewOptions().
		SetMigrationTaskFn(migration.MigrationTask).
		SetInfoFilesByNamespace(infoFilesByNamespace).
		SetMigrationOptions(s.opts.MigrationOptions()).
		SetFilesystemOptions(s.fsopts).
		SetInstrumentOptions(s.opts.InstrumentOptions()).
		SetStorageOptions(s.opts.StorageOptions()))
	if err != nil {
		s.log.Error("error creating migrator. continuing bootstrap", zap.Error(err))
		// Skip running migrations if the migrator could not be constructed.
		return
	}

	// NB(nate): Handling of errors should be re-evaluated as migrations are added. Current migrations
	// do not mutate state in such a way that data can be left in an invalid state in the case of failures. Additionally,
	// we want to ensure that the bootstrap process is always able to continue. If either of these conditions change,
	// error handling at this level AND the individual migration task level should be reconsidered.
	//
	// One final note: as more migrations are introduced and the complexity is increased, we may want to consider adding
	// 1) a recovery mechanism to ensure that repeatable panics don't create a crash loop and
	// 2) state tracking to abort migration attempts after a certain number of consecutive failures.
	// For now, simply setting the target migration to "None" in config is enough to mitigate both of these cases.
	if err = migrator.Run(ctx); err != nil {
		s.log.Error("error performing migrations. continuing bootstrap", zap.Error(err))
	}
}

func (s *fileSystemSource) availability(
	md namespace.Metadata,
	shardTimeRanges result.ShardTimeRanges,
	cache bootstrap.Cache,
) (result.ShardTimeRanges, error) {
	result := result.NewShardTimeRangesFromSize(shardTimeRanges.Len())
	for shard, ranges := range shardTimeRanges.Iter() {
		availabilities, err := s.shardAvailability(md, shard, ranges, cache)
		if err != nil {
			return nil, err
		}
		result.Set(shard, availabilities)
	}
	return result, nil
}

func (s *fileSystemSource) shardAvailability(
	md namespace.Metadata,
	shard uint32,
	targetRangesForShard xtime.Ranges,
	cache bootstrap.Cache,
) (xtime.Ranges, error) {
	if targetRangesForShard.IsEmpty() {
		return xtime.NewRanges(), nil
	}
	readInfoFileResults, err := cache.InfoFilesForShard(md, shard)
	if err != nil {
		return nil, err
	}
	return s.shardAvailabilityWithInfoFiles(md.ID(), shard, targetRangesForShard, readInfoFileResults), nil
}

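// shardAvailabilityWithInfoFiles returns the time ranges for a shard that are
// covered by the given info files and overlap the requested target ranges.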
func (s *fileSystemSource) shardAvailabilityWithInfoFiles(
	namespace ident.ID,
	shard uint32,
	targetRangesForShard xtime.Ranges,
	readInfoFilesResults []fs.ReadInfoFileResult,
) xtime.Ranges {
	tr := xtime.NewRanges()
	for i := 0; i < len(readInfoFilesResults); i++ {
		result := readInfoFilesResults[i]
		if err := result.Err.Error(); err != nil {
			s.log.Error("unable to read info files in shardAvailability",
				zap.Uint32("shard", shard),
				zap.Stringer("namespace", namespace),
				zap.Error(err),
				zap.Any("targetRangesForShard", targetRangesForShard),
				zap.String("filepath", result.Err.Filepath()),
			)
			continue
		}
		info := result.Info
		t := xtime.UnixNano(info.BlockStart)
		w := time.Duration(info.BlockSize)
		currRange := xtime.Range{Start: t, End: t.Add(w)}
		if targetRangesForShard.Overlaps(currRange) {
			tr.AddRange(currRange)
		}
	}
	return tr
}

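// bootstrapFromReaders drains the channel of time window readers and loads
// each window's data (or index metadata) into the shared run result.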
func (s *fileSystemSource) bootstrapFromReaders(
	run runType,
	ns namespace.Metadata,
	accumulator bootstrap.NamespaceDataAccumulator,
	runOpts bootstrap.RunOptions,
	runResult *runResult,
	readerPool *bootstrapper.ReaderPool,
	readersCh <-chan bootstrapper.TimeWindowReaders,
	builder *result.IndexBuilder,
	persistManager *bootstrapper.SharedPersistManager,
	compactor *bootstrapper.SharedCompactor,
) {
	resultOpts := s.opts.ResultOptions()

	for timeWindowReaders := range readersCh {
		// NB(bodu): Since we are re-using the same builder for all bootstrapped index blocks,
		// it is not thread safe and requires reset after every processed index block.
		builder.Builder().Reset()

		s.loadShardReadersDataIntoShardResult(run, ns, accumulator,
			runOpts, runResult, resultOpts, timeWindowReaders, readerPool,
			builder, persistManager, compactor)
	}
}

// markRunResultErrorsAndUnfulfilled checks the list of times that had errors and makes
// sure that we don't return any blocks or bloom filters for them. In addition,
// it looks at any remaining (unfulfilled) ranges and makes sure they're marked
// as unfulfilled.
func (s *fileSystemSource) markRunResultErrorsAndUnfulfilled(
	runResult *runResult,
	requestedRanges result.ShardTimeRanges,
	remainingRanges result.ShardTimeRanges,
	timesWithErrors []time.Time,
) {
	// NB(xichen): this is the exceptional case where we encountered errors due to files
	// being corrupted, which should be fairly rare so we can live with the overhead. We
	// experimented with adding the series to a temporary map and only adding the temporary map
	// to the final result, but adding series to a large map with string keys is expensive, and
	// the current implementation saves the extra overhead of merging the temporary map with the
	// final result.
	if len(timesWithErrors) > 0 {
		timesWithErrorsString := make([]string, len(timesWithErrors))
		for i := range timesWithErrors {
			timesWithErrorsString[i] = timesWithErrors[i].String()
		}
		s.log.Info("encountered errors for range",
			zap.String("requestedRanges", requestedRanges.SummaryString()),
			zap.Strings("timesWithErrors", timesWithErrorsString))
	}

	if !remainingRanges.IsEmpty() {
		runResult.Lock()
		for _, unfulfilled := range []result.ShardTimeRanges{
			runResult.data.Unfulfilled(),
			runResult.index.Unfulfilled(),
		} {
			unfulfilled.AddRanges(remainingRanges)
		}
		runResult.Unlock()
	}
}

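// loadShardReadersDataIntoShardResult reads every series (data run) or series
// metadata entry (index run) from the readers for a single time window and
// records the fulfilled, remaining and errored ranges on the run result.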
func (s *fileSystemSource) loadShardReadersDataIntoShardResult(
	run runType,
	ns namespace.Metadata,
	accumulator bootstrap.NamespaceDataAccumulator,
	runOpts bootstrap.RunOptions,
	runResult *runResult,
	ropts result.Options,
	timeWindowReaders bootstrapper.TimeWindowReaders,
	readerPool *bootstrapper.ReaderPool,
	builder *result.IndexBuilder,
	persistManager *bootstrapper.SharedPersistManager,
	compactor *bootstrapper.SharedCompactor,
) {
	var (
		blockPool            = ropts.DatabaseBlockOptions().DatabaseBlockPool()
		seriesCachePolicy    = ropts.SeriesCachePolicy()
		timesWithErrors      []time.Time
		nsCtx                = namespace.NewContextFrom(ns)
		metadataPool         = s.opts.IndexOptions().MetadataArrayPool()
		batch                = metadataPool.Get()
		totalEntries         int
		totalFulfilledRanges = result.NewShardTimeRanges()
	)
	defer metadataPool.Put(batch)

	requestedRanges := timeWindowReaders.Ranges
	remainingRanges := requestedRanges.Copy()
	shardReaders := timeWindowReaders.Readers
	defer func() {
		// Return readers to pool.
		for _, shardReaders := range shardReaders {
			for _, r := range shardReaders.Readers {
				if err := r.Close(); err == nil {
					readerPool.Put(r)
				}
			}
		}
	}()

	for shard, shardReaders := range shardReaders {
		shard := uint32(shard)
		readers := shardReaders.Readers

		for _, r := range readers {
			var (
				timeRange = r.Range()
				start     = timeRange.Start
				blockSize = ns.Options().RetentionOptions().BlockSize()
				err       error
			)
			switch run {
			case bootstrapDataRunType:
				// Pass, since nothing to do.
			case bootstrapIndexRunType:
				runResult.addIndexBlockIfNotExists(start, ns)
			default:
				// Unreachable unless an internal method calls with a run type cast from an int.
				panic(fmt.Errorf("invalid run type: %d", run))
			}

			numEntries := r.Entries()
			for i := 0; err == nil && i < numEntries; i++ {
				switch run {
				case bootstrapDataRunType:
					err = s.readNextEntryAndRecordBlock(nsCtx, accumulator, shard, r,
						runResult, start, blockSize, blockPool, seriesCachePolicy)
				case bootstrapIndexRunType:
					// We can just read the entry and index if performing an index run.
					batch, err = s.readNextEntryAndMaybeIndex(r, batch, builder)
					if err != nil {
						s.log.Error("readNextEntryAndMaybeIndex failed", zap.Error(err),
							zap.Time("timeRangeStart", timeRange.Start.ToTime()))
					}
					totalEntries++
				default:
					// Unreachable unless an internal method calls with a run type cast from an int.
					panic(fmt.Errorf("invalid run type: %d", run))
				}
			}
			// NB(bodu): Only flush if we've experienced no errors up to this point.
			if err == nil && len(batch) > 0 {
				batch, err = builder.FlushBatch(batch)
				if err != nil {
					s.log.Error("builder FlushBatch failed", zap.Error(err),
						zap.Time("timeRangeStart", timeRange.Start.ToTime()))
				}
			}

			if err == nil {
				// Validate the read results.
				var validateErr error
				switch run {
				case bootstrapDataRunType:
					if seriesCachePolicy == series.CacheAll {
						validateErr = r.Validate()
					} else {
						err = fmt.Errorf("invalid series cache policy: %s", seriesCachePolicy.String())
					}
				case bootstrapIndexRunType:
					validateErr = r.ValidateMetadata()
				default:
					// Unreachable unless an internal method calls with a run type cast from an int.
					panic(fmt.Errorf("invalid run type: %d", run))
				}
				if validateErr != nil {
					err = fmt.Errorf("data validation failed: %v", validateErr)
				}
			}

			if err == nil && run == bootstrapIndexRunType {
				// Mark index block as fulfilled.
				fulfilled := result.NewShardTimeRanges().Set(shard, xtime.NewRanges(timeRange))
				runResult.Lock()
				err = runResult.index.IndexResults().MarkFulfilled(start, fulfilled,
					// NB(bodu): By default, we always load bootstrapped data into the default index volume.
					idxpersist.DefaultIndexVolumeType, ns.Options().IndexOptions())
				runResult.Unlock()
				if err != nil {
					s.log.Error("indexResults MarkFulfilled failed", zap.Error(err),
						zap.Time("timeRangeStart", timeRange.Start.ToTime()))
				}
			}

			if err == nil {
				fulfilled := result.NewShardTimeRanges().Set(shard, xtime.NewRanges(timeRange))
				totalFulfilledRanges.AddRanges(fulfilled)
				remainingRanges.Subtract(fulfilled)
			} else {
				s.log.Error("unknown error", zap.Error(err),
					zap.Time("timeRangeStart", timeRange.Start.ToTime()))
				timesWithErrors = append(timesWithErrors, timeRange.Start.ToTime())
			}
		}
	}

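	// All readers for this time window have been processed; decide whether the
	// accumulated index entries should be built into a segment.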
	var (
		noneRemaining      = remainingRanges.IsEmpty()
		shouldBuildSegment = run == bootstrapIndexRunType &&
			// NB(r): Do not try to build a segment if no entries to index.
			totalEntries > 0 &&
			len(timesWithErrors) == 0
	)
	if shouldBuildSegment {
		var (
			indexBlockSize            = ns.Options().IndexOptions().BlockSize()
			retentionPeriod           = ns.Options().RetentionOptions().RetentionPeriod()
			beginningOfIndexRetention = retention.FlushTimeStartForRetentionPeriod(
				retentionPeriod, indexBlockSize, xtime.ToUnixNano(s.nowFn()))
			initialIndexRange = xtime.Range{
				Start: beginningOfIndexRetention,
				End:   beginningOfIndexRetention.Add(indexBlockSize),
			}
			overlapsWithInitialIndexRange = false
			min, max                      = requestedRanges.MinMax()
			blockStart                    = min.Truncate(indexBlockSize)
			blockEnd                      = blockStart.Add(indexBlockSize)
			iopts                         = s.opts.ResultOptions().InstrumentOptions()
			indexBlock                    result.IndexBlock
			err                           error
		)
		for _, remainingRange := range remainingRanges.Iter() {
			if remainingRange.Overlaps(initialIndexRange) {
				overlapsWithInitialIndexRange = true
			}
		}

		remainingMin, remainingMax := remainingRanges.MinMax()
		fulfilledMin, fulfilledMax := totalFulfilledRanges.MinMax()

		// NB(bodu): Assume if we're bootstrapping data from disk that it is the
		// "default" index volume type.
		runResult.Lock()
		existingIndexBlock, ok := bootstrapper.GetDefaultIndexBlockForBlockStart(
			runResult.index.IndexResults(), blockStart)
		runResult.Unlock()
		if !ok {
			err := fmt.Errorf("could not find index block in results: time=%s, ts=%d",
				blockStart.String(), blockStart)
			instrument.EmitAndLogInvariantViolation(iopts, func(l *zap.Logger) {
				l.Error("index bootstrap failed",
					zap.Error(err),
					zap.Stringer("namespace", ns.ID()),
					zap.Stringer("requestedRanges", requestedRanges))
			})
		}

		// Determine if we should flush data for this range.
		persistCfg := runOpts.PersistConfig()
		shouldFlush := persistCfg.Enabled &&
			persistCfg.FileSetType == persist.FileSetFlushType

		// Determine if all requested ranges were fulfilled or are at the edge of retention.
		satisfiedFlushRanges := noneRemaining || overlapsWithInitialIndexRange

		buildIndexLogFields := []zapcore.Field{
			zap.Stringer("namespace", ns.ID()),
			zap.Bool("shouldBuildSegment", shouldBuildSegment),
			zap.Bool("noneRemaining", noneRemaining),
			zap.Bool("overlapsWithInitialIndexRange", overlapsWithInitialIndexRange),
			zap.Int("totalEntries", totalEntries),
			zap.String("requestedRangesMinMax", fmt.Sprintf("%v - %v", min, max)),
			zap.String("remainingRangesMinMax", fmt.Sprintf("%v - %v", remainingMin, remainingMax)),
			zap.String("remainingRanges", remainingRanges.SummaryString()),
			zap.String("totalFulfilledRangesMinMax", fmt.Sprintf("%v - %v", fulfilledMin, fulfilledMax)),
			zap.String("totalFulfilledRanges", totalFulfilledRanges.SummaryString()),
			zap.String("initialIndexRange", fmt.Sprintf("%v - %v", initialIndexRange.Start, initialIndexRange.End)),
			zap.Bool("shouldFlush", shouldFlush),
			zap.Bool("satisfiedFlushRanges", satisfiedFlushRanges),
		}

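		// Persist the segment through the persist manager when flushing is enabled
		// and the requested ranges were satisfied; otherwise build it in memory.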
		if shouldFlush && satisfiedFlushRanges {
			s.log.Debug("building file set index segment", buildIndexLogFields...)
			indexBlock, err = bootstrapper.PersistBootstrapIndexSegment(
				ns,
				requestedRanges,
				builder.Builder(),
				persistManager,
				s.opts.IndexClaimsManager(),
				s.opts.ResultOptions(),
				existingIndexBlock.Fulfilled(),
				blockStart,
				blockEnd,
			)
			if errors.Is(err, fs.ErrIndexOutOfRetention) {
				// Bail early if the index segment is already out of retention.
				// This can happen when the edge of requested ranges at time of data bootstrap
				// is now out of retention.
				s.log.Debug("skipping out of retention index segment", buildIndexLogFields...)
				s.metrics.persistedIndexBlocksOutOfRetention.Inc(1)
				return
			} else if err != nil {
				instrument.EmitAndLogInvariantViolation(iopts, func(l *zap.Logger) {
					l.Error("persist fs index bootstrap failed",
						zap.Error(err),
						zap.Stringer("namespace", ns.ID()),
						zap.Stringer("requestedRanges", requestedRanges))
				})
			}
			// Track success.
			s.metrics.persistedIndexBlocksWrite.Inc(1)
		} else {
			s.log.Info("building in-memory index segment", buildIndexLogFields...)
			indexBlock, err = bootstrapper.BuildBootstrapIndexSegment(
				ns,
				requestedRanges,
				builder.Builder(),
				compactor,
				s.opts.ResultOptions(),
				s.opts.FilesystemOptions().MmapReporter(),
				blockStart,
				blockEnd,
			)
			if errors.Is(err, fs.ErrIndexOutOfRetention) {
				// Bail early if the index segment is already out of retention.
				// This can happen when the edge of requested ranges at time of data bootstrap
				// is now out of retention.
				s.log.Debug("skipping out of retention index segment", buildIndexLogFields...)
				s.metrics.persistedIndexBlocksOutOfRetention.Inc(1)
				return
			} else if err != nil {
				iopts := s.opts.ResultOptions().InstrumentOptions()
				instrument.EmitAndLogInvariantViolation(iopts, func(l *zap.Logger) {
					l.Error("build fs index bootstrap failed",
						zap.Error(err),
						zap.Stringer("namespace", ns.ID()),
						zap.Stringer("requestedRanges", requestedRanges))
				})
			}
		}

		// Merge segments and fulfilled time ranges.
		segments := indexBlock.Segments()
		for _, seg := range existingIndexBlock.Segments() {
			segments = append(segments, seg)
		}
		newFulfilled := existingIndexBlock.Fulfilled().Copy()
		newFulfilled.AddRanges(indexBlock.Fulfilled())

		// Replace index block for default index volume type.
		runResult.Lock()
		runResult.index.IndexResults()[blockStart].
			SetBlock(idxpersist.DefaultIndexVolumeType, result.NewIndexBlock(segments, newFulfilled))
		runResult.Unlock()
	}

	s.markRunResultErrorsAndUnfulfilled(runResult, requestedRanges,
		remainingRanges, timesWithErrors)
}

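// readNextEntryAndRecordBlock reads the next series from a data fileset
// reader, checks the series out of the accumulator and loads its block as a
// warm write.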
func (s *fileSystemSource) readNextEntryAndRecordBlock(
	nsCtx namespace.Context,
	accumulator bootstrap.NamespaceDataAccumulator,
	shardID uint32,
	r fs.DataFileSetReader,
	runResult *runResult,
	blockStart xtime.UnixNano,
	blockSize time.Duration,
	blockPool block.DatabaseBlockPool,
	seriesCachePolicy series.CachePolicy,
) error {
	var (
		seriesBlock = blockPool.Get()
		id          ident.ID
		tagsIter    ident.TagIterator
		data        checked.Bytes
		err         error
	)

	defer func() {
		// Can finalize the ID and tags always.
		if id != nil {
			id.Finalize()
		}
		if tagsIter != nil {
			tagsIter.Close()
		}
	}()

	switch seriesCachePolicy {
	case series.CacheAll:
		id, tagsIter, data, _, err = r.Read()
	default:
		err = fmt.Errorf("invalid series cache policy: %s", seriesCachePolicy.String())
	}
	if err != nil {
		return fmt.Errorf("error reading data file: %v", err)
	}

	ref, owned, err := accumulator.CheckoutSeriesWithLock(shardID, id, tagsIter)
	if err != nil {
		if !owned {
			// Ignore if we no longer own the shard for this series.
			return nil
		}
		return fmt.Errorf("unable to checkout series: %v", err)
	}

	seg := ts.NewSegment(data, nil, 0, ts.FinalizeHead)
	seriesBlock.Reset(blockStart, blockSize, seg, nsCtx)

	seriesRef, err := ref.Resolver.SeriesRef()
	if err != nil {
		return fmt.Errorf("unable to resolve seriesRef: %w", err)
	}
	if err := seriesRef.LoadBlock(seriesBlock, series.WarmWrite); err != nil {
		return fmt.Errorf("unable to load block: %v", err)
	}

	return nil
}

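// readNextEntryAndMaybeIndex reads the next series metadata entry, appends it
// to the current batch and flushes the batch to the index builder once it
// reaches capacity.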
func (s *fileSystemSource) readNextEntryAndMaybeIndex(
	r fs.DataFileSetReader,
	batch []doc.Metadata,
	builder *result.IndexBuilder,
) ([]doc.Metadata, error) {
	// If performing index run, then simply read the metadata and add to segment.
	entry, err := r.StreamingReadMetadata()
	if err != nil {
		return batch, err
	}

	d, err := convert.FromSeriesIDAndEncodedTags(entry.ID, entry.EncodedTags)
	if err != nil {
		return batch, err
	}

	batch = append(batch, d)

	if len(batch) >= index.MetadataArrayPoolCapacity {
		return builder.FlushBatch(batch)
	}

	return batch, nil
}

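// read bootstraps a single namespace for the given run type, reading any
// persisted index blocks first for index runs and then fanning data fileset
// readers out across the configured index segment concurrency.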
func (s *fileSystemSource) read(
	run runType,
	md namespace.Metadata,
	accumulator bootstrap.NamespaceDataAccumulator,
	shardTimeRanges result.ShardTimeRanges,
	runOpts bootstrap.RunOptions,
	span opentracing.Span,
	cache bootstrap.Cache,
) (*runResult, error) {
	var (
		seriesCachePolicy = s.opts.ResultOptions().SeriesCachePolicy()
		res               *runResult
	)
	if shardTimeRanges.IsEmpty() {
		return newRunResult(), nil
	}

	setOrMergeResult := func(newResult *runResult) {
		if newResult == nil {
			return
		}
		if res == nil {
			res = newResult
		} else {
			res = res.mergedResult(newResult)
		}
	}

	if run == bootstrapDataRunType {
		if seriesCachePolicy != series.CacheAll {
			// Unless we're caching all series (or all series metadata) in memory, we
			// return just the availability of the files we have.
			return s.bootstrapDataRunResultFromAvailability(md, shardTimeRanges, cache)
		}
	}

	logSpan := func(event string) {
		span.LogFields(
			opentracinglog.String("event", event),
			opentracinglog.String("nsID", md.ID().String()),
			opentracinglog.String("shardTimeRanges", shardTimeRanges.SummaryString()),
		)
	}
	if run == bootstrapIndexRunType {
		logSpan("bootstrap_from_index_persisted_blocks_start")
		// NB(r): First read all the FSTs and add to runResult index results,
		// subtract the shard + time ranges from what we intend to bootstrap
		// for those we found.
		r, err := s.bootstrapFromIndexPersistedBlocks(md,
			shardTimeRanges)
		if err != nil {
			s.log.Warn("filesystem bootstrapper failed to read persisted index blocks")
		} else {
			// We may have less to read.
			shardTimeRanges = shardTimeRanges.Copy()
			shardTimeRanges.Subtract(r.fulfilled)
			// Set or merge result.
			setOrMergeResult(r.result)
		}
		logSpan("bootstrap_from_index_persisted_blocks_done")
	}

	// Create a reader pool once per bootstrap as we don't really want to
	// allocate and keep around readers outside of the bootstrapping process,
	// hence why it's created on demand each time.
	readerPool := bootstrapper.NewReaderPool(s.newReaderPoolOpts)
	indexSegmentConcurrency := s.opts.IndexSegmentConcurrency()
	readersCh := make(chan bootstrapper.TimeWindowReaders, indexSegmentConcurrency)
	var blockSize time.Duration
	switch run {
	case bootstrapDataRunType:
		blockSize = md.Options().RetentionOptions().BlockSize()
	case bootstrapIndexRunType:
		blockSize = md.Options().IndexOptions().BlockSize()
	default:
		panic(fmt.Errorf("unrecognized run type: %d", run))
	}
	runtimeOpts := s.opts.RuntimeOptionsManager().Get()
	go bootstrapper.EnqueueReaders(bootstrapper.EnqueueReadersOptions{
		NsMD:            md,
		RunOpts:         runOpts,
		RuntimeOpts:     runtimeOpts,
		FsOpts:          s.fsopts,
		ShardTimeRanges: shardTimeRanges,
		ReaderPool:      readerPool,
		ReadersCh:       readersCh,
		BlockSize:       blockSize,
		// NB(bodu): We only read metadata when we bootstrap the index,
		// so we do not need to sort the data fileset reader.
		ReadMetadataOnly: run == bootstrapIndexRunType,
		Logger:           s.log,
		Span:             span,
		NowFn:            s.nowFn,
		Cache:            cache,
	})

	bootstrapFromReadersRunResult := newRunResult()

	var buildWg sync.WaitGroup
	for i := 0; i < indexSegmentConcurrency; i++ {
		alloc := s.opts.ResultOptions().IndexDocumentsBuilderAllocator()
		segBuilder, err := alloc()
		if err != nil {
			return nil, err
		}

		builder := result.NewIndexBuilder(segBuilder)

		indexOpts := s.opts.IndexOptions()
		compactor, err := compaction.NewCompactor(indexOpts.MetadataArrayPool(),
			index.MetadataArrayPoolCapacity,
			indexOpts.SegmentBuilderOptions(),
			indexOpts.FSTSegmentOptions(),
			compaction.CompactorOptions{
				FSTWriterOptions: &fst.WriterOptions{
					// DisableRegistry is set to true to trade a larger FST size
					// for a faster FST compaction since we want to reduce the
					// end-to-end latency for time to first index a metric.
					DisableRegistry: true,
				},
			})
		if err != nil {
			return nil, err
		}

		persistManager, err := fs.NewPersistManager(s.opts.FilesystemOptions())
		if err != nil {
			return nil, err
		}

		buildWg.Add(1)
		go func() {
			s.bootstrapFromReaders(run, md,
				accumulator, runOpts, bootstrapFromReadersRunResult,
				readerPool, readersCh, builder,
				&bootstrapper.SharedPersistManager{Mgr: persistManager},
				&bootstrapper.SharedCompactor{Compactor: compactor})
			buildWg.Done()
		}()
	}

	buildWg.Wait()

	// Merge any existing results if necessary.
	setOrMergeResult(bootstrapFromReadersRunResult)

	return res, nil
}

func (s *fileSystemSource) newReader() (fs.DataFileSetReader, error) {
	bytesPool := s.opts.ResultOptions().DatabaseBlockOptions().BytesPool()
	return s.newReaderFn(bytesPool, s.fsopts)
}

func (s *fileSystemSource) bootstrapDataRunResultFromAvailability(
	md namespace.Metadata,
	shardTimeRanges result.ShardTimeRanges,
	cache bootstrap.Cache,
) (*runResult, error) {
	// No locking required, all local to this fn until returned.
	runResult := newRunResult()
	unfulfilled := runResult.data.Unfulfilled()
	for shard, ranges := range shardTimeRanges.Iter() {
		if ranges.IsEmpty() {
			continue
		}
		infoFiles, err := cache.InfoFilesForShard(md, shard)
		if err != nil {
			return nil, err
		}
		availability := s.shardAvailabilityWithInfoFiles(md.ID(), shard, ranges, infoFiles)
		remaining := ranges.Clone()
		remaining.RemoveRanges(availability)
		if !remaining.IsEmpty() {
			unfulfilled.AddRanges(result.NewShardTimeRanges().Set(
				shard,
				remaining,
			))
		}
	}
	runResult.data.SetUnfulfilled(unfulfilled)
	return runResult, nil
}

type bootstrapFromIndexPersistedBlocksResult struct {
	fulfilled result.ShardTimeRanges
	result    *runResult
}

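// bootstrapFromIndexPersistedBlocks reads previously persisted index filesets
// and records which shard time ranges they fulfill so those ranges do not
// need to be rebuilt from data filesets.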
func (s *fileSystemSource) bootstrapFromIndexPersistedBlocks(
	ns namespace.Metadata,
	shardTimeRanges result.ShardTimeRanges,
) (bootstrapFromIndexPersistedBlocksResult, error) {
	res := bootstrapFromIndexPersistedBlocksResult{
		fulfilled: result.NewShardTimeRanges(),
	}

	indexBlockSize := ns.Options().IndexOptions().BlockSize()
	infoFiles := fs.ReadIndexInfoFiles(fs.ReadIndexInfoFilesOptions{
		FilePathPrefix:   s.fsopts.FilePathPrefix(),
		Namespace:        ns.ID(),
		ReaderBufferSize: s.fsopts.InfoReaderBufferSize(),
	})

	for _, infoFile := range infoFiles {
		if err := infoFile.Err.Error(); err != nil {
			s.log.Error("unable to read index info file",
				zap.Stringer("namespace", ns.ID()),
				zap.Error(err),
				zap.Stringer("shardTimeRanges", shardTimeRanges),
				zap.String("filepath", infoFile.Err.Filepath()),
			)
			continue
		}

		info := infoFile.Info
		indexBlockStart := xtime.UnixNano(info.BlockStart)
		indexBlockRange := xtime.Range{
			Start: indexBlockStart,
			End:   indexBlockStart.Add(indexBlockSize),
		}
		willFulfill := result.NewShardTimeRanges()
		for _, shard := range info.Shards {
			tr, ok := shardTimeRanges.Get(shard)
			if !ok {
				// No ranges match for this shard.
				continue
			}
			if _, ok := willFulfill.Get(shard); !ok {
				willFulfill.Set(shard, xtime.NewRanges())
			}

			iter := tr.Iter()
			for iter.Next() {
				curr := iter.Value()
				intersection, intersects := curr.Intersect(indexBlockRange)
				if !intersects {
					continue
				}
				willFulfill.GetOrAdd(shard).AddRange(intersection)
			}
		}

		if willFulfill.IsEmpty() {
			// No matching shard/time ranges with this block.
			continue
		}

		fsOpts := s.fsopts
		verify := s.opts.IndexSegmentsVerify()
		if verify {
			// Make sure this call to read index segments validates the index
			// segments. If validation fails they will be rebuilt, since their
			// ranges will be missing from the fulfilled ranges.
			fsOpts = fsOpts.SetIndexReaderAutovalidateIndexSegments(true)
		}

		readResult, err := fs.ReadIndexSegments(fs.ReadIndexSegmentsOptions{
			ReaderOptions: fs.IndexReaderOpenOptions{
				Identifier:  infoFile.ID,
				FileSetType: persist.FileSetFlushType,
			},
			FilesystemOptions: fsOpts,
		})
		if err != nil {
			s.log.Error("unable to read segments from index fileset",
				zap.Stringer("namespace", ns.ID()),
				zap.Error(err),
				zap.Time("blockStart", indexBlockStart.ToTime()),
				zap.Int("volumeIndex", infoFile.ID.VolumeIndex),
			)
			continue
		}

		// Track success.
		s.metrics.persistedIndexBlocksRead.Inc(1)

		// Record result.
		if res.result == nil {
			res.result = newRunResult()
		}
		segmentsFulfilled := willFulfill
		// NB(bodu): All segments read from disk are already persisted.
		persistedSegments := make([]result.Segment, 0, len(readResult.Segments))
		for _, segment := range readResult.Segments {
			persistedSegments = append(persistedSegments, result.NewSegment(segment, true))
		}
		volumeType := idxpersist.DefaultIndexVolumeType
		if info.IndexVolumeType != nil {
			volumeType = idxpersist.IndexVolumeType(info.IndexVolumeType.Value)
		}
		indexBlockByVolumeType := result.NewIndexBlockByVolumeType(indexBlockStart)
		indexBlockByVolumeType.SetBlock(volumeType, result.NewIndexBlock(persistedSegments, segmentsFulfilled))
		// NB(r): Don't need to call MarkFulfilled on the IndexResults here
		// as we've already passed the ranges fulfilled to the block that
		// we place in the IndexResults with the call to Add(...).
		res.result.index.Add(indexBlockByVolumeType, nil)
		res.fulfilled.AddRanges(segmentsFulfilled)
	}

	return res, nil
}

type runResult struct {
	sync.RWMutex
	data  result.DataBootstrapResult
	index result.IndexBootstrapResult
}

func newRunResult() *runResult {
	return &runResult{
		data:  result.NewDataBootstrapResult(),
		index: result.NewIndexBootstrapResult(),
	}
}

func (r *runResult) addIndexBlockIfNotExists(
	start xtime.UnixNano,
	ns namespace.Metadata,
) {
	// Only called once per shard so ok to acquire write lock immediately.
	r.Lock()
	defer r.Unlock()

	idxOpts := ns.Options().IndexOptions()
	r.index.IndexResults().AddBlockIfNotExists(start, idxOpts)
}

func (r *runResult) mergedResult(other *runResult) *runResult {
	r.Lock()
	defer r.Unlock()

	other.Lock()
	defer other.Unlock()

	return &runResult{
		data:  result.MergedDataBootstrapResult(r.data, other.data),
		index: result.MergedIndexBootstrapResult(r.index, other.index),
	}
}