github.com/m3db/m3@v1.5.1-0.20231129193456-75a402aa583b/src/dbnode/storage/shard.go (about) 1 // Copyright (c) 2020 Uber Technologies, Inc. 2 // 3 // Permission is hereby granted, free of charge, to any person obtaining a copy 4 // of this software and associated documentation files (the "Software"), to deal 5 // in the Software without restriction, including without limitation the rights 6 // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 // copies of the Software, and to permit persons to whom the Software is 8 // furnished to do so, subject to the following conditions: 9 // 10 // The above copyright notice and this permission notice shall be included in 11 // all copies or substantial portions of the Software. 12 // 13 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 // THE SOFTWARE. 20 21 package storage 22 23 import ( 24 "container/list" 25 "errors" 26 "fmt" 27 "io" 28 "math" 29 "sync" 30 "time" 31 32 "github.com/m3db/m3/src/dbnode/generated/proto/pagetoken" 33 "github.com/m3db/m3/src/dbnode/namespace" 34 "github.com/m3db/m3/src/dbnode/persist" 35 "github.com/m3db/m3/src/dbnode/persist/fs" 36 "github.com/m3db/m3/src/dbnode/retention" 37 "github.com/m3db/m3/src/dbnode/runtime" 38 "github.com/m3db/m3/src/dbnode/storage/block" 39 "github.com/m3db/m3/src/dbnode/storage/bootstrap" 40 "github.com/m3db/m3/src/dbnode/storage/bootstrap/result" 41 "github.com/m3db/m3/src/dbnode/storage/index" 42 "github.com/m3db/m3/src/dbnode/storage/index/convert" 43 "github.com/m3db/m3/src/dbnode/storage/repair" 44 "github.com/m3db/m3/src/dbnode/storage/series" 45 "github.com/m3db/m3/src/dbnode/tracepoint" 46 "github.com/m3db/m3/src/dbnode/ts" 47 "github.com/m3db/m3/src/dbnode/ts/writes" 48 "github.com/m3db/m3/src/dbnode/x/xio" 49 "github.com/m3db/m3/src/m3ninx/doc" 50 "github.com/m3db/m3/src/x/checked" 51 "github.com/m3db/m3/src/x/clock" 52 "github.com/m3db/m3/src/x/context" 53 xerrors "github.com/m3db/m3/src/x/errors" 54 "github.com/m3db/m3/src/x/ident" 55 "github.com/m3db/m3/src/x/instrument" 56 xresource "github.com/m3db/m3/src/x/resource" 57 xtime "github.com/m3db/m3/src/x/time" 58 59 "github.com/gogo/protobuf/proto" 60 "github.com/opentracing/opentracing-go/log" 61 "github.com/uber-go/tally" 62 "go.uber.org/zap" 63 ) 64 65 const ( 66 shardIterateBatchPercent = 0.01 67 shardIterateBatchMinSize = 16 68 ) 69 70 var ( 71 errShardEntryNotFound = errors.New("shard entry not found") 72 errShardNotOpen = errors.New("shard is not open") 73 errShardAlreadyTicking = errors.New("shard is already ticking") 74 errShardClosingTickTerminated = errors.New("shard is closing, terminating tick") 75 errShardInvalidPageToken = errors.New("shard could not unmarshal page token") 76 errShardIsNotBootstrapped = errors.New("shard is not bootstrapped") 77 errShardAlreadyBootstrapped = errors.New("shard is already bootstrapped") 78 errFlushStateIsNotInitialized = errors.New("shard flush state is not initialized") 79 errTriedToLoadNilSeries = errors.New("tried to load nil series into shard") 80 81 // ErrDatabaseLoadLimitHit is the error returned when the database load limit 82 
// is hit or exceeded. 83 ErrDatabaseLoadLimitHit = errors.New("error loading series, database load limit hit") 84 85 emptyDoc = doc.Metadata{} 86 ) 87 88 type filesetsFn func( 89 filePathPrefix string, 90 namespace ident.ID, 91 shardID uint32, 92 ) (fs.FileSetFilesSlice, error) 93 94 type filesetPathsBeforeFn func( 95 filePathPrefix string, 96 namespace ident.ID, 97 shardID uint32, 98 t xtime.UnixNano, 99 ) ([]string, error) 100 101 type tickPolicy int 102 103 const ( 104 tickPolicyRegular tickPolicy = iota 105 tickPolicyCloseShard 106 ) 107 108 type dbShardState int 109 110 const ( 111 dbShardStateOpen dbShardState = iota 112 dbShardStateClosing 113 ) 114 115 type dbShard struct { 116 sync.RWMutex 117 block.DatabaseBlockRetriever 118 opts Options 119 seriesOpts series.Options 120 nowFn clock.NowFn 121 state dbShardState 122 namespace namespace.Metadata 123 seriesBlockRetriever series.QueryableBlockRetriever 124 seriesOnRetrieveBlock block.OnRetrieveBlock 125 namespaceReaderMgr databaseNamespaceReaderManager 126 increasingIndex increasingIndex 127 seriesPool series.DatabaseSeriesPool 128 reverseIndex NamespaceIndex 129 insertQueue *dbShardInsertQueue 130 lookup *shardMap 131 list *list.List 132 bootstrapState BootstrapState 133 newMergerFn fs.NewMergerFn 134 newFSMergeWithMemFn newFSMergeWithMemFn 135 filesetsFn filesetsFn 136 filesetPathsBeforeFn filesetPathsBeforeFn 137 deleteFilesFn deleteFilesFn 138 snapshotFilesFn snapshotFilesFn 139 newReaderFn fs.NewReaderFn 140 sleepFn func(time.Duration) 141 identifierPool ident.Pool 142 contextPool context.Pool 143 flushState shardFlushState 144 tickWg *sync.WaitGroup 145 runtimeOptsListenClosers []xresource.SimpleCloser 146 currRuntimeOptions dbShardRuntimeOptions 147 logger *zap.Logger 148 metrics dbShardMetrics 149 tileAggregator TileAggregator 150 ticking bool 151 shard uint32 152 coldWritesEnabled bool 153 indexEnabled bool 154 155 entryMetrics *EntryMetrics 156 } 157 158 // NB(r): dbShardRuntimeOptions does not contain its own 159 // mutex as some of the variables are needed each write 160 // which already at least acquires read lock from the shard 161 // mutex, so to keep the lock acquisitions to a minimum 162 // these are protected under the same shard mutex. 
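// For example (an illustrative sketch only, mirroring how these fields are
// used elsewhere in this file): the write path reads writeNewSeriesAsync
// while it already holds the shard read lock, and SetRuntimeOptions replaces
// the whole struct under the shard write lock, so no separate mutex is needed:
//
//	s.RLock()
//	async := s.currRuntimeOptions.writeNewSeriesAsync
//	s.RUnlock()
//	_ = async // use the snapshotted value after releasing the lock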
163 type dbShardRuntimeOptions struct { 164 writeNewSeriesAsync bool 165 tickSleepSeriesBatchSize int 166 tickSleepPerSeries time.Duration 167 } 168 169 type dbShardMetrics struct { 170 create tally.Counter 171 close tally.Counter 172 closeStart tally.Counter 173 closeLatency tally.Timer 174 seriesTicked tally.Gauge 175 insertAsyncInsertErrors tally.Counter 176 insertAsyncWriteInternalErrors tally.Counter 177 insertAsyncWriteInvalidParamsErrors tally.Counter 178 insertAsyncIndexErrors tally.Counter 179 snapshotTotalLatency tally.Timer 180 snapshotPrepareLatency tally.Timer 181 snapshotMergeByBucketLatency tally.Timer 182 snapshotMergeAcrossBucketsLatency tally.Timer 183 snapshotChecksumLatency tally.Timer 184 snapshotPersistLatency tally.Timer 185 snapshotCloseLatency tally.Timer 186 187 purgeUnexpectedRefCount tally.Counter 188 } 189 190 func newDatabaseShardMetrics(shardID uint32, scope tally.Scope) dbShardMetrics { 191 const insertErrorName = "insert-async.errors" 192 snapshotScope := scope.SubScope("snapshot") 193 return dbShardMetrics{ 194 create: scope.Counter("create"), 195 close: scope.Counter("close"), 196 closeStart: scope.Counter("close-start"), 197 closeLatency: scope.Timer("close-latency"), 198 seriesTicked: scope.Tagged(map[string]string{ 199 "shard": fmt.Sprintf("%d", shardID), 200 }).Gauge("series-ticked"), 201 insertAsyncInsertErrors: scope.Tagged(map[string]string{ 202 "error_type": "insert-series", 203 "suberror_type": "shard-entry-insert-error", 204 }).Counter(insertErrorName), 205 insertAsyncWriteInternalErrors: scope.Tagged(map[string]string{ 206 "error_type": "write-value", 207 "suberror_type": "internal-error", 208 }).Counter(insertErrorName), 209 insertAsyncWriteInvalidParamsErrors: scope.Tagged(map[string]string{ 210 "error_type": "write-value", 211 "suberror_type": "invalid-params-error", 212 }).Counter(insertErrorName), 213 insertAsyncIndexErrors: scope.Tagged(map[string]string{ 214 "error_type": "reverse-index", 215 "suberror_type": "write-batch-error", 216 }).Counter(insertErrorName), 217 snapshotTotalLatency: snapshotScope.Timer("total-latency"), 218 snapshotPrepareLatency: snapshotScope.Timer("prepare-latency"), 219 snapshotMergeByBucketLatency: snapshotScope.Timer("merge-by-bucket-latency"), 220 snapshotMergeAcrossBucketsLatency: snapshotScope.Timer("merge-across-buckets-latency"), 221 snapshotChecksumLatency: snapshotScope.Timer("checksum-latency"), 222 snapshotPersistLatency: snapshotScope.Timer("persist-latency"), 223 snapshotCloseLatency: snapshotScope.Timer("close-latency"), 224 purgeUnexpectedRefCount: scope.Counter("purge-unexpected-ref-count"), 225 } 226 } 227 228 type dbShardEntryWorkFn func(entry *Entry) bool 229 230 type dbShardEntryBatchWorkFn func(entries []*Entry) bool 231 232 type shardListElement *list.Element 233 234 type shardFlushState struct { 235 sync.RWMutex 236 statesByTime map[xtime.UnixNano]fileOpState 237 initialized bool 238 } 239 240 func newShardFlushState() shardFlushState { 241 return shardFlushState{ 242 statesByTime: make(map[xtime.UnixNano]fileOpState), 243 } 244 } 245 246 func newDatabaseShard( 247 namespaceMetadata namespace.Metadata, 248 shard uint32, 249 blockRetriever block.DatabaseBlockRetriever, 250 namespaceReaderMgr databaseNamespaceReaderManager, 251 increasingIndex increasingIndex, 252 reverseIndex NamespaceIndex, 253 needsBootstrap bool, 254 opts Options, 255 seriesOpts series.Options, 256 ) databaseShard { 257 scope := opts.InstrumentOptions().MetricsScope(). 
258 SubScope("dbshard") 259 260 s := &dbShard{ 261 opts: opts, 262 seriesOpts: seriesOpts, 263 nowFn: opts.ClockOptions().NowFn(), 264 state: dbShardStateOpen, 265 namespace: namespaceMetadata, 266 shard: shard, 267 namespaceReaderMgr: namespaceReaderMgr, 268 increasingIndex: increasingIndex, 269 seriesPool: opts.DatabaseSeriesPool(), 270 reverseIndex: reverseIndex, 271 lookup: newShardMap(shardMapOptions{}), 272 list: list.New(), 273 newMergerFn: fs.NewMerger, 274 newFSMergeWithMemFn: newFSMergeWithMem, 275 filesetsFn: fs.DataFiles, 276 filesetPathsBeforeFn: fs.DataFileSetsBefore, 277 deleteFilesFn: fs.DeleteFiles, 278 snapshotFilesFn: fs.SnapshotFiles, 279 sleepFn: time.Sleep, 280 newReaderFn: fs.NewReader, 281 identifierPool: opts.IdentifierPool(), 282 contextPool: opts.ContextPool(), 283 flushState: newShardFlushState(), 284 tickWg: &sync.WaitGroup{}, 285 coldWritesEnabled: namespaceMetadata.Options().ColdWritesEnabled(), 286 indexEnabled: namespaceMetadata.Options().IndexOptions().Enabled(), 287 logger: opts.InstrumentOptions().Logger(), 288 metrics: newDatabaseShardMetrics(shard, scope), 289 tileAggregator: opts.TileAggregator(), 290 entryMetrics: NewEntryMetrics(scope.SubScope("entries")), 291 } 292 s.insertQueue = newDatabaseShardInsertQueue(s.insertSeriesBatch, 293 s.nowFn, opts.CoreFn(), scope, opts.InstrumentOptions().Logger()) 294 295 registerRuntimeOptionsListener := func(listener runtime.OptionsListener) { 296 elem := opts.RuntimeOptionsManager().RegisterListener(listener) 297 s.runtimeOptsListenClosers = append(s.runtimeOptsListenClosers, elem) 298 } 299 registerRuntimeOptionsListener(s) 300 registerRuntimeOptionsListener(s.insertQueue) 301 302 // Start the insert queue after registering runtime options listeners 303 // that may immediately fire with values 304 s.insertQueue.Start() 305 306 if !needsBootstrap { 307 s.bootstrapState = Bootstrapped 308 } 309 310 if blockRetriever != nil { 311 s.setBlockRetriever(blockRetriever) 312 } 313 314 s.metrics.create.Inc(1) 315 316 return s 317 } 318 319 func (s *dbShard) setBlockRetriever(retriever block.DatabaseBlockRetriever) { 320 // If using the block retriever then set the block retriever field 321 // and set the series block retriever as the shard itself and 322 // the on retrieve block callback as the shard itself as well 323 s.DatabaseBlockRetriever = retriever 324 s.seriesBlockRetriever = s 325 s.seriesOnRetrieveBlock = s 326 } 327 328 func (s *dbShard) SetRuntimeOptions(value runtime.Options) { 329 s.Lock() 330 s.currRuntimeOptions = dbShardRuntimeOptions{ 331 writeNewSeriesAsync: value.WriteNewSeriesAsync(), 332 tickSleepSeriesBatchSize: value.TickSeriesBatchSize(), 333 tickSleepPerSeries: value.TickPerSeriesSleepDuration(), 334 } 335 s.Unlock() 336 } 337 338 func (s *dbShard) ID() uint32 { 339 return s.shard 340 } 341 342 func (s *dbShard) NumSeries() int64 { 343 s.RLock() 344 n := s.list.Len() 345 s.RUnlock() 346 return int64(n) 347 } 348 349 // Stream implements series.QueryableBlockRetriever 350 func (s *dbShard) Stream( 351 ctx context.Context, 352 id ident.ID, 353 blockStart xtime.UnixNano, 354 onRetrieve block.OnRetrieveBlock, 355 nsCtx namespace.Context, 356 ) (xio.BlockReader, error) { 357 return s.DatabaseBlockRetriever.Stream(ctx, s.shard, id, 358 blockStart, onRetrieve, nsCtx) 359 } 360 361 // IsBlockRetrievable implements series.QueryableBlockRetriever 362 func (s *dbShard) IsBlockRetrievable(blockStart xtime.UnixNano) (bool, error) { 363 return s.hasWarmFlushed(blockStart) 364 } 365 366 func (s *dbShard) 
hasWarmFlushed(blockStart xtime.UnixNano) (bool, error) { 367 flushState, err := s.FlushState(blockStart) 368 if err != nil { 369 return false, err 370 } 371 return s.warmStatusIsRetrievable(flushState.WarmStatus), nil 372 } 373 374 func (s *dbShard) warmStatusIsRetrievable(status warmStatus) bool { 375 if !statusIsRetrievable(status.DataFlushed) { 376 return false 377 } 378 379 // If the index is disabled, then we only are tracking data flushing. 380 // Otherwise, warm status requires both data and index flushed. 381 if !s.indexEnabled { 382 return true 383 } 384 385 return statusIsRetrievable(status.IndexFlushed) 386 } 387 388 func statusIsRetrievable(status fileOpStatus) bool { 389 switch status { 390 case fileOpNotStarted, fileOpInProgress, fileOpFailed: 391 return false 392 case fileOpSuccess: 393 return true 394 } 395 panic(fmt.Errorf("shard queried is retrievable with bad flush state %d", 396 status)) 397 } 398 399 // RetrievableBlockColdVersion implements series.QueryableBlockRetriever 400 func (s *dbShard) RetrievableBlockColdVersion(blockStart xtime.UnixNano) (int, error) { 401 flushState, err := s.FlushState(blockStart) 402 if err != nil { 403 return -1, err 404 } 405 return flushState.ColdVersionFlushed, nil 406 } 407 408 // BlockStatesSnapshot implements series.QueryableBlockRetriever 409 func (s *dbShard) BlockStatesSnapshot() series.ShardBlockStateSnapshot { 410 s.RLock() 411 snapshots := s.blockStatesSnapshotWithRLock() 412 s.RUnlock() 413 414 return snapshots 415 } 416 417 func (s *dbShard) blockStatesSnapshotWithRLock() series.ShardBlockStateSnapshot { 418 bootstrapped := s.bootstrapState == Bootstrapped 419 if !bootstrapped { 420 // Needs to be bootstrapped. 421 return series.NewShardBlockStateSnapshot(false, series.BootstrappedBlockStateSnapshot{}) 422 } 423 424 s.flushState.RLock() 425 defer s.flushState.RUnlock() 426 if !s.flushState.initialized { 427 // Also needs to have the shard flush states initialized. 428 return series.NewShardBlockStateSnapshot(false, series.BootstrappedBlockStateSnapshot{}) 429 } 430 431 snapshot := make(map[xtime.UnixNano]series.BlockState, len(s.flushState.statesByTime)) 432 for time, state := range s.flushState.statesByTime { 433 snapshot[time] = series.BlockState{ 434 WarmRetrievable: s.warmStatusIsRetrievable(state.WarmStatus), 435 // Use ColdVersionRetrievable instead of ColdVersionFlushed since the snapshot 436 // will be used to make eviction decisions and we don't want to evict data before 437 // it is retrievable. 
438 ColdVersion: state.ColdVersionRetrievable, 439 } 440 } 441 442 return series.NewShardBlockStateSnapshot(true, series.BootstrappedBlockStateSnapshot{ 443 Snapshot: snapshot, 444 }) 445 } 446 447 func (s *dbShard) OnRetrieveBlock( 448 id ident.ID, 449 tags ident.TagIterator, 450 startTime xtime.UnixNano, 451 segment ts.Segment, 452 nsCtx namespace.Context, 453 ) { 454 s.RLock() 455 entry, err := s.lookupEntryWithLock(id) 456 if entry != nil { 457 entry.IncrementReaderWriterCount() 458 defer entry.DecrementReaderWriterCount() 459 } 460 s.RUnlock() 461 462 if err != nil && err != errShardEntryNotFound { 463 return // Likely closing 464 } 465 466 if entry != nil { 467 entry.Series.OnRetrieveBlock(id, tags, startTime, segment, nsCtx) 468 return 469 } 470 471 entry, err = s.newShardEntry(id, convert.NewTagsIterMetadataResolver(tags)) 472 if err != nil { 473 // should never happen 474 instrument.EmitAndLogInvariantViolation(s.opts.InstrumentOptions(), 475 func(logger *zap.Logger) { 476 logger.Error("unable to create shardEntry from retrieved block data", 477 zap.Stringer("id", id), 478 zap.Time("startTime", startTime.ToTime()), 479 zap.Error(err)) 480 }) 481 return 482 } 483 484 // NB(r): Do not need to specify that needs to be indexed as series would 485 // have been already been indexed when it was written 486 copiedID := entry.Series.ID() 487 copiedTagsIter := s.identifierPool.TagsIterator() 488 copiedTagsIter.ResetFields(entry.Series.Metadata().Fields) 489 s.insertQueue.Insert(dbShardInsert{ 490 entry: entry, 491 opts: dbShardInsertAsyncOptions{ 492 // NB(r): Caching blocks should not be considered for 493 // new series insert rate limit. 494 skipRateLimit: true, 495 hasPendingRetrievedBlock: true, 496 pendingRetrievedBlock: dbShardPendingRetrievedBlock{ 497 id: copiedID, 498 tags: copiedTagsIter, 499 start: startTime, 500 segment: segment, 501 nsCtx: nsCtx, 502 }, 503 }, 504 }) 505 } 506 507 func (s *dbShard) OnEvictedFromWiredList(id ident.ID, blockStart xtime.UnixNano) { 508 s.RLock() 509 entry, err := s.lookupEntryWithLock(id) 510 s.RUnlock() 511 512 if err != nil && err != errShardEntryNotFound { 513 return // Shard is probably closing 514 } 515 516 if entry == nil { 517 // Its counter-intuitive that this can ever occur because the series should 518 // always exist if it has any active blocks, and if we've reached this point 519 // then the WiredList had a reference to a block that should still be in the 520 // series, and thus the series should exist. The reason this can occur is that 521 // even though the WiredList controls the lifecycle of blocks retrieved from 522 // disk, those blocks can still be removed from the series if they've completely 523 // fallen out of the retention period. In that case, the series tick will still 524 // remove the block, and then the shard tick can remove the series. At that point, 525 // it's possible for the WiredList to have a reference to an expired block for a 526 // series that is no longer in the shard. 
527 return 528 } 529 530 entry.Series.OnEvictedFromWiredList(id, blockStart) 531 } 532 533 func (s *dbShard) forEachShardEntry(entryFn dbShardEntryWorkFn) { 534 s.forEachShardEntryBatch(func(currEntries []*Entry) bool { 535 for _, entry := range currEntries { 536 if continueForEach := entryFn(entry); !continueForEach { 537 return false 538 } 539 } 540 return true 541 }) 542 } 543 544 func iterateBatchSize(elemsLen int) int { 545 if elemsLen < shardIterateBatchMinSize { 546 return shardIterateBatchMinSize 547 } 548 t := math.Ceil(shardIterateBatchPercent * float64(elemsLen)) 549 return int(math.Max(shardIterateBatchMinSize, t)) 550 } 551 552 func (s *dbShard) forEachShardEntryBatch(entriesBatchFn dbShardEntryBatchWorkFn) { 553 // NB(r): consider using a lockless list for ticking. 554 s.RLock() 555 elemsLen := s.list.Len() 556 s.RUnlock() 557 558 batchSize := iterateBatchSize(elemsLen) 559 decRefElem := func(e *list.Element) { 560 if e == nil { 561 return 562 } 563 e.Value.(*Entry).DecrementReaderWriterCount() 564 } 565 566 var ( 567 currEntries = make([]*Entry, 0, batchSize) 568 first = true 569 nextElem *list.Element 570 ) 571 for nextElem != nil || first { 572 s.RLock() 573 // NB(prateek): release held reference on the next element pointer now 574 // that we have the read lock and are guaranteed it cannot be changed 575 // from under us. 576 decRefElem(nextElem) 577 578 // lazily pull from the head of the list at first 579 if first { 580 nextElem = s.list.Front() 581 first = false 582 } 583 584 elem := nextElem 585 for ticked := 0; ticked < batchSize && elem != nil; ticked++ { 586 nextElem = elem.Next() 587 entry := elem.Value.(*Entry) 588 entry.IncrementReaderWriterCount() 589 currEntries = append(currEntries, entry) 590 elem = nextElem 591 } 592 593 // NB(prateek): inc a reference to the next element while we have a lock, 594 // to guarantee the element pointer cannot be changed from under us. 595 if nextElem != nil { 596 nextElem.Value.(*Entry).IncrementReaderWriterCount() 597 } 598 s.RUnlock() 599 600 continueExecution := entriesBatchFn(currEntries) 601 for i := range currEntries { 602 currEntries[i].DecrementReaderWriterCount() 603 currEntries[i] = nil 604 } 605 currEntries = currEntries[:0] 606 if !continueExecution { 607 decRefElem(nextElem) 608 return 609 } 610 } 611 } 612 613 func (s *dbShard) IsBootstrapped() bool { 614 return s.BootstrapState() == Bootstrapped 615 } 616 617 func (s *dbShard) Close() error { 618 s.Lock() 619 if s.state != dbShardStateOpen { 620 s.Unlock() 621 return errShardNotOpen 622 } 623 s.state = dbShardStateClosing 624 s.Unlock() 625 626 s.insertQueue.Stop() 627 628 for _, closer := range s.runtimeOptsListenClosers { 629 closer.Close() 630 } 631 632 s.metrics.closeStart.Inc(1) 633 stopwatch := s.metrics.closeLatency.Start() 634 defer func() { 635 s.metrics.close.Inc(1) 636 stopwatch.Stop() 637 }() 638 639 // NB(prateek): wait till any existing ticks are finished. In the usual 640 // case, no other ticks are running, and tickWg count is at 0, so the 641 // call to Wait() will return immediately. 642 // In the case when there is an existing Tick running, the count for 643 // tickWg will be > 0, and we'll wait until it's reset to zero, which 644 // will happen because earlier in this function we set the shard state 645 // to dbShardStateClosing, which triggers an early termination of 646 // any active ticks. 647 s.tickWg.Wait() 648 649 // NB(r): Asynchronously we purge expired series to ensure pressure on the 650 // GC is not placed all at one time. 
If the deadline is too low and still 651 // causes the GC to impact performance when closing shards the deadline 652 // should be increased. 653 cancellable := context.NewNoOpCanncellable() 654 _, err := s.tickAndExpire(cancellable, tickPolicyCloseShard, namespace.Context{}) 655 return err 656 } 657 658 func (s *dbShard) Closed() bool { 659 return s.isClosing() 660 } 661 662 func (s *dbShard) isClosing() bool { 663 s.RLock() 664 closing := s.isClosingWithLock() 665 s.RUnlock() 666 return closing 667 } 668 669 func (s *dbShard) isClosingWithLock() bool { 670 return s.state == dbShardStateClosing 671 } 672 673 func (s *dbShard) Tick(c context.Cancellable, startTime xtime.UnixNano, nsCtx namespace.Context) (tickResult, error) { 674 s.removeAnyFlushStatesTooEarly(startTime) 675 return s.tickAndExpire(c, tickPolicyRegular, nsCtx) 676 } 677 678 func (s *dbShard) tickAndExpire( 679 c context.Cancellable, 680 policy tickPolicy, 681 nsCtx namespace.Context, 682 ) (tickResult, error) { 683 s.Lock() 684 // ensure only one tick can execute at a time 685 if s.ticking { 686 s.Unlock() 687 // i.e. we were previously ticking 688 return tickResult{}, errShardAlreadyTicking 689 } 690 691 // NB(prateek): we bail out early if the shard is closing, 692 // unless it's the final tick issued during the Close(). This 693 // final tick is required to release resources back to our pools. 694 if policy != tickPolicyCloseShard && s.isClosingWithLock() { 695 s.Unlock() 696 return tickResult{}, errShardClosingTickTerminated 697 } 698 699 // enable Close() to track the lifecycle of the tick 700 s.ticking = true 701 s.tickWg.Add(1) 702 s.Unlock() 703 704 // reset ticking state 705 defer func() { 706 s.Lock() 707 s.ticking = false 708 s.tickWg.Done() 709 s.Unlock() 710 s.metrics.seriesTicked.Update(0.0) // reset external visibility 711 }() 712 713 var ( 714 r tickResult 715 terminatedTickingDueToClosing bool 716 i int 717 slept time.Duration 718 expired []*Entry 719 ) 720 s.RLock() 721 tickSleepBatch := s.currRuntimeOptions.tickSleepSeriesBatchSize 722 tickSleepPerSeries := s.currRuntimeOptions.tickSleepPerSeries 723 // Use blockStatesSnapshotWithRLock here to prevent nested read locks. 724 // Nested read locks will cause deadlocks if there is write lock attempt in 725 // between the nested read locks, since the write lock attempt will block 726 // future read lock attempts. 727 blockStates := s.blockStatesSnapshotWithRLock() 728 s.RUnlock() 729 s.forEachShardEntryBatch(func(currEntries []*Entry) bool { 730 // re-using `expired` to amortize allocs, still need to reset it 731 // to be safe for re-use. 732 for i := range expired { 733 expired[i] = nil 734 } 735 expired = expired[:0] 736 for _, entry := range currEntries { 737 if i > 0 && i%tickSleepBatch == 0 { 738 // NB(xichen): if the tick is cancelled, we bail out immediately. 739 // The cancellation check is performed on every batch of entries 740 // instead of every entry to reduce load. 741 if c.IsCancelled() { 742 return false 743 } 744 // NB(prateek): Also bail out early if the shard is closing, 745 // unless it's the final tick issued during the Close(). This 746 // final tick is required to release resources back to our pools. 747 if policy != tickPolicyCloseShard && s.isClosing() { 748 terminatedTickingDueToClosing = true 749 return false 750 } 751 // Expose shard level Tick() progress externally. 
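// A worked example of the throttle below (illustrative values only; the real
// ones come from runtime options): with tickSleepSeriesBatchSize = 512 and
// tickSleepPerSeries = 100µs, the loop sleeps 512 * 100µs ≈ 51ms after every
// 512 series processed, bounding how fast a tick can walk the shard.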
752 s.metrics.seriesTicked.Update(float64(i)) 753 // Throttle the tick 754 sleepFor := time.Duration(tickSleepBatch) * tickSleepPerSeries 755 s.sleepFn(sleepFor) 756 slept += sleepFor 757 } 758 759 var ( 760 result series.TickResult 761 err error 762 ) 763 switch policy { 764 case tickPolicyRegular: 765 result, err = entry.Series.Tick(blockStates, nsCtx) 766 case tickPolicyCloseShard: 767 err = series.ErrSeriesAllDatapointsExpired 768 } 769 if err == series.ErrSeriesAllDatapointsExpired { 770 expired = append(expired, entry) 771 r.expiredSeries++ 772 } else { 773 r.activeSeries++ 774 if err != nil { 775 r.errors++ 776 } 777 } 778 r.activeBlocks += result.ActiveBlocks 779 r.wiredBlocks += result.WiredBlocks 780 r.unwiredBlocks += result.UnwiredBlocks 781 r.pendingMergeBlocks += result.PendingMergeBlocks 782 r.madeExpiredBlocks += result.MadeExpiredBlocks 783 r.madeUnwiredBlocks += result.MadeUnwiredBlocks 784 r.mergedOutOfOrderBlocks += result.MergedOutOfOrderBlocks 785 r.evictedBuckets += result.EvictedBuckets 786 i++ 787 } 788 789 // Purge any series requiring purging. 790 if len(expired) > 0 { 791 s.purgeExpiredSeries(expired) 792 for i := range expired { 793 expired[i] = nil 794 } 795 expired = expired[:0] 796 } 797 // Continue. 798 return true 799 }) 800 801 if terminatedTickingDueToClosing { 802 return tickResult{}, errShardClosingTickTerminated 803 } 804 805 return r, nil 806 } 807 808 // NB(prateek): purgeExpiredSeries requires that all entries passed to it have at least one reader/writer, 809 // i.e. have a readWriteCount of at least 1. 810 // Currently, this function is only called by the lambda inside `tickAndExpire`'s `forEachShardEntryBatch` 811 // call. This satisfies the contract that all entries it operates upon are guaranteed to have a 812 // readerWriterEntryCount of at least 1, by virtue of the implementation of `forEachShardEntryBatch`. 813 func (s *dbShard) purgeExpiredSeries(expiredEntries []*Entry) { 814 // Remove all expired series from lookup and list. 815 s.Lock() 816 for _, entry := range expiredEntries { 817 // Only purge series after they've been GCed from the index, so that these happen in order 818 // and there is no raciness around GCing something from the index when the series has already 819 // been removed from memory. 820 if s.indexEnabled && !entry.IndexGarbageCollected.Load() { 821 continue 822 } 823 824 series := entry.Series 825 id := series.ID() 826 elem, exists := s.lookup.Get(id) 827 if !exists { 828 continue 829 } 830 831 count := entry.ReaderWriterCount() 832 // The contract requires all entries to have count >= 1. 833 if count < 1 { 834 s.metrics.purgeUnexpectedRefCount.Inc(1) 835 instrument.EmitAndLogInvariantViolation(s.opts.InstrumentOptions(), func(l *zap.Logger) { 836 l.Error("purgeExpiredSeries encountered invalid series read/write count", 837 zap.Stringer("namespace", s.namespace.ID()), 838 zap.Uint32("shard", s.ID()), 839 zap.Stringer("series", series.ID()), 840 zap.Int32("readerWriterCount", count)) 841 }) 842 continue 843 } 844 // If this series is currently being written to or read from, we don't 845 // remove to ensure a consistent view of the series to other users. 846 if count > 1 { 847 continue 848 } 849 // If there have been datapoints written to the series since its 850 // last empty check, we don't remove it.
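// At this point the only outstanding ref is the one taken on our behalf by
// forEachShardEntryBatch (count == 1), so the final check below removes the
// entry only if the series also holds no data.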
851 if !series.IsEmpty() { 852 continue 853 } 854 855 // NB(xichen): if we get here, we are guaranteed that there can be 856 // no more reads/writes to this series while the lock is held, so it's 857 // safe to remove it. 858 series.Close() 859 s.list.Remove(elem) 860 s.lookup.Delete(id) 861 } 862 s.Unlock() 863 } 864 865 func (s *dbShard) WriteTagged( 866 ctx context.Context, 867 id ident.ID, 868 tagResolver convert.TagMetadataResolver, 869 timestamp xtime.UnixNano, 870 value float64, 871 unit xtime.Unit, 872 annotation []byte, 873 wOpts series.WriteOptions, 874 ) (SeriesWrite, error) { 875 return s.writeAndIndex(ctx, id, tagResolver, timestamp, 876 value, unit, annotation, wOpts, true) 877 } 878 879 func (s *dbShard) Write( 880 ctx context.Context, 881 id ident.ID, 882 timestamp xtime.UnixNano, 883 value float64, 884 unit xtime.Unit, 885 annotation []byte, 886 wOpts series.WriteOptions, 887 ) (SeriesWrite, error) { 888 return s.writeAndIndex(ctx, id, convert.EmptyTagMetadataResolver, timestamp, 889 value, unit, annotation, wOpts, false) 890 } 891 892 func (s *dbShard) writeAndIndex( 893 ctx context.Context, 894 id ident.ID, 895 tagResolver convert.TagMetadataResolver, 896 timestamp xtime.UnixNano, 897 value float64, 898 unit xtime.Unit, 899 annotation []byte, 900 wOpts series.WriteOptions, 901 shouldReverseIndex bool, 902 ) (SeriesWrite, error) { 903 // Prepare write 904 entry, opts, err := s.TryRetrieveSeriesAndIncrementReaderWriterCount(id) 905 if err != nil { 906 return SeriesWrite{}, err 907 } 908 909 writable := entry != nil 910 911 // If no entry and we are not writing new series asynchronously. 912 if !writable && !opts.WriteNewSeriesAsync { 913 // Avoid double lookup by enqueueing insert immediately. 914 result, err := s.insertSeriesAsyncBatched(id, tagResolver, dbShardInsertAsyncOptions{ 915 hasPendingIndexing: shouldReverseIndex, 916 pendingIndex: dbShardPendingIndex{ 917 timestamp: timestamp, 918 enqueuedAt: s.nowFn(), 919 }, 920 }) 921 if err != nil { 922 return SeriesWrite{}, err 923 } 924 925 // Wait for the insert to be batched together and inserted 926 result.wg.Wait() 927 928 // Retrieve the inserted entry 929 entry, err = s.writableSeries(id, tagResolver) 930 if err != nil { 931 return SeriesWrite{}, err 932 } 933 writable = true 934 935 // NB(r): We just indexed this series if shouldReverseIndex was true 936 shouldReverseIndex = false 937 } 938 939 var ( 940 commitLogSeriesID ident.ID 941 commitLogSeriesUniqueIndex uint64 942 needsIndex bool 943 pendingIndexInsert writes.PendingIndexInsert 944 // Err on the side of caution and always write to the commitlog if writing 945 // async, since there is no information about whether the write succeeded 946 // or not. 947 wasWritten = true 948 ) 949 if writable { 950 // Perform write. No need to copy the annotation here because we're using it 951 // synchronously and all downstream code will copy anthing they need to maintain 952 // a reference to. 953 wasWritten, _, err = entry.Series.Write(ctx, timestamp, value, unit, annotation, wOpts) 954 // Load series metadata before decrementing the writer count 955 // to ensure this metadata is snapshotted at a consistent state 956 // NB(r): We explicitly do not place the series ID back into a 957 // pool as high frequency users of series IDs such 958 // as the commit log need to use the reference without the 959 // overhead of ownership tracking. This makes taking a ref here safe. 
960 commitLogSeriesID = entry.Series.ID() 961 commitLogSeriesUniqueIndex = entry.Index 962 if err == nil && shouldReverseIndex { 963 if entry.NeedsIndexUpdate(s.reverseIndex.BlockStartForWriteTime(timestamp)) { 964 if !opts.WriteNewSeriesAsync { 965 return SeriesWrite{}, fmt.Errorf("to index async need write new series to be enabled") 966 } 967 needsIndex = true 968 pendingIndexInsert = s.pendingIndexInsert(entry, timestamp) 969 } 970 } 971 // release the reference we got on entry from `writableSeries` 972 entry.DecrementReaderWriterCount() 973 if err != nil { 974 return SeriesWrite{}, err 975 } 976 } else { 977 // This is an asynchronous insert and write which means we need to clone the annotation 978 // because its lifecycle in the commit log is independent of the calling function. 979 var annotationClone checked.Bytes 980 if len(annotation) != 0 { 981 annotationClone = s.opts.BytesPool().Get(len(annotation)) 982 // IncRef here so we can write the bytes in, but don't DecRef because the queue is about 983 // to take ownership and will DecRef when its done. 984 annotationClone.IncRef() 985 annotationClone.AppendAll(annotation) 986 } 987 988 result, err := s.insertSeriesAsyncBatched(id, tagResolver, dbShardInsertAsyncOptions{ 989 hasPendingWrite: true, 990 pendingWrite: dbShardPendingWrite{ 991 timestamp: timestamp, 992 value: value, 993 unit: unit, 994 annotation: annotationClone, 995 opts: wOpts, 996 }, 997 }) 998 if err != nil { 999 return SeriesWrite{}, err 1000 } 1001 1002 if shouldReverseIndex { 1003 if !opts.WriteNewSeriesAsync { 1004 return SeriesWrite{}, fmt.Errorf("to index async need write new series to be enabled") 1005 } 1006 needsIndex = true 1007 pendingIndexInsert = s.pendingIndexInsert(result.entry, timestamp) 1008 } 1009 1010 // NB(r): Make sure to use the copied ID which will eventually 1011 // be set to the newly series inserted ID. 1012 // The `id` var here is volatile after the context is closed 1013 // and adding ownership tracking to use it in the commit log 1014 // (i.e. registering a dependency on the context) is too expensive. 1015 commitLogSeriesID = result.copiedID 1016 commitLogSeriesUniqueIndex = result.entry.Index 1017 } 1018 1019 // Return metadata useful for writing to commit log and indexing. 1020 return SeriesWrite{ 1021 Series: ts.Series{ 1022 UniqueIndex: commitLogSeriesUniqueIndex, 1023 Namespace: s.namespace.ID(), 1024 ID: commitLogSeriesID, 1025 Shard: s.shard, 1026 }, 1027 WasWritten: wasWritten, 1028 NeedsIndex: needsIndex, 1029 PendingIndexInsert: pendingIndexInsert, 1030 }, nil 1031 } 1032 1033 func (s *dbShard) SeriesRefResolver( 1034 id ident.ID, 1035 tags ident.TagIterator, 1036 ) (bootstrap.SeriesRefResolver, error) { 1037 // Try retrieve existing series. 1038 entry, err := s.retrieveWritableSeriesAndIncrementReaderWriterCount(id) 1039 if err != nil { 1040 return nil, err 1041 } 1042 1043 if entry != nil { 1044 // The read/write ref is already incremented. 1045 return entry, nil 1046 } 1047 1048 entry, err = s.newShardEntry(id, convert.NewTagsIterMetadataResolver(tags)) 1049 if err != nil { 1050 return nil, err 1051 } 1052 1053 // Increment ref count to avoid expiration of the new entry just after adding it to the queue. 1054 // It is possible that this entry does not end up as the one in the shard. Therefore, the resolver 1055 // for this specific entry is responsible for closing, and there should always be one resolver 1056 // responsible for the one that DOES end up in the shard. 
1057 entry.IncrementReaderWriterCount() 1058 1059 wg, err := s.insertQueue.Insert(dbShardInsert{ 1060 entry: entry, 1061 opts: dbShardInsertAsyncOptions{ 1062 // skipRateLimit for true since this method is used by bootstrapping 1063 // and should not be rate limited. 1064 skipRateLimit: true, 1065 // do not release entry ref during async write, because entry ref will be released when 1066 // ReleaseRef() is called on bootstrap.SeriesRefResolver. 1067 releaseEntryRef: false, 1068 }, 1069 }) 1070 if err != nil { 1071 return nil, err 1072 } 1073 1074 // Series will wait for the result to be batched together and inserted. 1075 return NewSeriesResolver( 1076 wg, 1077 entry, 1078 s.retrieveWritableSeriesAndIncrementReaderWriterCount), nil 1079 } 1080 1081 func (s *dbShard) ReadEncoded( 1082 ctx context.Context, 1083 id ident.ID, 1084 start, end xtime.UnixNano, 1085 nsCtx namespace.Context, 1086 ) (series.BlockReaderIter, error) { 1087 s.RLock() 1088 entry, err := s.lookupEntryWithLock(id) 1089 if entry != nil { 1090 // NB(r): Ensure readers have consistent view of this series, do 1091 // not expire the series while being read from. 1092 entry.IncrementReaderWriterCount() 1093 defer entry.DecrementReaderWriterCount() 1094 } 1095 s.RUnlock() 1096 1097 if err == errShardEntryNotFound { 1098 switch s.opts.SeriesCachePolicy() { 1099 case series.CacheAll: 1100 // No-op, would be in memory if cached 1101 return nil, nil 1102 } 1103 } else if err != nil { 1104 return nil, err 1105 } 1106 1107 if entry != nil { 1108 return entry.Series.ReadEncoded(ctx, start, end, nsCtx) 1109 } 1110 1111 retriever := s.seriesBlockRetriever 1112 onRetrieve := s.seriesOnRetrieveBlock 1113 opts := s.seriesOpts 1114 reader := series.NewReaderUsingRetriever(id, retriever, onRetrieve, nil, opts) 1115 return reader.ReadEncoded(ctx, start, end, nsCtx) 1116 } 1117 1118 // lookupEntryWithLock returns the entry for a given id while holding a read lock or a write lock. 1119 func (s *dbShard) lookupEntryWithLock(id ident.ID) (*Entry, error) { 1120 if s.state != dbShardStateOpen { 1121 // NB(r): Return an invalid params error here so any upstream 1122 // callers will not retry this operation 1123 return nil, xerrors.NewInvalidParamsError(errShardNotOpen) 1124 } 1125 elem, exists := s.lookup.Get(id) 1126 if !exists { 1127 return nil, errShardEntryNotFound 1128 } 1129 return elem.Value.(*Entry), nil 1130 } 1131 1132 func (s *dbShard) writableSeries(id ident.ID, tagResolver convert.TagMetadataResolver) (*Entry, error) { 1133 for { 1134 entry, err := s.retrieveWritableSeriesAndIncrementReaderWriterCount(id) 1135 if entry != nil { 1136 return entry, nil 1137 } 1138 if err != nil { 1139 return nil, err 1140 } 1141 1142 // Not inserted, attempt a batched insert 1143 result, err := s.insertSeriesAsyncBatched(id, tagResolver, dbShardInsertAsyncOptions{}) 1144 if err != nil { 1145 return nil, err 1146 } 1147 1148 // Wait for the insert attempt 1149 result.wg.Wait() 1150 } 1151 } 1152 1153 // WritableSeriesOptions defines writable series options. 1154 type WritableSeriesOptions struct { 1155 // WriteNewSeriesAsync specifies if the series should be async written. 1156 WriteNewSeriesAsync bool 1157 } 1158 1159 // TryRetrieveSeriesAndIncrementReaderWriterCount attempts to retrieve a writable series. 1160 // This increments the reader/writer count and so should be decremented when the series 1161 // is no longer held. 
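// A minimal usage sketch (mirroring the call site in writeAndIndex below;
// the caller releases the ref explicitly once it is done with the series):
//
//	entry, opts, err := s.TryRetrieveSeriesAndIncrementReaderWriterCount(id)
//	if err != nil {
//		return err
//	}
//	if entry != nil {
//		// ... use entry.Series and opts while the ref is held ...
//		entry.DecrementReaderWriterCount()
//	}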
1162 func (s *dbShard) TryRetrieveSeriesAndIncrementReaderWriterCount(id ident.ID) ( 1163 *Entry, 1164 WritableSeriesOptions, 1165 error, 1166 ) { 1167 s.RLock() 1168 opts := WritableSeriesOptions{ 1169 WriteNewSeriesAsync: s.currRuntimeOptions.writeNewSeriesAsync, 1170 } 1171 if entry, err := s.lookupEntryWithLock(id); err == nil { 1172 entry.IncrementReaderWriterCount() 1173 s.RUnlock() 1174 return entry, opts, nil 1175 } else if err != errShardEntryNotFound { 1176 s.RUnlock() 1177 return nil, opts, err 1178 } 1179 s.RUnlock() 1180 return nil, opts, nil 1181 } 1182 1183 func (s *dbShard) retrieveWritableSeriesAndIncrementReaderWriterCount(id ident.ID) (*Entry, error) { 1184 entry, _, err := s.TryRetrieveSeriesAndIncrementReaderWriterCount(id) 1185 return entry, err 1186 } 1187 1188 func (s *dbShard) newShardEntry( 1189 id ident.ID, 1190 tagResolver convert.TagMetadataResolver, 1191 ) (*Entry, error) { 1192 // NB(r): As documented in storage/series.DatabaseSeries the series IDs 1193 // and metadata are garbage collected, hence we cast the ID to a BytesID 1194 // that can't be finalized. 1195 // Since series are purged so infrequently the overhead of not releasing 1196 // back an ID and metadata to a pool is amortized over a long period of 1197 // time. 1198 // Also of note, when a series is indexed in multiple index segments it is 1199 // worth keeping the metadata around so it can be referenced to twice 1200 // without creating a new array of []doc.Field for all the tags twice. 1201 // Hence this stays on the storage/series.DatabaseSeries for when it needs 1202 // to be re-indexed. 1203 var ( 1204 seriesMetadata doc.Metadata 1205 err error 1206 ) 1207 1208 seriesMetadata, err = tagResolver.Resolve(id) 1209 if err != nil { 1210 return nil, err 1211 } 1212 1213 // Use the same bytes as the series metadata for the ID. 1214 seriesID := ident.BytesID(seriesMetadata.ID) 1215 1216 uniqueIndex := s.increasingIndex.nextIndex() 1217 newSeries := s.seriesPool.Get() 1218 newSeries.Reset(series.DatabaseSeriesOptions{ 1219 ID: seriesID, 1220 Metadata: seriesMetadata, 1221 UniqueIndex: uniqueIndex, 1222 BlockRetriever: s.seriesBlockRetriever, 1223 OnRetrieveBlock: s.seriesOnRetrieveBlock, 1224 OnEvictedFromWiredList: s, 1225 Options: s.seriesOpts, 1226 }) 1227 return NewEntry(NewEntryOptions{ 1228 Shard: s, 1229 Series: newSeries, 1230 Index: uniqueIndex, 1231 IndexWriter: s.reverseIndex, 1232 NowFn: s.nowFn, 1233 EntryMetrics: s.entryMetrics, 1234 }), nil 1235 } 1236 1237 type insertAsyncResult struct { 1238 wg *sync.WaitGroup 1239 copiedID ident.ID 1240 // entry is not guaranteed to be the final entry 1241 // inserted into the shard map in case there is already 1242 // an existing entry waiting in the insert queue 1243 entry *Entry 1244 } 1245 1246 func (s *dbShard) pendingIndexInsert( 1247 entry *Entry, 1248 timestamp xtime.UnixNano, 1249 ) writes.PendingIndexInsert { 1250 // inc a ref on the entry to ensure it's valid until the queue acts upon it. 
1251 entry.OnIndexPrepare(s.reverseIndex.BlockStartForWriteTime(timestamp)) 1252 return writes.PendingIndexInsert{ 1253 Entry: index.WriteBatchEntry{ 1254 Timestamp: timestamp, 1255 OnIndexSeries: entry, 1256 EnqueuedAt: s.nowFn(), 1257 }, 1258 Document: entry.Series.Metadata(), 1259 } 1260 } 1261 1262 func (s *dbShard) insertSeriesForIndexingAsyncBatched( 1263 entry *Entry, 1264 timestamp xtime.UnixNano, 1265 async bool, 1266 ) error { 1267 indexBlockStart := s.reverseIndex.BlockStartForWriteTime(timestamp) 1268 // inc a ref on the entry to ensure it's valid until the queue acts upon it. 1269 entry.OnIndexPrepare(indexBlockStart) 1270 wg, err := s.insertQueue.Insert(dbShardInsert{ 1271 entry: entry, 1272 opts: dbShardInsertAsyncOptions{ 1273 // NB(r): Just indexing, should not be considered for new 1274 // series insert rate limiting. 1275 skipRateLimit: true, 1276 hasPendingIndexing: true, 1277 pendingIndex: dbShardPendingIndex{ 1278 timestamp: timestamp, 1279 enqueuedAt: s.nowFn(), 1280 }, 1281 // indicate we already have inc'd the entry's ref count, so we can correctly 1282 // handle the ref counting semantics in `insertSeriesBatch`. 1283 releaseEntryRef: true, 1284 }, 1285 }) 1286 // i.e. unable to enqueue into shard insert queue 1287 if err != nil { 1288 entry.OnIndexFinalize(indexBlockStart) // release any references we've held for indexing 1289 return err 1290 } 1291 1292 // if operating in async mode, we're done 1293 if async { 1294 return nil 1295 } 1296 1297 // if indexing in sync mode, wait till we're done and ensure we have indexed the entry 1298 wg.Wait() 1299 if !entry.IndexedForBlockStart(indexBlockStart) { 1300 // i.e. indexing failed 1301 return fmt.Errorf("internal error: unable to index series") 1302 } 1303 1304 return nil 1305 } 1306 1307 func (s *dbShard) insertSeriesAsyncBatched( 1308 id ident.ID, 1309 tagResolver convert.TagMetadataResolver, 1310 opts dbShardInsertAsyncOptions, 1311 ) (insertAsyncResult, error) { 1312 entry, err := s.newShardEntry(id, tagResolver) 1313 if err != nil { 1314 return insertAsyncResult{}, err 1315 } 1316 1317 wg, err := s.insertQueue.Insert(dbShardInsert{ 1318 entry: entry, 1319 opts: opts, 1320 }) 1321 return insertAsyncResult{ 1322 wg: wg, 1323 // Make sure to return the copied ID from the new series. 1324 copiedID: entry.Series.ID(), 1325 entry: entry, 1326 }, err 1327 } 1328 1329 type insertSyncType uint8 1330 1331 // nolint: varcheck, unused 1332 const ( 1333 insertSync insertSyncType = iota 1334 insertSyncIncReaderWriterCount 1335 ) 1336 1337 type insertSyncOptions struct { 1338 insertType insertSyncType 1339 hasPendingIndex bool 1340 pendingIndex dbShardPendingIndex 1341 } 1342 1343 func (s *dbShard) insertSeriesSync( 1344 id ident.ID, 1345 tagResolver convert.TagMetadataResolver, 1346 opts insertSyncOptions, 1347 ) (*Entry, error) { 1348 // NB(r): Create new shard entry outside of write lock to reduce 1349 // time using write lock.
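// The overall shape is a check-then-insert under the write lock: build the
// candidate entry without holding the lock, re-check the lookup map once the
// lock is held and prefer any entry that raced in ahead of us, and only
// insert the new entry if none exists.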
1350 newEntry, err := s.newShardEntry(id, tagResolver) 1351 if err != nil { 1352 // should never happen 1353 instrument.EmitAndLogInvariantViolation(s.opts.InstrumentOptions(), 1354 func(logger *zap.Logger) { 1355 logger.Error("insertSeriesSync error creating shard entry", 1356 zap.String("id", id.String()), 1357 zap.Error(err)) 1358 }) 1359 return nil, err 1360 } 1361 1362 s.Lock() 1363 unlocked := false 1364 defer func() { 1365 if !unlocked { 1366 s.Unlock() 1367 } 1368 }() 1369 1370 existingEntry, err := s.lookupEntryWithLock(id) 1371 if err != nil && err != errShardEntryNotFound { 1372 // Shard not taking inserts likely. 1373 return nil, err 1374 } 1375 if existingEntry != nil { 1376 // Already inserted, likely a race. 1377 return existingEntry, nil 1378 } 1379 1380 s.insertNewShardEntryWithLock(newEntry) 1381 1382 // Track unlocking. 1383 unlocked = true 1384 s.Unlock() 1385 1386 // Be sure to enqueue for indexing if requires a pending index. 1387 if opts.hasPendingIndex { 1388 if _, err := s.insertQueue.Insert(dbShardInsert{ 1389 entry: newEntry, 1390 opts: dbShardInsertAsyncOptions{ 1391 // NB(r): Just indexing, should not be considered for new 1392 // series insert rate limiting. 1393 skipRateLimit: true, 1394 hasPendingIndexing: opts.hasPendingIndex, 1395 pendingIndex: opts.pendingIndex, 1396 }, 1397 }); err != nil { 1398 return nil, err 1399 } 1400 } 1401 1402 // Check if we're making a modification to this entry, be sure 1403 // to increment the writer count so it's visible when we release 1404 // the lock. 1405 if opts.insertType == insertSyncIncReaderWriterCount { 1406 newEntry.IncrementReaderWriterCount() 1407 } 1408 1409 return newEntry, nil 1410 } 1411 1412 func (s *dbShard) insertNewShardEntryWithLock(entry *Entry) { 1413 // Set the lookup value, we use the copied ID and since it is GC'd 1414 // we explicitly set it with options to not copy the key and not to 1415 // finalize it. 1416 copiedID := entry.Series.ID() 1417 listElem := s.list.PushBack(entry) 1418 s.lookup.SetUnsafe(copiedID, listElem, shardMapSetUnsafeOptions{ 1419 NoCopyKey: true, 1420 NoFinalizeKey: true, 1421 }) 1422 entry.SetInsertTime(s.nowFn()) 1423 } 1424 1425 func (s *dbShard) insertSeriesBatch(inserts []dbShardInsert) error { 1426 var ( 1427 anyPendingAction = false 1428 numPendingIndexing = 0 1429 ) 1430 1431 s.Lock() 1432 for i := range inserts { 1433 // If we are going to write to this entry then increment the 1434 // writer count so it does not look empty immediately after 1435 // we release the write lock. 1436 hasPendingWrite := inserts[i].opts.hasPendingWrite 1437 hasPendingIndexing := inserts[i].opts.hasPendingIndexing 1438 hasPendingRetrievedBlock := inserts[i].opts.hasPendingRetrievedBlock 1439 anyPendingAction = anyPendingAction || hasPendingWrite || 1440 hasPendingRetrievedBlock || hasPendingIndexing 1441 1442 if hasPendingIndexing { 1443 numPendingIndexing++ 1444 } 1445 1446 // we don't need to inc the entry ref count if we already have a ref on the entry. check if 1447 // that's the case. 1448 if inserts[i].opts.releaseEntryRef { 1449 // don't need to inc a ref on the entry, we were given as writable entry as input. 1450 continue 1451 } 1452 1453 // i.e. we don't have a ref on provided entry, so we check if between the operation being 1454 // enqueue in the shard insert queue, and this function executing, an entry was created 1455 // for the same ID. 
1456 entry, err := s.lookupEntryWithLock(inserts[i].entry.Series.ID()) 1457 if entry != nil { 1458 // Already exists so update the entry we're pointed at for this insert. 1459 inserts[i].entry = entry 1460 } 1461 1462 if hasPendingIndexing || hasPendingWrite || hasPendingRetrievedBlock { 1463 // We're definitely writing a value, ensure that the pending write is 1464 // visible before we release the lookup write lock. 1465 inserts[i].entry.IncrementReaderWriterCount() 1466 // also indicate that we have a ref count on this entry for this operation. 1467 inserts[i].opts.releaseEntryRef = true 1468 } 1469 1470 if err == nil { 1471 // Already inserted. 1472 continue 1473 } 1474 1475 if err != errShardEntryNotFound { 1476 // Shard is not taking inserts. 1477 s.Unlock() 1478 // FOLLOWUP(prateek): is this an existing bug? why don't we need to release any ref's we've inc'd 1479 // on entries in the loop before this point, i.e. in range [0, i). Otherwise, how are those entries 1480 // going to get cleaned up? 1481 s.metrics.insertAsyncInsertErrors.Inc(int64(len(inserts) - i)) 1482 return err 1483 } 1484 1485 // Insert still pending, perform the insert 1486 entry = inserts[i].entry 1487 s.insertNewShardEntryWithLock(entry) 1488 } 1489 s.Unlock() 1490 1491 if !anyPendingAction { 1492 return nil 1493 } 1494 1495 // Perform any indexing, pending writes or pending retrieved blocks outside of lock 1496 ctx := s.contextPool.Get() 1497 // TODO(prateek): pool this type 1498 indexBlockSize := s.namespace.Options().IndexOptions().BlockSize() 1499 indexBatch := index.NewWriteBatch(index.WriteBatchOptions{ 1500 InitialCapacity: numPendingIndexing, 1501 IndexBlockSize: indexBlockSize, 1502 }) 1503 for i := range inserts { 1504 var ( 1505 entry = inserts[i].entry 1506 releaseEntryRef = inserts[i].opts.releaseEntryRef 1507 err error 1508 ) 1509 1510 if inserts[i].opts.hasPendingWrite { 1511 write := inserts[i].opts.pendingWrite 1512 var annotationBytes []byte 1513 if write.annotation != nil { 1514 annotationBytes = write.annotation.Bytes() 1515 } 1516 // NB: Ignore the `wasWritten` return argument here since this is an async 1517 // operation and there is nothing further to do with this value. 1518 // TODO: Consider propagating the `wasWritten` argument back to the caller 1519 // using waitgroup (or otherwise) in the future. 1520 _, _, err = entry.Series.Write(ctx, write.timestamp, write.value, 1521 write.unit, annotationBytes, write.opts) 1522 if err != nil { 1523 if xerrors.IsInvalidParams(err) { 1524 s.metrics.insertAsyncWriteInvalidParamsErrors.Inc(1) 1525 } else { 1526 s.metrics.insertAsyncWriteInternalErrors.Inc(1) 1527 s.logger.Error("error with async insert write", zap.Error(err)) 1528 } 1529 } 1530 1531 if write.annotation != nil { 1532 // Now that we've performed the write, we can finalize the annotation because 1533 // we're done with it and all the code from the series downwards has copied any 1534 // data that it required. 1535 write.annotation.DecRef() 1536 write.annotation.Finalize() 1537 } 1538 } 1539 1540 if inserts[i].opts.hasPendingIndexing { 1541 pendingIndex := inserts[i].opts.pendingIndex 1542 // increment the ref on the entry, as the original one was transferred to the 1543 // this method (insertSeriesBatch) via `releaseEntryRef` mechanism. 
1544 entry.OnIndexPrepare(s.reverseIndex.BlockStartForWriteTime(pendingIndex.timestamp)) 1545 1546 writeBatchEntry := index.WriteBatchEntry{ 1547 Timestamp: pendingIndex.timestamp, 1548 OnIndexSeries: entry, 1549 EnqueuedAt: pendingIndex.enqueuedAt, 1550 } 1551 1552 indexBatch.Append(writeBatchEntry, entry.Series.Metadata()) 1553 } 1554 1555 if inserts[i].opts.hasPendingRetrievedBlock { 1556 block := inserts[i].opts.pendingRetrievedBlock 1557 entry.Series.OnRetrieveBlock(block.id, block.tags, block.start, block.segment, block.nsCtx) 1558 } 1559 1560 // Entries in the shard insert queue are either of: 1561 // - new entries 1562 // - existing entries that we've taken a ref on (marked as releaseEntryRef) 1563 if releaseEntryRef { 1564 entry.DecrementReaderWriterCount() 1565 } 1566 } 1567 1568 var err error 1569 // index all requested entries in batch. 1570 if n := indexBatch.Len(); n > 0 { 1571 err = s.reverseIndex.WriteBatch(indexBatch) 1572 if err != nil { 1573 s.metrics.insertAsyncIndexErrors.Inc(int64(n)) 1574 } 1575 } 1576 1577 // Avoid goroutine spinning up to close this context 1578 ctx.BlockingClose() 1579 1580 return err 1581 } 1582 1583 func (s *dbShard) FetchBlocks( 1584 ctx context.Context, 1585 id ident.ID, 1586 starts []xtime.UnixNano, 1587 nsCtx namespace.Context, 1588 ) ([]block.FetchBlockResult, error) { 1589 s.RLock() 1590 entry, err := s.lookupEntryWithLock(id) 1591 if entry != nil { 1592 // NB(r): Ensure readers have consistent view of this series, do 1593 // not expire the series while being read from. 1594 entry.IncrementReaderWriterCount() 1595 defer entry.DecrementReaderWriterCount() 1596 } 1597 s.RUnlock() 1598 1599 if err == errShardEntryNotFound { 1600 switch s.opts.SeriesCachePolicy() { 1601 case series.CacheAll: 1602 // No-op, would be in memory if cached 1603 return nil, nil 1604 } 1605 } else if err != nil { 1606 return nil, err 1607 } 1608 1609 if entry != nil { 1610 return entry.Series.FetchBlocks(ctx, starts, nsCtx) 1611 } 1612 1613 retriever := s.seriesBlockRetriever 1614 onRetrieve := s.seriesOnRetrieveBlock 1615 opts := s.seriesOpts 1616 // Nil for onRead callback because we don't want peer bootstrapping to impact 1617 // the behavior of the LRU 1618 var onReadCb block.OnReadBlock 1619 reader := series.NewReaderUsingRetriever(id, retriever, onRetrieve, onReadCb, opts) 1620 return reader.FetchBlocks(ctx, starts, nsCtx) 1621 } 1622 1623 func (s *dbShard) FetchBlocksForColdFlush( 1624 ctx context.Context, 1625 seriesID ident.ID, 1626 start xtime.UnixNano, 1627 version int, 1628 nsCtx namespace.Context, 1629 ) (block.FetchBlockResult, error) { 1630 s.RLock() 1631 entry, err := s.lookupEntryWithLock(seriesID) 1632 s.RUnlock() 1633 if entry == nil || err != nil { 1634 return block.FetchBlockResult{}, err 1635 } 1636 1637 return entry.Series.FetchBlocksForColdFlush(ctx, start, version, nsCtx) 1638 } 1639 1640 func (s *dbShard) fetchActiveBlocksMetadata( 1641 ctx context.Context, 1642 start, end xtime.UnixNano, 1643 limit int64, 1644 indexCursor int64, 1645 opts series.FetchBlocksMetadataOptions, 1646 ) (block.FetchBlocksMetadataResults, *int64, error) { 1647 var ( 1648 res = s.opts.FetchBlocksMetadataResultsPool().Get() 1649 fetchCtx = s.contextPool.Get() 1650 nextIndexCursor *int64 1651 ) 1652 1653 var loopErr error 1654 s.forEachShardEntry(func(entry *Entry) bool { 1655 // Break out of the iteration loop once we've accumulated enough entries. 
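// The cursor returned below is the entry.Index of the first entry that did
// not fit into this page; the caller encodes it into the page token's
// ActiveSeriesPhase.IndexCursor so the next call fast forwards past entries
// that were already returned.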
1656 if int64(len(res.Results())) >= limit { 1657 next := int64(entry.Index) 1658 nextIndexCursor = &next 1659 return false 1660 } 1661 1662 // Fast forward past indexes lower than page token 1663 if int64(entry.Index) < indexCursor { 1664 return true 1665 } 1666 1667 // Use a context here that we finalize immediately so the stream 1668 // readers can be returned to pool after we finish fetching the 1669 // metadata for this series. 1670 // NB(r): Use a pooled context for pooled finalizers/closers but 1671 // reuse so don't need to put and get from the pool each iteration. 1672 fetchCtx.Reset() 1673 metadata, err := entry.Series.FetchBlocksMetadata(ctx, start, end, opts) 1674 fetchCtx.BlockingCloseReset() 1675 if err != nil { 1676 loopErr = err 1677 return false 1678 } 1679 1680 // If the blocksMetadata is empty, the series have no data within the specified 1681 // time range so we don't return it to the client 1682 if len(metadata.Blocks.Results()) == 0 { 1683 metadata.Blocks.Close() 1684 return true 1685 } 1686 1687 // Otherwise add it to the result which takes care of closing the metadata 1688 res.Add(metadata) 1689 1690 return true 1691 }) 1692 1693 return res, nextIndexCursor, loopErr 1694 } 1695 1696 func (s *dbShard) FetchBlocksMetadataV2( 1697 ctx context.Context, 1698 start, end xtime.UnixNano, 1699 limit int64, 1700 encodedPageToken PageToken, 1701 opts block.FetchBlocksMetadataOptions, 1702 ) (block.FetchBlocksMetadataResults, PageToken, error) { 1703 token := new(pagetoken.PageToken) 1704 if encodedPageToken != nil { 1705 if err := proto.Unmarshal(encodedPageToken, token); err != nil { 1706 return nil, nil, xerrors.NewInvalidParamsError(errShardInvalidPageToken) 1707 } 1708 } else { 1709 // NB(bodu): Allow callers to specify that they only want results from disk. 1710 if opts.OnlyDisk { 1711 token.FlushedSeriesPhase = &pagetoken.PageToken_FlushedSeriesPhase{} 1712 } 1713 } 1714 1715 // NB(r): If returning mixed in memory and disk results, then we return anything 1716 // that's mutable in memory first then all disk results. 1717 // We work backwards so we don't hit race conditions with blocks 1718 // being flushed and potentially missed between paginations. Working 1719 // backwards means that we might duplicate metadata sent back switching 1720 // between active phase and flushed phase, but that's better than missing 1721 // data working in the opposite direction. De-duping which block time ranges 1722 // were actually sent is also difficult as it's not always a consistent view 1723 // across async pagination. 1724 // Duplicating the metadata sent back means that consumers get a consistent 1725 // view of the world if they merge all the results together. 1726 // In the future we should consider the lifecycle of fileset files rather 1727 // than directly working with them here while filesystem cleanup manager 1728 // could delete them mid-read, on linux this is ok as it's just an unlink 1729 // and we'll finish our read cleanly. If there's a race between us thinking 1730 // the file is accessible and us opening a reader to it then this will bubble 1731 // an error to the client which will be retried. 1732 var ( 1733 activePhase = token.ActiveSeriesPhase 1734 flushedPhase = token.FlushedSeriesPhase 1735 ) 1736 if flushedPhase == nil { 1737 // If first phase started or no phases started then return active 1738 // series metadata until we find a block start time that we have fileset 1739 // files for. 
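// Sketch of how the token progresses across calls (field names as defined in
// the pagetoken proto used above):
//
//	nil token              -> serve active (in-memory) series and return
//	                          ActiveSeriesPhase{IndexCursor: next}
//	active phase exhausted -> return FlushedSeriesPhase{} to switch to disk
//	flushed phase          -> resume at FlushedSeriesPhase{CurrBlockStartUnixNanos,
//	                          CurrBlockEntryIdx, Volume} and walk blocks backwards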
1740 indexCursor := int64(0) 1741 if activePhase != nil { 1742 indexCursor = activePhase.IndexCursor 1743 } 1744 // We do not include cached blocks because we'll send metadata for 1745 // those blocks when we send metadata directly from the flushed files. 1746 seriesFetchBlocksMetadataOpts := series.FetchBlocksMetadataOptions{ 1747 FetchBlocksMetadataOptions: opts, 1748 } 1749 result, nextIndexCursor, err := s.fetchActiveBlocksMetadata(ctx, start, end, 1750 limit, indexCursor, seriesFetchBlocksMetadataOpts) 1751 if err != nil { 1752 return nil, nil, err 1753 } 1754 1755 // Encode the next page token. 1756 if nextIndexCursor == nil { 1757 // Next phase, no more results from active series. 1758 token = &pagetoken.PageToken{ 1759 FlushedSeriesPhase: &pagetoken.PageToken_FlushedSeriesPhase{}, 1760 } 1761 } else { 1762 // This phase is still active. 1763 token = &pagetoken.PageToken{ 1764 ActiveSeriesPhase: &pagetoken.PageToken_ActiveSeriesPhase{ 1765 IndexCursor: *nextIndexCursor, 1766 }, 1767 } 1768 } 1769 1770 data, err := proto.Marshal(token) 1771 if err != nil { 1772 return nil, nil, err 1773 } 1774 1775 return result, PageToken(data), nil 1776 } 1777 1778 // Must be in the second phase, start with checking the latest possible 1779 // flushed block and work backwards. 1780 var ( 1781 result = s.opts.FetchBlocksMetadataResultsPool().Get() 1782 ropts = s.namespace.Options().RetentionOptions() 1783 blockSize = ropts.BlockSize() 1784 // Subtract one blocksize because all fetch requests are exclusive on the end side. 1785 blockStart = end.Truncate(blockSize).Add(-1 * blockSize) 1786 now = xtime.ToUnixNano(s.nowFn()) 1787 tokenBlockStart xtime.UnixNano 1788 numResults int64 1789 ) 1790 if flushedPhase.CurrBlockStartUnixNanos > 0 { 1791 tokenBlockStart = xtime.UnixNano(flushedPhase.CurrBlockStartUnixNanos) 1792 blockStart = tokenBlockStart 1793 } 1794 1795 // Work backwards while in requested range and not before retention. 1796 for !blockStart.Before(start) && 1797 !blockStart.Before(retention.FlushTimeStart(ropts, now)) { 1798 exists, err := s.namespaceReaderMgr.filesetExistsAt(s.shard, blockStart) 1799 if err != nil { 1800 return nil, nil, err 1801 } 1802 if !exists { 1803 // No fileset files here. 1804 blockStart = blockStart.Add(-1 * blockSize) 1805 continue 1806 } 1807 1808 var pos readerPosition 1809 if !tokenBlockStart.IsZero() { 1810 // Was previously seeking through a previous block, need to validate 1811 // this is the correct one we found otherwise the file just went missing. 1812 if !blockStart.Equal(tokenBlockStart) { 1813 return nil, nil, fmt.Errorf( 1814 "was reading block at %v but next available block is: %v", 1815 tokenBlockStart, blockStart) 1816 } 1817 1818 // Do not need to check if we move onto the next block that it matches 1819 // the token's block start on next iteration. 1820 tokenBlockStart = 0 1821 1822 pos.metadataIdx = int(flushedPhase.CurrBlockEntryIdx) 1823 pos.volume = int(flushedPhase.Volume) 1824 } 1825 1826 // Open a reader at this position, potentially from cache. 1827 reader, err := s.namespaceReaderMgr.get(s.shard, blockStart, pos) 1828 if err != nil { 1829 return nil, nil, err 1830 } 1831 1832 for numResults < limit { 1833 id, tags, size, checksum, err := reader.ReadMetadata() 1834 if err == io.EOF { 1835 // Clean end of volume, we can break now. 
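// Editorial note (illustrative, hypothetical caller outside this file): the
// PageToken handed back by FetchBlocksMetadataV2 is an opaque protobuf blob
// that callers thread back verbatim until it comes back nil, at which point
// both the active and flushed phases are exhausted (consume is a placeholder):
//
//	var token PageToken
//	for {
//		res, next, err := shard.FetchBlocksMetadataV2(ctx, start, end, limit, token, fetchOpts)
//		if err != nil {
//			return err
//		}
//		consume(res)
//		if next == nil {
//			break
//		}
//		token = next
//	}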
1836 if err := reader.Close(); err != nil { 1837 return nil, nil, fmt.Errorf( 1838 "could not close metadata reader for block %v: %v", 1839 blockStart, err) 1840 } 1841 break 1842 } 1843 if err != nil { 1844 // Best effort to close the reader on a read error. 1845 if err := reader.Close(); err != nil { 1846 s.logger.Error("could not close reader on unexpected err", zap.Error(err)) 1847 } 1848 return nil, nil, fmt.Errorf( 1849 "could not read metadata for block %v: %v", 1850 blockStart, err) 1851 } 1852 1853 blockResult := s.opts.FetchBlockMetadataResultsPool().Get() 1854 value := block.FetchBlockMetadataResult{ 1855 Start: blockStart, 1856 } 1857 if opts.IncludeSizes { 1858 value.Size = int64(size) 1859 } 1860 if opts.IncludeChecksums { 1861 v := checksum 1862 value.Checksum = &v 1863 } 1864 blockResult.Add(value) 1865 1866 numResults++ 1867 result.Add(block.NewFetchBlocksMetadataResult(id, tags, 1868 blockResult)) 1869 } 1870 1871 endPos := int64(reader.MetadataRead()) 1872 // This volume may be different from the one initially requested, 1873 // e.g. if there was a compaction between the last call and this 1874 // one, so be sure to update the state of the pageToken. If this is not 1875 // updated, the request would have to start from the beginning since it 1876 // would be requesting a stale volume, which could result in an infinite 1877 // loop of requests that never complete. 1878 volume := int64(reader.Status().Volume) 1879 1880 // Return the reader to the cache. Since this is effectively putting 1881 // the reader into a shared pool, don't use the reader after this call. 1882 err = s.namespaceReaderMgr.put(reader) 1883 if err != nil { 1884 return nil, nil, err 1885 } 1886 1887 if numResults >= limit { 1888 // We hit the limit, return results with page token. 1889 token = &pagetoken.PageToken{ 1890 FlushedSeriesPhase: &pagetoken.PageToken_FlushedSeriesPhase{ 1891 CurrBlockStartUnixNanos: int64(blockStart), 1892 CurrBlockEntryIdx: endPos, 1893 Volume: volume, 1894 }, 1895 } 1896 data, err := proto.Marshal(token) 1897 if err != nil { 1898 return nil, nil, err 1899 } 1900 return result, data, nil 1901 } 1902 1903 // Otherwise we move on to the previous block. 1904 blockStart = blockStart.Add(-1 * blockSize) 1905 } 1906 1907 // No more results if we fall through. 1908 return result, nil, nil 1909 } 1910 1911 func (s *dbShard) PrepareBootstrap(ctx context.Context) error { 1912 ctx, span, sampled := ctx.StartSampledTraceSpan(tracepoint.ShardPrepareBootstrap) 1913 defer span.Finish() 1914 1915 if sampled { 1916 span.LogFields(log.Int("shard", int(s.shard))) 1917 } 1918 1919 // Iterate flushed time ranges to determine which blocks are retrievable. 1920 // NB(r): This must be done before bootstrap since during bootstrapping 1921 // series will load blocks into series with series.LoadBlock(...) which 1922 // needs to ask the shard whether certain time windows have been flushed or 1923 // not. 
1924 s.initializeFlushStates() 1925 return nil 1926 } 1927 1928 func (s *dbShard) initializeFlushStates() { 1929 s.flushState.RLock() 1930 initialized := s.flushState.initialized 1931 s.flushState.RUnlock() 1932 if initialized { 1933 return 1934 } 1935 1936 defer func() { 1937 s.flushState.Lock() 1938 s.flushState.initialized = true 1939 s.flushState.Unlock() 1940 }() 1941 1942 s.UpdateFlushStates() 1943 return 1944 } 1945 1946 func (s *dbShard) UpdateFlushStates() { 1947 fsOpts := s.opts.CommitLogOptions().FilesystemOptions() 1948 readInfoFilesResults := fs.ReadInfoFiles(fsOpts.FilePathPrefix(), s.namespace.ID(), s.shard, 1949 fsOpts.InfoReaderBufferSize(), fsOpts.DecodingOptions(), persist.FileSetFlushType) 1950 1951 for _, result := range readInfoFilesResults { 1952 if err := result.Err.Error(); err != nil { 1953 s.logger.Error("unable to read info files in shard bootstrap", 1954 zap.Uint32("shard", s.ID()), 1955 zap.Stringer("namespace", s.namespace.ID()), 1956 zap.String("filepath", result.Err.Filepath()), 1957 zap.Error(err)) 1958 continue 1959 } 1960 1961 info := result.Info 1962 at := xtime.UnixNano(info.BlockStart) 1963 currState := s.flushStateNoBootstrapCheck(at) 1964 1965 if currState.WarmStatus.DataFlushed != fileOpSuccess { 1966 s.markWarmDataFlushStateSuccess(at) 1967 } 1968 1969 // Cold version needs to get bootstrapped so that the 1:1 relationship 1970 // between volume number and cold version is maintained and the volume 1971 // numbers / flush versions remain monotonically increasing. 1972 // 1973 // Note that there can be multiple info files for the same block, for 1974 // example if the database didn't get to clean up compacted filesets 1975 // before terminating. 1976 if currState.ColdVersionRetrievable < info.VolumeIndex { 1977 s.setFlushStateColdVersionRetrievable(at, info.VolumeIndex) 1978 s.setFlushStateColdVersionFlushed(at, info.VolumeIndex) 1979 } 1980 } 1981 1982 // Populate index flush state only if enabled. 1983 if !s.indexEnabled { 1984 return 1985 } 1986 1987 blockSize := s.namespace.Options().RetentionOptions().BlockSize() 1988 indexBlockSize := s.namespace.Options().IndexOptions().BlockSize() 1989 1990 indexFlushedBlockStarts := s.reverseIndex.WarmFlushBlockStarts() 1991 for _, blockStart := range indexFlushedBlockStarts { 1992 // Index block size is wider than data block size, so we want to set all data blockStarts 1993 // within the range of a given index blockStart 1994 blockEnd := blockStart.Add(indexBlockSize) 1995 for at := blockStart; at < blockEnd; at = at.Add(blockSize) { 1996 currState := s.flushStateNoBootstrapCheck(at) 1997 if currState.WarmStatus.IndexFlushed != fileOpSuccess { 1998 s.markWarmIndexFlushStateSuccess(at) 1999 } 2000 } 2001 } 2002 } 2003 2004 func (s *dbShard) Bootstrap( 2005 ctx context.Context, 2006 nsCtx namespace.Context, 2007 ) error { 2008 ctx, span, sampled := ctx.StartSampledTraceSpan(tracepoint.ShardBootstrap) 2009 defer span.Finish() 2010 2011 if sampled { 2012 span.LogFields(log.Int("shard", int(s.shard))) 2013 } 2014 2015 s.Lock() 2016 if s.bootstrapState == Bootstrapped { 2017 s.Unlock() 2018 return errShardAlreadyBootstrapped 2019 } 2020 if s.bootstrapState == Bootstrapping { 2021 s.Unlock() 2022 return errShardIsBootstrapping 2023 } 2024 s.bootstrapState = Bootstrapping 2025 s.Unlock() 2026 2027 multiErr := xerrors.NewMultiError() 2028 2029 // Initialize the flush states if we haven't called prepare bootstrap. 
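// Editorial note (illustrative): the bootstrapState transitions above make
// Bootstrap effectively single-shot, while PrepareBootstrap stays idempotent
// because initializeFlushStates is guarded by the flushState.initialized flag.
// A sketch of the observable behavior, assuming a freshly opened shard:
//
//	_ = shard.PrepareBootstrap(ctx)    // initializes flush states
//	_ = shard.PrepareBootstrap(ctx)    // no-op, states already initialized
//	_ = shard.Bootstrap(ctx, nsCtx)    // performs the bootstrap
//	err := shard.Bootstrap(ctx, nsCtx) // err == errShardAlreadyBootstrapped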
2030 if err := s.PrepareBootstrap(ctx); err != nil { 2031 multiErr = multiErr.Add(err) 2032 } 2033 2034 // Now that this shard has finished bootstrapping, attempt to cache all of its seekers. Cannot call 2035 // this earlier as block lease verification will fail due to the shards not being bootstrapped 2036 // (and as a result no leases can be verified since the flush state is not yet known). 2037 if err := s.cacheShardIndices(); err != nil { 2038 multiErr = multiErr.Add(err) 2039 } 2040 2041 // Move any bootstrap buffers into position for reading. 2042 s.forEachShardEntry(func(entry *Entry) bool { 2043 if err := entry.Series.Bootstrap(nsCtx); err != nil { 2044 multiErr = multiErr.Add(err) 2045 } 2046 return true 2047 }) 2048 2049 s.Lock() 2050 s.bootstrapState = Bootstrapped 2051 s.Unlock() 2052 2053 return multiErr.FinalError() 2054 } 2055 2056 func (s *dbShard) LoadBlocks( 2057 seriesToLoad *result.Map, 2058 ) error { 2059 if seriesToLoad == nil { 2060 return errTriedToLoadNilSeries 2061 } 2062 2063 s.Lock() 2064 // Don't allow loads until the shard is bootstrapped because the shard flush states need to be 2065 // bootstrapped in order to safely load blocks. This also keeps things simpler to reason about. 2066 if s.bootstrapState != Bootstrapped { 2067 s.Unlock() 2068 return errShardIsNotBootstrapped 2069 } 2070 s.Unlock() 2071 2072 memTracker := s.opts.MemoryTracker() 2073 estimatedSize := result.EstimateMapBytesSize(seriesToLoad) 2074 ok := memTracker.IncNumLoadedBytes(estimatedSize) 2075 if !ok { 2076 return ErrDatabaseLoadLimitHit 2077 } 2078 2079 multiErr := xerrors.NewMultiError() 2080 for _, elem := range seriesToLoad.Iter() { 2081 dbBlocks := elem.Value() 2082 id := dbBlocks.ID 2083 tags := dbBlocks.Tags 2084 2085 canFinalizeTagsAll := true 2086 for _, block := range dbBlocks.Blocks.AllBlocks() { 2087 result, err := s.loadBlock(id, tags, block) 2088 if err != nil { 2089 multiErr = multiErr.Add(err) 2090 } 2091 2092 canFinalizeTagsAll = canFinalizeTagsAll && result.canFinalizeTags 2093 } 2094 2095 if canFinalizeTagsAll { 2096 tags.Finalize() 2097 } 2098 } 2099 2100 return multiErr.FinalError() 2101 } 2102 2103 type loadBlockResult struct { 2104 canFinalizeTags bool 2105 } 2106 2107 func (s *dbShard) loadBlock( 2108 id ident.ID, 2109 tags ident.Tags, 2110 block block.DatabaseBlock, 2111 ) (loadBlockResult, error) { 2112 var ( 2113 timestamp = block.StartTime() 2114 result loadBlockResult 2115 ) 2116 2117 // First lookup if series already exists. 2118 entry, shardOpts, err := s.TryRetrieveSeriesAndIncrementReaderWriterCount(id) 2119 if err != nil && err != errShardEntryNotFound { 2120 return result, err 2121 } 2122 if entry == nil { 2123 // Synchronously insert to avoid waiting for the insert queue which could potentially 2124 // delay the insert. 2125 entry, err = s.insertSeriesSync(id, convert.NewTagsMetadataResolver(tags), 2126 insertSyncOptions{ 2127 // NB(r): Because insertSyncIncReaderWriterCount is used here we 2128 // don't need to explicitly increment the reader/writer count and it 2129 // will happen while the write lock is held so that it can't immediately 2130 // be expired. 2131 insertType: insertSyncIncReaderWriterCount, 2132 hasPendingIndex: s.reverseIndex != nil, 2133 pendingIndex: dbShardPendingIndex{ 2134 timestamp: timestamp, 2135 enqueuedAt: s.nowFn(), 2136 }, 2137 }) 2138 if err != nil { 2139 return result, err 2140 } 2141 } else { 2142 // No longer needed as we found the series and we don't require 2143 // them for insertion. 
2144 		// FOLLOWUP(r): Audit places that keep refs to the ID from a
2145 		// bootstrap result; newShardEntry copies it, but some of the
2146 		// bootstrapped blocks (when using certain series cache policies)
2147 		// keep refs to the ID via seriesID, so for now these IDs will
2148 		// be garbage collected.
2149 		result.canFinalizeTags = true
2150 	}
2151 
2152 	// Always decrement the reader/writer count.
2153 	defer entry.DecrementReaderWriterCount()
2154 
2155 	// NB(rartoul): Data loaded here is not part of the bootstrap process, so it needs to be
2156 	// loaded as a cold write because the load could be happening concurrently with
2157 	// other processes like the flush (as opposed to bootstrap, which cannot happen
2158 	// concurrently with a flush), and there is no way to know whether this series/block
2159 	// combination has been warm flushed yet since updating the shard block state
2160 	// doesn't happen until the entire flush completes.
2161 	//
2162 	// As a result, the only safe operation is to load the block as a cold write, which
2163 	// ensures that the data will eventually be flushed and merged with the existing data
2164 	// on disk in the two scenarios where the Load() API is used (cold writes and repairs).
2165 	if err := entry.Series.LoadBlock(block, series.ColdWrite); err != nil {
2166 		return result, err
2167 	}
2168 	// Cannot close blocks once done as the series takes a ref to them.
2169 
2170 	// Check if the series needs to be reverse indexed.
2171 	if s.reverseIndex != nil &&
2172 		entry.NeedsIndexUpdate(s.reverseIndex.BlockStartForWriteTime(timestamp)) {
2173 		err = s.insertSeriesForIndexingAsyncBatched(entry, timestamp,
2174 			shardOpts.WriteNewSeriesAsync)
2175 		if err != nil {
2176 			return result, err
2177 		}
2178 	}
2179 
2180 	return result, nil
2181 }
2182 
2183 func (s *dbShard) cacheShardIndices() error {
2184 	retriever := s.DatabaseBlockRetriever
2185 	// May be nil depending on the caching policy.
2186 	if retriever == nil {
2187 		return nil
2188 	}
2189 
2190 	s.logger.Debug("caching shard indices", zap.Uint32("shard", s.ID()))
2191 	if err := retriever.CacheShardIndices([]uint32{s.ID()}); err != nil {
2192 		s.logger.Error("caching shard indices error",
2193 			zap.Uint32("shard", s.ID()),
2194 			zap.Error(err))
2195 		return err
2196 	}
2197 
2198 	s.logger.Debug("caching shard indices completed successfully",
2199 		zap.Uint32("shard", s.ID()))
2200 	return nil
2201 }
2202 
2203 func (s *dbShard) WarmFlush(
2204 	blockStart xtime.UnixNano,
2205 	flushPreparer persist.FlushPreparer,
2206 	nsCtx namespace.Context,
2207 ) error {
2208 	// We don't flush data when the shard is still bootstrapping.
2209 	s.RLock()
2210 	if s.bootstrapState != Bootstrapped {
2211 		s.RUnlock()
2212 		return errShardNotBootstrappedToFlush
2213 	}
2214 	s.RUnlock()
2215 
2216 	prepareOpts := persist.DataPrepareOptions{
2217 		NamespaceMetadata: s.namespace,
2218 		Shard:             s.ID(),
2219 		BlockStart:        blockStart,
2220 		// Volume index is always 0 for warm flushes because a warm flush must
2221 		// happen before any cold flushes can happen.
2222 		VolumeIndex: 0,
2223 		// We explicitly set DeleteIfExists to false here as we track which
2224 		// filesets exist at bootstrap time, so we should never encounter a case
2225 		// where a fileset already exists when we attempt to flush unless there
2226 		// is a bug in the code.
2227 		DeleteIfExists: false,
2228 		FileSetType:    persist.FileSetFlushType,
2229 	}
2230 	prepared, err := flushPreparer.PrepareData(prepareOpts)
2231 	if err != nil {
2232 		return err
2233 	}
2234 
2235 	var multiErr xerrors.MultiError
2236 	flushCtx := s.contextPool.Get() // From the pool so finalizers are from the pool.
2237 
2238 	flushResult := dbShardFlushResult{}
2239 	s.forEachShardEntry(func(entry *Entry) bool {
2240 		curr := entry.Series
2241 		// Use a temporary context here so the stream readers can be returned to
2242 		// the pool after we finish flushing the series.
2243 		flushCtx.Reset()
2244 		flushOutcome, err := curr.WarmFlush(flushCtx, blockStart, prepared.Persist, nsCtx)
2245 		// Use BlockingCloseReset so the context doesn't get returned to the pool.
2246 		flushCtx.BlockingCloseReset()
2247 
2248 		if err != nil {
2249 			multiErr = multiErr.Add(err)
2250 			// If we encounter an error when persisting a series, don't continue as
2251 			// the file on disk could be in a corrupt state.
2252 			return false
2253 		}
2254 
2255 		flushResult.update(flushOutcome)
2256 
2257 		return true
2258 	})
2259 
2260 	s.logFlushResult(flushResult)
2261 
2262 	if err := prepared.Close(); err != nil {
2263 		multiErr = multiErr.Add(err)
2264 	}
2265 
2266 	return s.markWarmDataFlushStateSuccessOrError(blockStart, multiErr.FinalError())
2267 }
2268 
2269 func (s *dbShard) ColdFlush(
2270 	flushPreparer persist.FlushPreparer,
2271 	resources coldFlushReusableResources,
2272 	nsCtx namespace.Context,
2273 	onFlushSeries persist.OnFlushSeries,
2274 ) (ShardColdFlush, error) {
2275 	// We don't flush data when the shard is still bootstrapping.
2276 	s.RLock()
2277 	if s.bootstrapState != Bootstrapped {
2278 		s.RUnlock()
2279 		return shardColdFlush{}, errShardNotBootstrappedToFlush
2280 	}
2281 	// Use blockStatesSnapshotWithRLock to avoid having to re-acquire the read lock.
2282 	blockStates := s.blockStatesSnapshotWithRLock()
2283 	s.RUnlock()
2284 
2285 	resources.reset()
2286 	var (
2287 		multiErr           xerrors.MultiError
2288 		dirtySeries        = resources.dirtySeries
2289 		dirtySeriesToWrite = resources.dirtySeriesToWrite
2290 		idElementPool      = resources.idElementPool
2291 	)
2292 
2293 	blockStatesSnapshot, bootstrapped := blockStates.UnwrapValue()
2294 	if !bootstrapped {
2295 		return shardColdFlush{}, errFlushStateIsNotInitialized
2296 	}
2297 
2298 	var (
2299 		// forEachShardEntry should not execute in parallel, but protect with a lock anyway for paranoia.
2300 		loopErrLock sync.Mutex
2301 		loopErr     error
2302 	)
2303 	// First, loop through all series to capture data on which blocks have dirty
2304 	// series and add them to the resources for further processing.
2305 	s.forEachShardEntry(func(entry *Entry) bool {
2306 		curr := entry.Series
2307 		seriesMetadata := curr.Metadata()
2308 		blockStarts := curr.ColdFlushBlockStarts(blockStatesSnapshot)
2309 		blockStarts.ForEach(func(t xtime.UnixNano) {
2310 			// Cold flushes can only happen on blockStarts that have been
2311 			// warm flushed, because warm flush logic does not currently
2312 			// perform any merging logic.
2313 			hasWarmFlushed, err := s.hasWarmFlushed(t)
2314 			if err != nil {
2315 				loopErrLock.Lock()
2316 				loopErr = err
2317 				loopErrLock.Unlock()
2318 				return
2319 			}
2320 			if !hasWarmFlushed {
2321 				return
2322 			}
2323 
2324 			seriesList := dirtySeriesToWrite[t]
2325 			if seriesList == nil {
2326 				seriesList = newIDList(idElementPool)
2327 				dirtySeriesToWrite[t] = seriesList
2328 			}
2329 			element := seriesList.PushBack(seriesMetadata)
2330 
2331 			dirtySeries.Set(idAndBlockStart{
2332 				blockStart: t,
2333 				id:         seriesMetadata.ID,
2334 			}, element)
2335 		})
2336 
2337 		return true
2338 	})
2339 	if loopErr != nil {
2340 		return shardColdFlush{}, loopErr
2341 	}
2342 
2343 	if dirtySeries.Len() == 0 {
2344 		// Early exit if there is nothing dirty to merge. dirtySeriesToWrite
2345 		// may be non-empty when dirtySeries is empty because we purposely
2346 		// leave empty seriesLists in the dirtySeriesToWrite map to avoid having
2347 		// to reallocate them in subsequent usages of the shared resource.
2348 		return shardColdFlush{}, nil
2349 	}
2350 
2351 	flush := shardColdFlush{
2352 		shard:   s,
2353 		doneFns: make([]shardColdFlushDone, 0, len(dirtySeriesToWrite)),
2354 	}
2355 	merger := s.newMergerFn(resources.fsReader, s.opts.DatabaseBlockOptions().DatabaseBlockAllocSize(),
2356 		s.opts.SegmentReaderPool(), s.opts.MultiReaderIteratorPool(),
2357 		s.opts.IdentifierPool(), s.opts.EncoderPool(), s.opts.ContextPool(),
2358 		s.opts.CommitLogOptions().FilesystemOptions().FilePathPrefix(), s.namespace.Options())
2359 	mergeWithMem := s.newFSMergeWithMemFn(s, s, dirtySeries, dirtySeriesToWrite)
2360 	// Loop through each block that we know has ColdWrites. Since each block
2361 	// has its own fileset, if we encounter an error while trying to persist
2362 	// a block, we continue to try persisting other blocks.
2363 	for startTime := range dirtySeriesToWrite {
2364 		coldVersion, err := s.RetrievableBlockColdVersion(startTime)
2365 		if err != nil {
2366 			multiErr = multiErr.Add(err)
2367 			continue
2368 		}
2369 
2370 		fsID := fs.FileSetFileIdentifier{
2371 			Namespace:   s.namespace.ID(),
2372 			Shard:       s.ID(),
2373 			BlockStart:  startTime,
2374 			VolumeIndex: coldVersion,
2375 		}
2376 
2377 		nextVersion := coldVersion + 1
2378 		close, err := merger.Merge(fsID, mergeWithMem, nextVersion, flushPreparer, nsCtx,
2379 			onFlushSeries)
2380 		if err != nil {
2381 			multiErr = multiErr.Add(err)
2382 			continue
2383 		}
2384 		flush.doneFns = append(flush.doneFns, shardColdFlushDone{
2385 			startTime:   startTime,
2386 			nextVersion: nextVersion,
2387 			close:       close,
2388 		})
2389 	}
2390 	return flush, multiErr.FinalError()
2391 }
2392 
2393 func (s *dbShard) FilterBlocksNeedSnapshot(blockStarts []xtime.UnixNano) []xtime.UnixNano {
2394 	if !s.IsBootstrapped() {
2395 		return nil
2396 	}
2397 
2398 	needs := map[xtime.UnixNano]struct{}{}
2399 	s.forEachShardEntry(func(entry *Entry) bool {
2400 		entry.Series.MarkNonEmptyBlocks(needs)
2401 		if len(needs) < len(blockStarts) {
2402 			return true
2403 		}
2404 		// Note: entry.Series might have non-empty blocks that are not contained in blockStarts, so reaching
2405 		// len(needs) >= len(blockStarts) alone does not prove completion; verify every requested blockStart below.
2406 		for _, bl := range blockStarts {
2407 			if _, ok := needs[bl]; !ok {
2408 				return true
2409 			}
2410 		}
2411 		return false
2412 	})
2413 
2414 	// Note: iterate blockStarts (not the needs map) to keep the original request ordering; it is unclear whether callers rely on that ordering.
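// Editorial note (illustrative): the loop below is an order-preserving
// intersection of the requested blockStarts with the non-empty blocks found
// above. For hypothetical inputs:
//
//	blockStarts := []xtime.UnixNano{t3, t1, t2}
//	needs := map[xtime.UnixNano]struct{}{t1: {}, t3: {}}
//	// filtered == []xtime.UnixNano{t3, t1} (request order kept, t2 dropped)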
2415 	filtered := make([]xtime.UnixNano, 0, len(needs))
2416 	for _, bl := range blockStarts {
2417 		if _, ok := needs[bl]; ok {
2418 			filtered = append(filtered, bl)
2419 		}
2420 	}
2421 	return filtered
2422 }
2423 
2424 func (s *dbShard) Snapshot(
2425 	blockStart xtime.UnixNano,
2426 	snapshotTime xtime.UnixNano,
2427 	snapshotPreparer persist.SnapshotPreparer,
2428 	nsCtx namespace.Context,
2429 ) (ShardSnapshotResult, error) {
2430 	// We don't snapshot data when the shard is still bootstrapping.
2431 	if !s.IsBootstrapped() {
2432 		return ShardSnapshotResult{}, errShardNotBootstrappedToSnapshot
2433 	}
2434 
2435 	// Record per-shard snapshot latency; there are not many shards, so it's safe
2436 	// to use a timer.
2437 	totalTimer := s.metrics.snapshotTotalLatency.Start()
2438 	defer totalTimer.Stop()
2439 
2440 	prepareOpts := persist.DataPrepareOptions{
2441 		NamespaceMetadata: s.namespace,
2442 		Shard:             s.ID(),
2443 		BlockStart:        blockStart,
2444 		FileSetType:       persist.FileSetSnapshotType,
2445 		// We explicitly set DeleteIfExists to false here as we do not
2446 		// expect there to be a collision: snapshot files are appended
2447 		// with a monotonically increasing number to avoid collisions, so there
2448 		// would have to be a competing process to cause a collision.
2449 		DeleteIfExists: false,
2450 		Snapshot: persist.DataPrepareSnapshotOptions{
2451 			SnapshotTime: snapshotTime,
2452 		},
2453 	}
2454 	prepareTimer := s.metrics.snapshotPrepareLatency.Start()
2455 	prepared, err := snapshotPreparer.PrepareData(prepareOpts)
2456 	prepareTimer.Stop()
2457 	if err != nil {
2458 		return ShardSnapshotResult{}, err
2459 	}
2460 
2461 	var (
2462 		snapshotCtx = s.contextPool.Get()
2463 		persist     int
2464 		stats       series.SnapshotResultStats
2465 		multiErr    xerrors.MultiError
2466 	)
2467 	s.forEachShardEntry(func(entry *Entry) bool {
2468 		series := entry.Series
2469 		// Use a temporary context here so the stream readers can be returned to
2470 		// the pool after we finish snapshotting the series.
2471 		snapshotCtx.Reset()
2472 		result, err := series.Snapshot(snapshotCtx, blockStart, prepared.Persist, nsCtx)
2473 		snapshotCtx.BlockingCloseReset()
2474 
2475 		if err != nil {
2476 			multiErr = multiErr.Add(err)
2477 			// If we encounter an error when persisting a series, don't continue as
2478 			// the file on disk could be in a corrupt state.
2479 			return false
2480 		}
2481 
2482 		if result.Persist {
2483 			persist++
2484 		}
2485 
2486 		// Add snapshot result to cumulative result.
2487 		stats.Add(result.Stats)
2488 		return true
2489 	})
2490 
2491 	// Emit cumulative snapshot result timings.
2492 if multiErr.NumErrors() == 0 { 2493 s.metrics.snapshotMergeByBucketLatency.Record(stats.TimeMergeByBucket) 2494 s.metrics.snapshotMergeAcrossBucketsLatency.Record(stats.TimeMergeAcrossBuckets) 2495 s.metrics.snapshotChecksumLatency.Record(stats.TimeChecksum) 2496 s.metrics.snapshotPersistLatency.Record(stats.TimePersist) 2497 } 2498 2499 closeTimer := s.metrics.snapshotCloseLatency.Start() 2500 multiErr = multiErr.Add(prepared.Close()) 2501 closeTimer.Stop() 2502 2503 if err := multiErr.FinalError(); err != nil { 2504 return ShardSnapshotResult{}, err 2505 } 2506 2507 return ShardSnapshotResult{ 2508 SeriesPersist: persist, 2509 }, nil 2510 } 2511 2512 func (s *dbShard) FlushState(blockStart xtime.UnixNano) (fileOpState, error) { 2513 s.flushState.RLock() 2514 initialized := s.flushState.initialized 2515 state := s.flushStateWithRLock(blockStart) 2516 s.flushState.RUnlock() 2517 2518 if !initialized { 2519 return fileOpState{}, errFlushStateIsNotInitialized 2520 } 2521 2522 return state, nil 2523 } 2524 2525 func (s *dbShard) flushStateNoBootstrapCheck(blockStart xtime.UnixNano) fileOpState { 2526 s.flushState.RLock() 2527 check := s.flushStateWithRLock(blockStart) 2528 s.flushState.RUnlock() 2529 return check 2530 } 2531 2532 func (s *dbShard) flushStateWithRLock(blockStart xtime.UnixNano) fileOpState { 2533 state, ok := s.flushState.statesByTime[blockStart] 2534 if !ok { 2535 return fileOpState{WarmStatus: warmStatus{ 2536 DataFlushed: fileOpNotStarted, 2537 IndexFlushed: fileOpNotStarted, 2538 }} 2539 } 2540 return state 2541 } 2542 2543 func (s *dbShard) markWarmDataFlushStateSuccessOrError(blockStart xtime.UnixNano, err error) error { 2544 // Track flush state for block state 2545 if err == nil { 2546 s.markWarmDataFlushStateSuccess(blockStart) 2547 } else { 2548 s.markWarmDataFlushStateFail(blockStart) 2549 } 2550 return err 2551 } 2552 2553 func (s *dbShard) markWarmDataFlushStateSuccess(blockStart xtime.UnixNano) { 2554 s.flushState.Lock() 2555 state := s.flushState.statesByTime[blockStart] 2556 state.WarmStatus.DataFlushed = fileOpSuccess 2557 s.flushState.statesByTime[blockStart] = state 2558 s.flushState.Unlock() 2559 } 2560 2561 func (s *dbShard) markWarmDataFlushStateFail(blockStart xtime.UnixNano) { 2562 s.flushState.Lock() 2563 state := s.flushState.statesByTime[blockStart] 2564 state.WarmStatus.DataFlushed = fileOpFailed 2565 state.NumFailures++ 2566 s.flushState.statesByTime[blockStart] = state 2567 s.flushState.Unlock() 2568 } 2569 2570 // MarkWarmIndexFlushStateSuccessOrError marks the blockStart as 2571 // success or fail based on the provided err. 
2572 func (s *dbShard) MarkWarmIndexFlushStateSuccessOrError(blockStart xtime.UnixNano, err error) { 2573 // Track flush state for block state 2574 if err == nil { 2575 s.markWarmIndexFlushStateSuccess(blockStart) 2576 } else { 2577 s.markWarmIndexFlushStateFail(blockStart) 2578 } 2579 } 2580 2581 func (s *dbShard) markWarmIndexFlushStateSuccess(blockStart xtime.UnixNano) { 2582 s.flushState.Lock() 2583 state := s.flushState.statesByTime[blockStart] 2584 state.WarmStatus.IndexFlushed = fileOpSuccess 2585 s.flushState.statesByTime[blockStart] = state 2586 s.flushState.Unlock() 2587 } 2588 2589 func (s *dbShard) markWarmIndexFlushStateFail(blockStart xtime.UnixNano) { 2590 s.flushState.Lock() 2591 state := s.flushState.statesByTime[blockStart] 2592 state.WarmStatus.IndexFlushed = fileOpFailed 2593 state.NumFailures++ 2594 s.flushState.statesByTime[blockStart] = state 2595 s.flushState.Unlock() 2596 } 2597 2598 func (s *dbShard) setFlushStateColdVersionRetrievable(blockStart xtime.UnixNano, version int) { 2599 s.flushState.Lock() 2600 state := s.flushState.statesByTime[blockStart] 2601 state.ColdVersionRetrievable = version 2602 s.flushState.statesByTime[blockStart] = state 2603 s.flushState.Unlock() 2604 } 2605 2606 func (s *dbShard) setFlushStateColdVersionFlushed(blockStart xtime.UnixNano, version int) { 2607 s.flushState.Lock() 2608 state := s.flushState.statesByTime[blockStart] 2609 state.ColdVersionFlushed = version 2610 s.flushState.statesByTime[blockStart] = state 2611 s.flushState.Unlock() 2612 } 2613 2614 func (s *dbShard) removeAnyFlushStatesTooEarly(startTime xtime.UnixNano) { 2615 s.flushState.Lock() 2616 earliestFlush := retention.FlushTimeStart(s.namespace.Options().RetentionOptions(), startTime) 2617 for t := range s.flushState.statesByTime { 2618 if t.Before(earliestFlush) { 2619 delete(s.flushState.statesByTime, t) 2620 } 2621 } 2622 s.flushState.Unlock() 2623 } 2624 2625 func (s *dbShard) CleanupExpiredFileSets(earliestToRetain xtime.UnixNano) error { 2626 filePathPrefix := s.opts.CommitLogOptions().FilesystemOptions().FilePathPrefix() 2627 expired, err := s.filesetPathsBeforeFn(filePathPrefix, s.namespace.ID(), s.ID(), earliestToRetain) 2628 if err != nil { 2629 return fmt.Errorf("encountered errors when getting fileset files for prefix %s namespace %s shard %d: %v", 2630 filePathPrefix, s.namespace.ID(), s.ID(), err) 2631 } 2632 2633 return s.deleteFilesFn(expired) 2634 } 2635 2636 func (s *dbShard) CleanupCompactedFileSets() error { 2637 filePathPrefix := s.opts.CommitLogOptions().FilesystemOptions().FilePathPrefix() 2638 filesets, err := s.filesetsFn(filePathPrefix, s.namespace.ID(), s.ID()) 2639 if err != nil { 2640 return fmt.Errorf("encountered errors when getting fileset files for prefix %s namespace %s shard %d: %v", 2641 filePathPrefix, s.namespace.ID(), s.ID(), err) 2642 } 2643 2644 // Get a snapshot of all states here to prevent constantly getting/releasing 2645 // locks in a tight loop below. This snapshot won't become stale halfway 2646 // through this because flushing and cleanup never happen in parallel. 
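// Editorial note (illustrative): a fileset volume counts as compacted, and is
// appended to toDelete below, when its volume index is lower than the cold
// version already retrievable for the same block start. For example, assuming
// a block start whose snapshotted ColdVersion is 3:
//
//	// volumes 0, 1, 2 for that block start -> deleted
//	// volume 3 (the current cold version)  -> kept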
2647 blockStates := s.BlockStatesSnapshot() 2648 blockStatesSnapshot, bootstrapped := blockStates.UnwrapValue() 2649 if !bootstrapped { 2650 return errShardIsNotBootstrapped 2651 } 2652 2653 toDelete := fs.FileSetFilesSlice(make([]fs.FileSetFile, 0, len(filesets))) 2654 for _, datafile := range filesets { 2655 fileID := datafile.ID 2656 blockState := blockStatesSnapshot.Snapshot[fileID.BlockStart] 2657 if fileID.VolumeIndex < blockState.ColdVersion { 2658 toDelete = append(toDelete, datafile) 2659 } 2660 } 2661 2662 return s.deleteFilesFn(toDelete.Filepaths()) 2663 } 2664 2665 func (s *dbShard) Repair( 2666 ctx context.Context, 2667 nsCtx namespace.Context, 2668 nsMeta namespace.Metadata, 2669 tr xtime.Range, 2670 repairer databaseShardRepairer, 2671 ) (repair.MetadataComparisonResult, error) { 2672 return repairer.Repair(ctx, nsCtx, nsMeta, tr, s) 2673 } 2674 2675 func (s *dbShard) AggregateTiles( 2676 ctx context.Context, 2677 sourceNs, targetNs Namespace, 2678 shardID uint32, 2679 onFlushSeries persist.OnFlushSeries, 2680 opts AggregateTilesOptions, 2681 ) (int64, error) { 2682 var multiErr xerrors.MultiError 2683 2684 processedTileCount, nextVolume, err := s.tileAggregator.AggregateTiles( 2685 ctx, sourceNs, targetNs, shardID, onFlushSeries, opts) 2686 if err != nil { 2687 // NB: cannot return on the error here, must finish writing. 2688 multiErr = multiErr.Add(err) 2689 } else { 2690 // Notify all block leasers that a new volume for the namespace/shard/blockstart 2691 // has been created. This will block until all leasers have relinquished their 2692 // leases. 2693 // NB: markWarmFlushStateSuccess=true because there are no flushes happening in this 2694 // flow, and we need to set WarmStatus to fileOpSuccess explicitly in order to make 2695 // the new blocks readable. 
2696 if err = s.finishWriting(opts.Start, nextVolume, true); err != nil { 2697 multiErr = multiErr.Add(err) 2698 } 2699 } 2700 2701 if err := multiErr.FinalError(); err != nil { 2702 return 0, err 2703 } 2704 2705 s.logger.Debug("finished aggregating tiles", 2706 zap.Uint32("shard", s.ID()), 2707 zap.Int64("processedTiles", processedTileCount)) 2708 2709 return processedTileCount, nil 2710 } 2711 2712 func (s *dbShard) BootstrapState() BootstrapState { 2713 s.RLock() 2714 bs := s.bootstrapState 2715 s.RUnlock() 2716 return bs 2717 } 2718 2719 func (s *dbShard) DocRef(id ident.ID) (doc.Metadata, bool, error) { 2720 s.RLock() 2721 defer s.RUnlock() 2722 2723 entry, err := s.lookupEntryWithLock(id) 2724 if err == nil { 2725 return entry.Series.Metadata(), true, nil 2726 } 2727 if err == errShardEntryNotFound { 2728 return emptyDoc, false, nil 2729 } 2730 return emptyDoc, false, err 2731 } 2732 2733 func (s *dbShard) LatestVolume(blockStart xtime.UnixNano) (int, error) { 2734 return s.namespaceReaderMgr.latestVolume(s.shard, blockStart) 2735 } 2736 2737 func (s *dbShard) OpenStreamingReader(blockStart xtime.UnixNano) (fs.DataFileSetReader, error) { 2738 latestVolume, err := s.LatestVolume(blockStart) 2739 if err != nil { 2740 return nil, err 2741 } 2742 2743 reader, err := s.newReaderFn(s.opts.BytesPool(), s.opts.CommitLogOptions().FilesystemOptions()) 2744 if err != nil { 2745 return nil, err 2746 } 2747 2748 openOpts := fs.DataReaderOpenOptions{ 2749 Identifier: fs.FileSetFileIdentifier{ 2750 Namespace: s.namespace.ID(), 2751 Shard: s.ID(), 2752 BlockStart: blockStart, 2753 VolumeIndex: latestVolume, 2754 }, 2755 FileSetType: persist.FileSetFlushType, 2756 StreamingEnabled: true, 2757 } 2758 2759 if err := reader.Open(openOpts); err != nil { 2760 return nil, err 2761 } 2762 2763 return reader, nil 2764 } 2765 2766 func (s *dbShard) logFlushResult(r dbShardFlushResult) { 2767 s.logger.Debug("shard flush outcome", 2768 zap.Uint32("shard", s.ID()), 2769 zap.Int64("numBlockDoesNotExist", r.numBlockDoesNotExist), 2770 ) 2771 } 2772 2773 func (s *dbShard) finishWriting( 2774 blockStart xtime.UnixNano, 2775 nextVersion int, 2776 markWarmFlushStateSuccess bool, 2777 ) error { 2778 if markWarmFlushStateSuccess { 2779 s.markWarmDataFlushStateSuccess(blockStart) 2780 s.markWarmIndexFlushStateSuccess(blockStart) 2781 } 2782 2783 // After writing the full block successfully update the ColdVersionFlushed number. This will 2784 // allow the SeekerManager to open a lease on the latest version of the fileset files because 2785 // the BlockLeaseVerifier will check the ColdVersionFlushed value, but the buffer only looks at 2786 // ColdVersionRetrievable so a concurrent tick will not yet cause the blocks in memory to be 2787 // evicted (which is the desired behavior because we haven't updated the open leases yet which 2788 // means the newly written data is not available for querying via the SeekerManager yet.) 2789 s.setFlushStateColdVersionFlushed(blockStart, nextVersion) 2790 2791 // Notify all block leasers that a new volume for the namespace/shard/blockstart 2792 // has been created. This will block until all leasers have relinquished their 2793 // leases. 
2794 _, err := s.opts.BlockLeaseManager().UpdateOpenLeases(block.LeaseDescriptor{ 2795 Namespace: s.namespace.ID(), 2796 Shard: s.ID(), 2797 BlockStart: blockStart, 2798 }, block.LeaseState{Volume: nextVersion}) 2799 // After writing the full block successfully **and** propagating the new lease to the 2800 // BlockLeaseManager, update the ColdVersionRetrievable in the flush state. Once this function 2801 // completes concurrent ticks will be able to evict the data from memory that was just flushed 2802 // (which is now safe to do since the SeekerManager has been notified of the presence of new 2803 // files). 2804 // 2805 // NB(rartoul): Ideally the ColdVersionRetrievable would only be updated if the call to UpdateOpenLeases 2806 // succeeded, but that would allow the ColdVersionRetrievable and ColdVersionFlushed numbers to drift 2807 // which would increase the complexity of the code to address a situation that is probably not 2808 // recoverable (failure to UpdateOpenLeases is an invariant violated error). 2809 s.setFlushStateColdVersionRetrievable(blockStart, nextVersion) 2810 if err != nil { 2811 instrument.EmitAndLogInvariantViolation(s.opts.InstrumentOptions(), func(l *zap.Logger) { 2812 l.With( 2813 zap.String("namespace", s.namespace.ID().String()), 2814 zap.Uint32("shard", s.ID()), 2815 zap.Time("blockStart", blockStart.ToTime()), 2816 zap.Int("nextVersion", nextVersion), 2817 zap.Error(err), 2818 ).Error("failed to update open leases after updating flush state cold version") 2819 }) 2820 return err 2821 } 2822 return nil 2823 } 2824 2825 type shardColdFlushDone struct { 2826 startTime xtime.UnixNano 2827 nextVersion int 2828 close persist.DataCloser 2829 } 2830 2831 type shardColdFlush struct { 2832 shard *dbShard 2833 doneFns []shardColdFlushDone 2834 } 2835 2836 func (s shardColdFlush) Done() error { 2837 multiErr := xerrors.NewMultiError() 2838 for _, done := range s.doneFns { 2839 startTime := done.startTime 2840 nextVersion := done.nextVersion 2841 2842 if err := done.close(); err != nil { 2843 multiErr = multiErr.Add(err) 2844 continue 2845 } 2846 2847 err := s.shard.finishWriting(startTime, nextVersion, false) 2848 if err != nil { 2849 multiErr = multiErr.Add(err) 2850 } 2851 } 2852 return multiErr.FinalError() 2853 } 2854 2855 // dbShardFlushResult is a helper struct for keeping track of the result of flushing all the 2856 // series in the shard. 2857 type dbShardFlushResult struct { 2858 numBlockDoesNotExist int64 2859 } 2860 2861 func (r *dbShardFlushResult) update(u series.FlushOutcome) { 2862 if u == series.FlushOutcomeBlockDoesNotExist { 2863 r.numBlockDoesNotExist++ 2864 } 2865 }
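// Editorial note (illustrative sketch): dbShardFlushResult above simply counts
// how many series reported a missing block during a warm flush; every other
// flush outcome leaves the counter unchanged:
//
//	r := dbShardFlushResult{}
//	r.update(series.FlushOutcomeBlockDoesNotExist) // r.numBlockDoesNotExist == 1
//	r.update(series.FlushOutcomeBlockDoesNotExist) // r.numBlockDoesNotExist == 2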