github.com/m3db/m3@v1.5.0/src/dbnode/storage/index.go (about) 1 // Copyright (c) 2020 Uber Technologies, Inc. 2 // 3 // Permission is hereby granted, free of charge, to any person obtaining a copy 4 // of this software and associated documentation files (the "Software"), to deal 5 // in the Software without restriction, including without limitation the rights 6 // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 // copies of the Software, and to permit persons to whom the Software is 8 // furnished to do so, subject to the following conditions: 9 // 10 // The above copyright notice and this permission notice shall be included in 11 // all copies or substantial portions of the Software. 12 // 13 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 // THE SOFTWARE. 20 21 package storage 22 23 import ( 24 "bytes" 25 "errors" 26 "fmt" 27 "io" 28 "math" 29 goruntime "runtime" 30 "sort" 31 "strconv" 32 "sync" 33 "time" 34 35 "github.com/m3db/m3/src/dbnode/namespace" 36 "github.com/m3db/m3/src/dbnode/persist" 37 "github.com/m3db/m3/src/dbnode/persist/fs" 38 "github.com/m3db/m3/src/dbnode/retention" 39 "github.com/m3db/m3/src/dbnode/runtime" 40 "github.com/m3db/m3/src/dbnode/sharding" 41 "github.com/m3db/m3/src/dbnode/storage/block" 42 "github.com/m3db/m3/src/dbnode/storage/bootstrap/result" 43 m3dberrors "github.com/m3db/m3/src/dbnode/storage/errors" 44 "github.com/m3db/m3/src/dbnode/storage/index" 45 "github.com/m3db/m3/src/dbnode/storage/index/compaction" 46 "github.com/m3db/m3/src/dbnode/storage/index/convert" 47 "github.com/m3db/m3/src/dbnode/storage/limits" 48 "github.com/m3db/m3/src/dbnode/storage/limits/permits" 49 "github.com/m3db/m3/src/dbnode/storage/series" 50 "github.com/m3db/m3/src/dbnode/tracepoint" 51 "github.com/m3db/m3/src/dbnode/ts/writes" 52 "github.com/m3db/m3/src/m3ninx/doc" 53 "github.com/m3db/m3/src/m3ninx/idx" 54 m3ninxindex "github.com/m3db/m3/src/m3ninx/index" 55 "github.com/m3db/m3/src/m3ninx/index/segment" 56 "github.com/m3db/m3/src/m3ninx/index/segment/builder" 57 idxpersist "github.com/m3db/m3/src/m3ninx/persist" 58 "github.com/m3db/m3/src/m3ninx/x" 59 "github.com/m3db/m3/src/x/clock" 60 "github.com/m3db/m3/src/x/context" 61 xerrors "github.com/m3db/m3/src/x/errors" 62 "github.com/m3db/m3/src/x/ident" 63 "github.com/m3db/m3/src/x/instrument" 64 xopentracing "github.com/m3db/m3/src/x/opentracing" 65 xresource "github.com/m3db/m3/src/x/resource" 66 xtime "github.com/m3db/m3/src/x/time" 67 68 "github.com/m3db/bitset" 69 "github.com/opentracing/opentracing-go" 70 opentracinglog "github.com/opentracing/opentracing-go/log" 71 "github.com/uber-go/tally" 72 "go.uber.org/atomic" 73 "go.uber.org/zap" 74 ) 75 76 var ( 77 errDbIndexAlreadyClosed = errors.New("database index has already been closed") 78 errDbIndexUnableToWriteClosed = errors.New("unable to write to database index, already closed") 79 errDbIndexUnableToQueryClosed = errors.New("unable to query database index, already closed") 80 errDbIndexUnableToFlushClosed = errors.New("unable to flush database index, already closed") 81 errDbIndexUnableToCleanupClosed = errors.New("unable to cleanup database index, already closed") 82 errDbIndexTerminatingTickCancellation = errors.New("terminating tick early due to cancellation") 83 errDbIndexIsBootstrapping = errors.New("index is already bootstrapping") 84 errDbIndexDoNotIndexSeries = errors.New("series matched do not index fields") 85 ) 86 87 const ( 88 defaultFlushReadDataBlocksBatchSize = int64(4096) 89 nsIndexReportStatsInterval = 10 * time.Second 90 91 defaultFlushDocsBatchSize = 8192 92 ) 93 94 var allQuery = idx.NewAllQuery() 95 96 // nolint: maligned 97 type nsIndex struct { 98 state nsIndexState 99 100 // all the vars below this line are not modified past the ctor 101 // and don't require a lock when being accessed. 102 nowFn clock.NowFn 103 blockSize time.Duration 104 retentionPeriod time.Duration 105 futureRetentionPeriod time.Duration 106 bufferPast time.Duration 107 bufferFuture time.Duration 108 coldWritesEnabled bool 109 110 namespaceRuntimeOptsMgr namespace.RuntimeOptionsManager 111 indexFilesetsBeforeFn indexFilesetsBeforeFn 112 deleteFilesFn deleteFilesFn 113 readIndexInfoFilesFn readIndexInfoFilesFn 114 115 newBlockFn index.NewBlockFn 116 logger *zap.Logger 117 opts Options 118 nsMetadata namespace.Metadata 119 runtimeOptsListener xresource.SimpleCloser 120 runtimeNsOptsListener xresource.SimpleCloser 121 122 resultsPool index.QueryResultsPool 123 aggregateResultsPool index.AggregateResultsPool 124 125 permitsManager permits.Manager 126 127 // queriesWg tracks outstanding queries to ensure 128 // we wait for all queries to complete before actually closing 129 // blocks and other cleanup tasks on index close 130 queriesWg sync.WaitGroup 131 132 metrics nsIndexMetrics 133 134 // forwardIndexDice determines if an incoming index write should be dual 135 // written to the next block. 136 forwardIndexDice forwardIndexDice 137 138 doNotIndexWithFields []doc.Field 139 140 activeBlock index.Block 141 } 142 143 type nsIndexState struct { 144 sync.RWMutex // NB: guards all variables in this struct 145 146 closed bool 147 closeCh chan struct{} 148 bootstrapState BootstrapState 149 150 runtimeOpts nsIndexRuntimeOptions 151 152 insertQueue namespaceIndexInsertQueue 153 154 // NB: `latestBlock` v `blocksByTime`: blocksByTime contains all the blocks known to `nsIndex`. 155 // `latestBlock` refers to the block with greatest StartTime within blocksByTime. We do this 156 // to skip accessing the map blocksByTime in the vast majority of write/query requests. It's 157 // lazily updated, so it can point to an older element until a Tick()/write rotates it. 158 blocksByTime map[xtime.UnixNano]index.Block 159 latestBlock index.Block 160 161 // NB: `blockStartsDescOrder` contains the keys from the map `blocksByTime` in reverse 162 // chronological order. This is used at query time to enforce determinism about results 163 // returned. 164 // NB(r): Reference to this slice can be safely taken for iteration purposes 165 // for Query(..) since it is rebuilt each time and immutable once built. 166 blocksDescOrderImmutable []blockAndBlockStart 167 168 // shardsFilterID is set every time the shards change to correctly 169 // only return IDs that this node owns. 170 shardsFilterID func(ident.ID) bool 171 172 // shardFilteredForID is set every time the shards change to correctly 173 // only return IDs that this node owns, and the shard responsible for that ID. 174 shardFilteredForID func(id ident.ID) (uint32, bool) 175 176 shardsAssigned map[uint32]struct{} 177 } 178 179 type blockAndBlockStart struct { 180 block index.Block 181 blockStart xtime.UnixNano 182 } 183 184 // NB: nsIndexRuntimeOptions does not contain its own mutex as some of the variables 185 // are needed for each index write which already at least acquires read lock from 186 // nsIndex mutex, so to keep the lock acquisitions to a minimum these are protected 187 // under the same nsIndex mutex. 188 type nsIndexRuntimeOptions struct { 189 insertMode index.InsertMode 190 maxQuerySeriesLimit int64 191 maxQueryDocsLimit int64 192 } 193 194 // NB(prateek): the returned filesets are strictly before the given time, i.e. they 195 // live in the period (-infinity, exclusiveTime). 196 type indexFilesetsBeforeFn func(dir string, 197 nsID ident.ID, 198 exclusiveTime xtime.UnixNano, 199 ) ([]string, error) 200 201 type readIndexInfoFilesFn func(opts fs.ReadIndexInfoFilesOptions) []fs.ReadIndexInfoFileResult 202 203 type newNamespaceIndexOpts struct { 204 md namespace.Metadata 205 namespaceRuntimeOptsMgr namespace.RuntimeOptionsManager 206 shardSet sharding.ShardSet 207 opts Options 208 newIndexQueueFn newNamespaceIndexInsertQueueFn 209 newBlockFn index.NewBlockFn 210 } 211 212 // execBlockQueryFn executes a query against the given block whilst tracking state. 213 type execBlockQueryFn func( 214 ctx context.Context, 215 block index.Block, 216 permit permits.Permit, 217 iter index.ResultIterator, 218 opts index.QueryOptions, 219 state *asyncQueryExecState, 220 results index.BaseResults, 221 logFields []opentracinglog.Field, 222 ) 223 224 // newBlockIterFn returns a new ResultIterator for the query. 225 type newBlockIterFn func( 226 ctx context.Context, 227 block index.Block, 228 query index.Query, 229 results index.BaseResults, 230 ) (index.ResultIterator, error) 231 232 // asyncQueryExecState tracks the async execution errors for a query. 233 type asyncQueryExecState struct { 234 sync.RWMutex 235 multiErr xerrors.MultiError 236 waitCount atomic.Uint64 237 } 238 239 func (s *asyncQueryExecState) hasErr() bool { 240 s.RLock() 241 defer s.RUnlock() 242 return s.multiErr.NumErrors() > 0 243 } 244 245 func (s *asyncQueryExecState) addErr(err error) { 246 s.Lock() 247 s.multiErr = s.multiErr.Add(err) 248 s.Unlock() 249 } 250 251 func (s *asyncQueryExecState) incWaited(i int) { 252 s.waitCount.Add(uint64(i)) 253 } 254 255 func (s *asyncQueryExecState) waited() int { 256 return int(s.waitCount.Load()) 257 } 258 259 // newNamespaceIndex returns a new namespaceIndex for the provided namespace. 260 func newNamespaceIndex( 261 nsMD namespace.Metadata, 262 namespaceRuntimeOptsMgr namespace.RuntimeOptionsManager, 263 shardSet sharding.ShardSet, 264 opts Options, 265 ) (NamespaceIndex, error) { 266 return newNamespaceIndexWithOptions(newNamespaceIndexOpts{ 267 md: nsMD, 268 namespaceRuntimeOptsMgr: namespaceRuntimeOptsMgr, 269 shardSet: shardSet, 270 opts: opts, 271 newIndexQueueFn: newNamespaceIndexInsertQueue, 272 newBlockFn: index.NewBlock, 273 }) 274 } 275 276 // newNamespaceIndexWithInsertQueueFn is a ctor used in tests to override the insert queue. 277 func newNamespaceIndexWithInsertQueueFn( 278 nsMD namespace.Metadata, 279 namespaceRuntimeOptsMgr namespace.RuntimeOptionsManager, 280 shardSet sharding.ShardSet, 281 newIndexQueueFn newNamespaceIndexInsertQueueFn, 282 opts Options, 283 ) (NamespaceIndex, error) { 284 return newNamespaceIndexWithOptions(newNamespaceIndexOpts{ 285 md: nsMD, 286 namespaceRuntimeOptsMgr: namespaceRuntimeOptsMgr, 287 shardSet: shardSet, 288 opts: opts, 289 newIndexQueueFn: newIndexQueueFn, 290 newBlockFn: index.NewBlock, 291 }) 292 } 293 294 // newNamespaceIndexWithNewBlockFn is a ctor used in tests to inject blocks. 295 func newNamespaceIndexWithNewBlockFn( 296 nsMD namespace.Metadata, 297 namespaceRuntimeOptsMgr namespace.RuntimeOptionsManager, 298 shardSet sharding.ShardSet, 299 newBlockFn index.NewBlockFn, 300 opts Options, 301 ) (NamespaceIndex, error) { 302 return newNamespaceIndexWithOptions(newNamespaceIndexOpts{ 303 md: nsMD, 304 namespaceRuntimeOptsMgr: namespaceRuntimeOptsMgr, 305 shardSet: shardSet, 306 opts: opts, 307 newIndexQueueFn: newNamespaceIndexInsertQueue, 308 newBlockFn: newBlockFn, 309 }) 310 } 311 312 // newNamespaceIndexWithOptions returns a new namespaceIndex with the provided configuration options. 313 func newNamespaceIndexWithOptions( 314 newIndexOpts newNamespaceIndexOpts, 315 ) (NamespaceIndex, error) { 316 var ( 317 nsMD = newIndexOpts.md 318 shardSet = newIndexOpts.shardSet 319 indexOpts = newIndexOpts.opts.IndexOptions() 320 instrumentOpts = newIndexOpts.opts.InstrumentOptions() 321 newIndexQueueFn = newIndexOpts.newIndexQueueFn 322 newBlockFn = newIndexOpts.newBlockFn 323 coreFn = newIndexOpts.opts.CoreFn() 324 runtimeOptsMgr = newIndexOpts.opts.RuntimeOptionsManager() 325 ) 326 if err := indexOpts.Validate(); err != nil { 327 return nil, err 328 } 329 330 scope := instrumentOpts.MetricsScope(). 331 SubScope("dbindex"). 332 Tagged(map[string]string{ 333 "namespace": nsMD.ID().String(), 334 }) 335 instrumentOpts = instrumentOpts.SetMetricsScope(scope) 336 indexOpts = indexOpts.SetInstrumentOptions(instrumentOpts) 337 338 nowFn := indexOpts.ClockOptions().NowFn() 339 logger := indexOpts.InstrumentOptions().Logger() 340 341 var doNotIndexWithFields []doc.Field 342 if m := newIndexOpts.opts.DoNotIndexWithFieldsMap(); m != nil && len(m) != 0 { 343 for k, v := range m { 344 doNotIndexWithFields = append(doNotIndexWithFields, doc.Field{ 345 Name: []byte(k), 346 Value: []byte(v), 347 }) 348 } 349 } 350 351 idx := &nsIndex{ 352 state: nsIndexState{ 353 closeCh: make(chan struct{}), 354 runtimeOpts: nsIndexRuntimeOptions{ 355 insertMode: indexOpts.InsertMode(), // FOLLOWUP(prateek): wire to allow this to be tweaked at runtime 356 }, 357 blocksByTime: make(map[xtime.UnixNano]index.Block), 358 shardsAssigned: make(map[uint32]struct{}), 359 }, 360 361 nowFn: nowFn, 362 blockSize: nsMD.Options().IndexOptions().BlockSize(), 363 retentionPeriod: nsMD.Options().RetentionOptions().RetentionPeriod(), 364 futureRetentionPeriod: nsMD.Options().RetentionOptions().FutureRetentionPeriod(), 365 bufferPast: nsMD.Options().RetentionOptions().BufferPast(), 366 bufferFuture: nsMD.Options().RetentionOptions().BufferFuture(), 367 coldWritesEnabled: nsMD.Options().ColdWritesEnabled(), 368 369 namespaceRuntimeOptsMgr: newIndexOpts.namespaceRuntimeOptsMgr, 370 indexFilesetsBeforeFn: fs.IndexFileSetsBefore, 371 readIndexInfoFilesFn: fs.ReadIndexInfoFiles, 372 deleteFilesFn: fs.DeleteFiles, 373 374 newBlockFn: newBlockFn, 375 opts: newIndexOpts.opts, 376 logger: logger, 377 nsMetadata: nsMD, 378 379 resultsPool: indexOpts.QueryResultsPool(), 380 aggregateResultsPool: indexOpts.AggregateResultsPool(), 381 382 permitsManager: newIndexOpts.opts.PermitsOptions().IndexQueryPermitsManager(), 383 metrics: newNamespaceIndexMetrics(indexOpts, instrumentOpts), 384 385 doNotIndexWithFields: doNotIndexWithFields, 386 } 387 388 activeBlock, err := idx.newBlockFn(xtime.UnixNano(0), idx.nsMetadata, 389 index.BlockOptions{ActiveBlock: true}, idx.namespaceRuntimeOptsMgr, 390 idx.opts.IndexOptions()) 391 if err != nil { 392 return nil, idx.unableToAllocBlockInvariantError(err) 393 } 394 395 idx.activeBlock = activeBlock 396 397 // Assign shard set upfront. 398 idx.AssignShardSet(shardSet) 399 400 idx.runtimeOptsListener = runtimeOptsMgr.RegisterListener(idx) 401 idx.runtimeNsOptsListener = idx.namespaceRuntimeOptsMgr.RegisterListener(idx) 402 403 // set up forward index dice. 404 dice, err := newForwardIndexDice(newIndexOpts.opts) 405 if err != nil { 406 return nil, err 407 } 408 409 if dice.enabled { 410 logger.Info("namespace forward indexing configured", 411 zap.Stringer("namespace", nsMD.ID()), 412 zap.Bool("enabled", dice.enabled), 413 zap.Duration("threshold", dice.forwardIndexThreshold), 414 zap.Float64("rate", dice.forwardIndexDice.Rate())) 415 } else { 416 idxOpts := newIndexOpts.opts.IndexOptions() 417 logger.Info("namespace forward indexing not enabled", 418 zap.Stringer("namespace", nsMD.ID()), 419 zap.Bool("enabled", false), 420 zap.Float64("threshold", idxOpts.ForwardIndexThreshold()), 421 zap.Float64("probability", idxOpts.ForwardIndexProbability())) 422 } 423 424 idx.forwardIndexDice = dice 425 426 // allocate indexing queue and start it up. 427 queue := newIndexQueueFn(idx.writeBatches, nsMD, nowFn, coreFn, scope) 428 if err := queue.Start(); err != nil { 429 return nil, err 430 } 431 idx.state.insertQueue = queue 432 433 // allocate the current block to ensure we're able to index as soon as we return 434 currentBlock := xtime.ToUnixNano(nowFn()).Truncate(idx.blockSize) 435 idx.state.RLock() 436 _, err = idx.ensureBlockPresentWithRLock(currentBlock) 437 idx.state.RUnlock() 438 if err != nil { 439 return nil, err 440 } 441 442 // Report stats 443 go idx.reportStatsUntilClosed() 444 445 return idx, nil 446 } 447 448 func (i *nsIndex) SetRuntimeOptions(runtime.Options) { 449 } 450 451 func (i *nsIndex) SetNamespaceRuntimeOptions(opts namespace.RuntimeOptions) { 452 // We don't like to log from every single index segment that has 453 // settings updated so we log the changes here. 454 i.logger.Info("set namespace runtime index options", 455 zap.Stringer("namespace", i.nsMetadata.ID()), 456 zap.Any("writeIndexingPerCPUConcurrency", opts.WriteIndexingPerCPUConcurrency()), 457 zap.Any("flushIndexingPerCPUConcurrency", opts.FlushIndexingPerCPUConcurrency())) 458 } 459 460 func (i *nsIndex) reportStatsUntilClosed() { 461 ticker := time.NewTicker(nsIndexReportStatsInterval) 462 defer ticker.Stop() 463 464 for { 465 select { 466 case <-ticker.C: 467 err := i.reportStats() 468 if err != nil { 469 i.logger.Warn("could not report index stats", zap.Error(err)) 470 } 471 case <-i.state.closeCh: 472 return 473 } 474 } 475 } 476 477 type nsIndexCompactionLevelStats struct { 478 numSegments int64 479 numTotalDocs int64 480 } 481 482 func (i *nsIndex) reportStats() error { 483 i.state.RLock() 484 defer i.state.RUnlock() 485 486 foregroundLevels := i.metrics.blockMetrics.ForegroundSegments.Levels 487 foregroundLevelStats := make([]nsIndexCompactionLevelStats, len(foregroundLevels)) 488 489 backgroundLevels := i.metrics.blockMetrics.BackgroundSegments.Levels 490 backgroundLevelStats := make([]nsIndexCompactionLevelStats, len(backgroundLevels)) 491 492 flushedLevels := i.metrics.blockMetrics.FlushedSegments.Levels 493 flushedLevelStats := make([]nsIndexCompactionLevelStats, len(flushedLevels)) 494 495 minIndexConcurrency := 0 496 maxIndexConcurrency := 0 497 sumIndexConcurrency := 0 498 numIndexingStats := 0 499 reporter := index.NewBlockStatsReporter( 500 func(s index.BlockSegmentStats) { 501 var ( 502 levels []nsIndexBlocksSegmentsLevelMetrics 503 levelStats []nsIndexCompactionLevelStats 504 ) 505 switch s.Type { 506 case index.ActiveForegroundSegment: 507 levels = foregroundLevels 508 levelStats = foregroundLevelStats 509 case index.ActiveBackgroundSegment: 510 levels = backgroundLevels 511 levelStats = backgroundLevelStats 512 case index.FlushedSegment: 513 levels = flushedLevels 514 levelStats = flushedLevelStats 515 } 516 517 for i, l := range levels { 518 contained := s.Size >= l.MinSizeInclusive && s.Size < l.MaxSizeExclusive 519 if !contained { 520 continue 521 } 522 523 l.SegmentsAge.Record(s.Age) 524 levelStats[i].numSegments++ 525 levelStats[i].numTotalDocs += s.Size 526 527 break 528 } 529 }, 530 func(s index.BlockIndexingStats) { 531 first := numIndexingStats == 0 532 numIndexingStats++ 533 534 if first { 535 minIndexConcurrency = s.IndexConcurrency 536 maxIndexConcurrency = s.IndexConcurrency 537 sumIndexConcurrency = s.IndexConcurrency 538 return 539 } 540 541 if v := s.IndexConcurrency; v < minIndexConcurrency { 542 minIndexConcurrency = v 543 } 544 if v := s.IndexConcurrency; v > maxIndexConcurrency { 545 maxIndexConcurrency = v 546 } 547 sumIndexConcurrency += s.IndexConcurrency 548 }) 549 550 // iterate known blocks in a defined order of time (newest first) 551 // for debug log ordering 552 for _, b := range i.state.blocksDescOrderImmutable { 553 err := b.block.Stats(reporter) 554 if err == index.ErrUnableReportStatsBlockClosed { 555 // Closed blocks are temporarily in the list still 556 continue 557 } 558 if err != nil { 559 return err 560 } 561 } 562 // Active block should always be open. 563 if err := i.activeBlock.Stats(reporter); err != nil { 564 return err 565 } 566 567 // Update level stats. 568 for _, elem := range []struct { 569 levels []nsIndexBlocksSegmentsLevelMetrics 570 levelStats []nsIndexCompactionLevelStats 571 }{ 572 {foregroundLevels, foregroundLevelStats}, 573 {backgroundLevels, backgroundLevelStats}, 574 } { 575 for i, v := range elem.levelStats { 576 elem.levels[i].NumSegments.Update(float64(v.numSegments)) 577 elem.levels[i].NumTotalDocs.Update(float64(v.numTotalDocs)) 578 } 579 } 580 581 // Update the indexing stats. 582 i.metrics.indexingConcurrencyMin.Update(float64(minIndexConcurrency)) 583 i.metrics.indexingConcurrencyMax.Update(float64(maxIndexConcurrency)) 584 avgIndexConcurrency := float64(sumIndexConcurrency) / float64(numIndexingStats) 585 i.metrics.indexingConcurrencyAvg.Update(avgIndexConcurrency) 586 587 return nil 588 } 589 590 func (i *nsIndex) BlockStartForWriteTime(writeTime xtime.UnixNano) xtime.UnixNano { 591 return writeTime.Truncate(i.blockSize) 592 } 593 594 func (i *nsIndex) BlockForBlockStart(blockStart xtime.UnixNano) (index.Block, error) { 595 result, err := i.ensureBlockPresent(blockStart) 596 if err != nil { 597 return nil, err 598 } 599 return result.block, nil 600 } 601 602 // NB(prateek): including the call chains leading to this point: 603 // 604 // - For new entry (previously unseen in the shard): 605 // shard.WriteTagged() 606 // => shard.insertSeriesAsyncBatched() 607 // => shardInsertQueue.Insert() 608 // => shard.writeBatch() 609 // => index.WriteBatch() 610 // => indexQueue.Insert() 611 // => index.writeBatch() 612 // 613 // - For entry which exists in the shard, but needs indexing (either past 614 // the TTL or the last indexing hasn't happened/failed): 615 // shard.WriteTagged() 616 // => shard.insertSeriesForIndexingAsyncBatched() 617 // => shardInsertQueue.Insert() 618 // => shard.writeBatch() 619 // => index.Write() 620 // => indexQueue.Insert() 621 // => index.writeBatch() 622 623 func (i *nsIndex) WriteBatch( 624 batch *index.WriteBatch, 625 ) error { 626 // Filter anything with a pending index out before acquiring lock. 627 batch.MarkUnmarkedIfAlreadyIndexedSuccessAndFinalize() 628 if !batch.PendingAny() { 629 return nil 630 } 631 632 i.state.RLock() 633 if !i.isOpenWithRLock() { 634 i.state.RUnlock() 635 i.metrics.insertAfterClose.Inc(1) 636 err := errDbIndexUnableToWriteClosed 637 batch.MarkUnmarkedEntriesError(err) 638 return err 639 } 640 641 // NB(prateek): retrieving insertMode here while we have the RLock. 642 insertMode := i.state.runtimeOpts.insertMode 643 wg, err := i.state.insertQueue.InsertBatch(batch) 644 645 // release the lock because we don't need it past this point. 646 i.state.RUnlock() 647 648 // if we're unable to index, we still have to finalize the reference we hold. 649 if err != nil { 650 batch.MarkUnmarkedEntriesError(err) 651 return err 652 } 653 // once the write has been queued in the indexInsertQueue, it assumes 654 // responsibility for calling the resource hooks. 655 656 // wait/terminate depending on if we are indexing synchronously or not. 657 if insertMode != index.InsertAsync { 658 wg.Wait() 659 660 // Re-sort the batch by initial enqueue order 661 if numErrs := batch.NumErrs(); numErrs > 0 { 662 // Restore the sort order from when enqueued for the caller. 663 batch.SortByEnqueued() 664 return fmt.Errorf("check batch: %d insert errors", numErrs) 665 } 666 } 667 668 return nil 669 } 670 671 func (i *nsIndex) WritePending( 672 pending []writes.PendingIndexInsert, 673 ) error { 674 // Filter anything with a pending index out before acquiring lock. 675 incoming := pending 676 pending = pending[:0] 677 for j := range incoming { 678 t := i.BlockStartForWriteTime(incoming[j].Entry.Timestamp) 679 if incoming[j].Entry.OnIndexSeries.IfAlreadyIndexedMarkIndexSuccessAndFinalize(t) { 680 continue 681 } 682 // Continue to add this element. 683 pending = append(pending, incoming[j]) 684 } 685 if len(pending) == 0 { 686 return nil 687 } 688 689 i.state.RLock() 690 if !i.isOpenWithRLock() { 691 i.state.RUnlock() 692 i.metrics.insertAfterClose.Inc(1) 693 return errDbIndexUnableToWriteClosed 694 } 695 _, err := i.state.insertQueue.InsertPending(pending) 696 // release the lock because we don't need it past this point. 697 i.state.RUnlock() 698 699 return err 700 } 701 702 // WriteBatches is called by the indexInsertQueue. 703 func (i *nsIndex) writeBatches( 704 batch *index.WriteBatch, 705 ) { 706 // NB(prateek): we use a read lock to guard against mutation of the 707 // indexBlocks, mutations within the underlying blocks are guarded 708 // by primitives internal to it. 709 i.state.RLock() 710 if !i.isOpenWithRLock() { 711 i.state.RUnlock() 712 // NB(prateek): deliberately skip calling any of the `OnIndexFinalize` methods 713 // on the provided inserts to terminate quicker during shutdown. 714 return 715 } 716 var ( 717 now = xtime.ToUnixNano(i.nowFn()) 718 blockSize = i.blockSize 719 futureLimit = now.Add(1 * i.bufferFuture) 720 pastLimit = now.Add(-1 * i.bufferPast) 721 earliestBlockStartToRetain = retention.FlushTimeStartForRetentionPeriod(i.retentionPeriod, i.blockSize, now) 722 batchOptions = batch.Options() 723 forwardIndexDice = i.forwardIndexDice 724 forwardIndexEnabled = forwardIndexDice.enabled 725 total int 726 notSkipped int 727 forwardIndexHits int 728 forwardIndexMiss int 729 730 forwardIndexBatch *index.WriteBatch 731 ) 732 // NB(r): Release lock early to avoid writing batches impacting ticking 733 // speed, etc. 734 // Sometimes foreground compaction can take a long time during heavy inserts. 735 // Each lookup to ensureBlockPresent checks that index is still open, etc. 736 i.state.RUnlock() 737 738 if forwardIndexEnabled { 739 // NB(arnikola): Don't initialize forward index batch if forward indexing 740 // is not enabled. 741 forwardIndexBatch = index.NewWriteBatch(batchOptions) 742 } 743 744 // Ensure timestamp is not too old/new based on retention policies and that 745 // doc is valid. Add potential forward writes to the forwardWriteBatch. 746 batch.ForEach( 747 func(idx int, entry index.WriteBatchEntry, 748 d doc.Metadata, _ index.WriteBatchEntryResult) { 749 total++ 750 751 if len(i.doNotIndexWithFields) != 0 { 752 // This feature rarely used, do not optimize and just do n*m checks. 753 drop := true 754 for _, matchField := range i.doNotIndexWithFields { 755 matchedField := false 756 for _, actualField := range d.Fields { 757 if bytes.Equal(actualField.Name, matchField.Name) { 758 matchedField = bytes.Equal(actualField.Value, matchField.Value) 759 break 760 } 761 } 762 if !matchedField { 763 drop = false 764 break 765 } 766 } 767 if drop { 768 batch.MarkUnmarkedEntryError(errDbIndexDoNotIndexSeries, idx) 769 return 770 } 771 } 772 773 ts := entry.Timestamp 774 // NB(bodu): Always check first to see if the write is within retention. 775 if !ts.After(earliestBlockStartToRetain) { 776 batch.MarkUnmarkedEntryError(m3dberrors.ErrTooPast, idx) 777 return 778 } 779 780 if !futureLimit.After(ts) { 781 batch.MarkUnmarkedEntryError(m3dberrors.ErrTooFuture, idx) 782 return 783 } 784 785 if ts.Before(pastLimit) && !i.coldWritesEnabled { 786 // NB(bodu): We only mark entries as too far in the past if 787 // cold writes are not enabled. 788 batch.MarkUnmarkedEntryError(m3dberrors.ErrTooPast, idx) 789 return 790 } 791 792 if forwardIndexEnabled { 793 if forwardIndexDice.roll(ts) { 794 forwardIndexHits++ 795 forwardEntryTimestamp := ts.Truncate(blockSize).Add(blockSize) 796 if entry.OnIndexSeries.NeedsIndexUpdate(forwardEntryTimestamp) { 797 forwardIndexEntry := entry 798 forwardIndexEntry.Timestamp = forwardEntryTimestamp 799 t := i.BlockStartForWriteTime(forwardEntryTimestamp) 800 forwardIndexEntry.OnIndexSeries.OnIndexPrepare(t) 801 forwardIndexBatch.Append(forwardIndexEntry, d) 802 } 803 } else { 804 forwardIndexMiss++ 805 } 806 } 807 808 notSkipped++ 809 }) 810 811 if forwardIndexEnabled && forwardIndexBatch.Len() > 0 { 812 i.metrics.forwardIndexCounter.Inc(int64(forwardIndexBatch.Len())) 813 batch.AppendAll(forwardIndexBatch) 814 } 815 816 // Sort the inserts by which block they're applicable for, and do the inserts 817 // for each block, making sure to not try to insert any entries already marked 818 // with a result. 819 batch.ForEachUnmarkedBatchByBlockStart(i.writeBatchForBlockStart) 820 821 // Track index insertions. 822 // Note: attemptTotal should = attemptSkip + attemptWrite. 823 i.metrics.asyncInsertAttemptTotal.Inc(int64(total)) 824 i.metrics.asyncInsertAttemptSkip.Inc(int64(total - notSkipped)) 825 i.metrics.forwardIndexHits.Inc(int64(forwardIndexHits)) 826 i.metrics.forwardIndexMisses.Inc(int64(forwardIndexMiss)) 827 } 828 829 func (i *nsIndex) writeBatchForBlockStart( 830 blockStart xtime.UnixNano, batch *index.WriteBatch, 831 ) { 832 // NB(r): Capture pending entries so we can emit the latencies 833 pending := batch.PendingEntries() 834 numPending := len(pending) 835 836 // Track attempted write. 837 // Note: attemptTotal should = attemptSkip + attemptWrite. 838 i.metrics.asyncInsertAttemptWrite.Inc(int64(numPending)) 839 840 // i.e. we have the block and the inserts, perform the writes. 841 result, err := i.activeBlock.WriteBatch(batch) 842 843 // Record the end to end indexing latency. 844 now := i.nowFn() 845 for idx := range pending { 846 took := now.Sub(pending[idx].EnqueuedAt) 847 i.metrics.insertEndToEndLatency.Record(took) 848 } 849 850 // NB: we don't need to do anything to the OnIndexSeries refs in `inserts` at this point, 851 // the index.Block WriteBatch assumes responsibility for calling the appropriate methods. 852 if n := result.NumSuccess; n > 0 { 853 i.metrics.asyncInsertSuccess.Inc(n) 854 } 855 856 // Record mutable segments count foreground/background if latest block. 857 if stats := result.MutableSegmentsStats; !stats.Empty() { 858 i.metrics.latestBlockNumSegmentsForeground.Update(float64(stats.Foreground.NumSegments)) 859 i.metrics.latestBlockNumDocsForeground.Update(float64(stats.Foreground.NumDocs)) 860 i.metrics.latestBlockNumSegmentsBackground.Update(float64(stats.Background.NumSegments)) 861 i.metrics.latestBlockNumDocsBackground.Update(float64(stats.Background.NumDocs)) 862 } 863 864 // Allow for duplicate write errors since due to re-indexing races 865 // we may try to re-index a series more than once. 866 if err := i.sanitizeAllowDuplicatesWriteError(err); err != nil { 867 numErrors := numPending - int(result.NumSuccess) 868 if partialError, ok := err.(*m3ninxindex.BatchPartialError); ok { 869 // If it was a batch partial error we know exactly how many failed 870 // after filtering out for duplicate ID errors. 871 numErrors = len(partialError.Errs()) 872 } 873 i.metrics.asyncInsertErrors.Inc(int64(numErrors)) 874 i.logger.Error("error writing to index block", zap.Error(err)) 875 } 876 } 877 878 // Bootstrap bootstraps the index with the provided blocks. 879 func (i *nsIndex) Bootstrap( 880 bootstrapResults result.IndexResults, 881 ) error { 882 i.state.Lock() 883 if i.state.bootstrapState == Bootstrapping { 884 i.state.Unlock() 885 return errDbIndexIsBootstrapping 886 } 887 i.state.bootstrapState = Bootstrapping 888 i.state.Unlock() 889 890 i.state.RLock() 891 defer func() { 892 i.state.RUnlock() 893 i.state.Lock() 894 i.state.bootstrapState = Bootstrapped 895 i.state.Unlock() 896 }() 897 898 var multiErr xerrors.MultiError 899 for blockStart, blockResults := range bootstrapResults { 900 blockResult, err := i.ensureBlockPresentWithRLock(blockStart) 901 if err != nil { // should never happen 902 multiErr = multiErr.Add(i.unableToAllocBlockInvariantError(err)) 903 continue 904 } 905 if err := blockResult.block.AddResults(blockResults); err != nil { 906 multiErr = multiErr.Add(err) 907 } 908 } 909 910 return multiErr.FinalError() 911 } 912 913 func (i *nsIndex) Bootstrapped() bool { 914 i.state.RLock() 915 result := i.state.bootstrapState == Bootstrapped 916 i.state.RUnlock() 917 return result 918 } 919 920 func (i *nsIndex) Tick( 921 c context.Cancellable, 922 startTime xtime.UnixNano, 923 ) (namespaceIndexTickResult, error) { 924 var result namespaceIndexTickResult 925 926 // First collect blocks and acquire lock to remove those that need removing 927 // but then release lock so can Tick and do other expensive tasks 928 // such as notify of sealed blocks. 929 tickingBlocks, multiErr := i.tickingBlocks(startTime) 930 931 result.NumBlocks = int64(tickingBlocks.totalBlocks) 932 for _, block := range tickingBlocks.tickingBlocks { 933 if c.IsCancelled() { 934 multiErr = multiErr.Add(errDbIndexTerminatingTickCancellation) 935 return result, multiErr.FinalError() 936 } 937 938 blockTickResult, tickErr := block.Tick(c) 939 multiErr = multiErr.Add(tickErr) 940 result.NumSegments += blockTickResult.NumSegments 941 result.NumSegmentsBootstrapped += blockTickResult.NumSegmentsBootstrapped 942 result.NumSegmentsMutable += blockTickResult.NumSegmentsMutable 943 result.NumTotalDocs += blockTickResult.NumDocs 944 result.FreeMmap += blockTickResult.FreeMmap 945 } 946 947 blockTickResult, tickErr := tickingBlocks.activeBlock.Tick(c) 948 multiErr = multiErr.Add(tickErr) 949 result.NumSegments += blockTickResult.NumSegments 950 result.NumSegmentsBootstrapped += blockTickResult.NumSegmentsBootstrapped 951 result.NumSegmentsMutable += blockTickResult.NumSegmentsMutable 952 result.NumTotalDocs += blockTickResult.NumDocs 953 result.FreeMmap += blockTickResult.FreeMmap 954 955 i.metrics.tick.Inc(1) 956 957 return result, multiErr.FinalError() 958 } 959 960 type tickingBlocksResult struct { 961 totalBlocks int 962 activeBlock index.Block 963 tickingBlocks []index.Block 964 } 965 966 func (i *nsIndex) tickingBlocks( 967 startTime xtime.UnixNano, 968 ) (tickingBlocksResult, xerrors.MultiError) { 969 multiErr := xerrors.NewMultiError() 970 earliestBlockStartToRetain := retention.FlushTimeStartForRetentionPeriod( 971 i.retentionPeriod, i.blockSize, startTime) 972 973 i.state.Lock() 974 activeBlock := i.activeBlock 975 tickingBlocks := make([]index.Block, 0, len(i.state.blocksByTime)) 976 defer func() { 977 i.updateBlockStartsWithLock() 978 i.state.Unlock() 979 }() 980 981 for blockStart, block := range i.state.blocksByTime { 982 // Drop any blocks past the retention period. 983 if blockStart.Before(earliestBlockStartToRetain) { 984 multiErr = multiErr.Add(block.Close()) 985 delete(i.state.blocksByTime, blockStart) 986 continue 987 } 988 989 // Tick any blocks we're going to retain, but don't tick inline here 990 // we'll do this out of the block. 991 tickingBlocks = append(tickingBlocks, block) 992 993 // Seal any blocks that are sealable while holding lock (seal is fast). 994 if !blockStart.After(i.lastSealableBlockStart(startTime)) && !block.IsSealed() { 995 multiErr = multiErr.Add(block.Seal()) 996 } 997 } 998 999 return tickingBlocksResult{ 1000 totalBlocks: len(i.state.blocksByTime), 1001 activeBlock: activeBlock, 1002 tickingBlocks: tickingBlocks, 1003 }, multiErr 1004 } 1005 1006 func (i *nsIndex) WarmFlush( 1007 flush persist.IndexFlush, 1008 shards []databaseShard, 1009 ) error { 1010 if len(shards) == 0 { 1011 // No-op if no shards currently owned. 1012 return nil 1013 } 1014 1015 flushable, err := i.flushableBlocks(shards, series.WarmWrite) 1016 if err != nil { 1017 return err 1018 } 1019 1020 // Determine the current flush indexing concurrency. 1021 namespaceRuntimeOpts := i.namespaceRuntimeOptsMgr.Get() 1022 perCPUFraction := namespaceRuntimeOpts.FlushIndexingPerCPUConcurrencyOrDefault() 1023 cpus := math.Ceil(perCPUFraction * float64(goruntime.GOMAXPROCS(0))) 1024 concurrency := int(math.Max(1, cpus)) 1025 1026 builderOpts := i.opts.IndexOptions().SegmentBuilderOptions(). 1027 SetConcurrency(concurrency) 1028 1029 builder, err := builder.NewBuilderFromDocuments(builderOpts) 1030 if err != nil { 1031 return err 1032 } 1033 defer builder.Close() 1034 1035 // Emit concurrency, then reset gauge to zero to show time 1036 // active during flushing broken down per namespace. 1037 i.metrics.flushIndexingConcurrency.Update(float64(concurrency)) 1038 defer i.metrics.flushIndexingConcurrency.Update(0) 1039 1040 var evicted int 1041 for _, block := range flushable { 1042 immutableSegments, err := i.flushBlock(flush, block, shards, builder) 1043 if err != nil { 1044 return err 1045 } 1046 // Make a result that covers the entire time ranges for the 1047 // block for each shard 1048 fulfilled := result.NewShardTimeRangesFromRange(block.StartTime(), block.EndTime(), 1049 dbShards(shards).IDs()...) 1050 1051 // Add the results to the block. 1052 persistedSegments := make([]result.Segment, 0, len(immutableSegments)) 1053 for _, elem := range immutableSegments { 1054 persistedSegment := result.NewSegment(elem, true) 1055 persistedSegments = append(persistedSegments, persistedSegment) 1056 } 1057 blockResult := result.NewIndexBlock(persistedSegments, fulfilled) 1058 results := result.NewIndexBlockByVolumeType(block.StartTime()) 1059 results.SetBlock(idxpersist.DefaultIndexVolumeType, blockResult) 1060 if err := block.AddResults(results); err != nil { 1061 return err 1062 } 1063 1064 evicted++ 1065 1066 // It's now safe to remove the mutable segments as anything the block 1067 // held is covered by the owned shards we just read 1068 if err := block.EvictMutableSegments(); err != nil { 1069 // deliberately choosing to not mark this as an error as we have successfully 1070 // flushed any mutable data. 1071 i.logger.Warn("encountered error while evicting mutable segments for index block", 1072 zap.Error(err), 1073 zap.Time("blockStart", block.StartTime().ToTime()), 1074 ) 1075 } 1076 1077 for _, t := range i.blockStartsFromIndexBlockStart(block.StartTime()) { 1078 for _, s := range shards { 1079 s.MarkWarmIndexFlushStateSuccessOrError(t, err) 1080 } 1081 } 1082 } 1083 i.metrics.blocksEvictedMutableSegments.Inc(int64(evicted)) 1084 return nil 1085 } 1086 1087 func (i *nsIndex) ColdFlush(shards []databaseShard) (OnColdFlushDone, error) { 1088 if len(shards) == 0 { 1089 // No-op if no shards currently owned. 1090 return func() error { return nil }, nil 1091 } 1092 1093 flushable, err := i.flushableBlocks(shards, series.ColdWrite) 1094 if err != nil { 1095 return nil, err 1096 } 1097 // We only rotate cold mutable segments in phase I of cold flushing. 1098 for _, block := range flushable { 1099 if err := block.RotateColdMutableSegments(); err != nil { 1100 return nil, err 1101 } 1102 } 1103 // We can't immediately evict cold mutable segments so we return a callback to do so 1104 // when cold flush finishes. 1105 return func() error { 1106 multiErr := xerrors.NewMultiError() 1107 for _, block := range flushable { 1108 multiErr = multiErr.Add(block.EvictColdMutableSegments()) 1109 } 1110 return multiErr.FinalError() 1111 }, nil 1112 } 1113 1114 // WarmFlushBlockStarts returns all index blockStarts which have been flushed to disk. 1115 func (i *nsIndex) WarmFlushBlockStarts() []xtime.UnixNano { 1116 flushed := make([]xtime.UnixNano, 0) 1117 infoFiles := i.readInfoFilesAsMap() 1118 1119 for blockStart := range infoFiles { 1120 if i.hasIndexWarmFlushedToDisk(infoFiles, blockStart) { 1121 flushed = append(flushed, blockStart) 1122 } 1123 } 1124 return flushed 1125 } 1126 1127 // BackgroundCompact background compacts eligible segments. 1128 func (i *nsIndex) BackgroundCompact() { 1129 if i.activeBlock != nil { 1130 i.activeBlock.BackgroundCompact() 1131 } 1132 for _, b := range i.state.blocksByTime { 1133 b.BackgroundCompact() 1134 } 1135 } 1136 1137 func (i *nsIndex) readInfoFilesAsMap() map[xtime.UnixNano][]fs.ReadIndexInfoFileResult { 1138 fsOpts := i.opts.CommitLogOptions().FilesystemOptions() 1139 infoFiles := i.readIndexInfoFilesFn(fs.ReadIndexInfoFilesOptions{ 1140 FilePathPrefix: fsOpts.FilePathPrefix(), 1141 Namespace: i.nsMetadata.ID(), 1142 ReaderBufferSize: fsOpts.InfoReaderBufferSize(), 1143 }) 1144 result := make(map[xtime.UnixNano][]fs.ReadIndexInfoFileResult) 1145 for _, infoFile := range infoFiles { 1146 t := xtime.UnixNano(infoFile.Info.BlockStart) 1147 files := result[t] 1148 result[t] = append(files, infoFile) 1149 } 1150 return result 1151 } 1152 1153 func (i *nsIndex) flushableBlocks( 1154 shards []databaseShard, 1155 flushType series.WriteType, 1156 ) ([]index.Block, error) { 1157 i.state.RLock() 1158 defer i.state.RUnlock() 1159 if !i.isOpenWithRLock() { 1160 return nil, errDbIndexUnableToFlushClosed 1161 } 1162 // NB(bodu): We read index info files once here to avoid re-reading all of them 1163 // for each block. 1164 infoFiles := i.readInfoFilesAsMap() 1165 flushable := make([]index.Block, 0, len(i.state.blocksByTime)) 1166 1167 now := xtime.ToUnixNano(i.nowFn()) 1168 earliestBlockStartToRetain := retention.FlushTimeStartForRetentionPeriod(i.retentionPeriod, i.blockSize, now) 1169 currentBlockStart := now.Truncate(i.blockSize) 1170 // Check for flushable blocks by iterating through all block starts w/in retention. 1171 for blockStart := earliestBlockStartToRetain; blockStart.Before(currentBlockStart); blockStart = blockStart.Add(i.blockSize) { 1172 blockResult, err := i.ensureBlockPresentWithRLock(blockStart) 1173 if err != nil { 1174 return nil, err 1175 } 1176 1177 canFlush, err := i.canFlushBlockWithRLock(infoFiles, blockStart, 1178 blockResult.block, shards, flushType) 1179 if err != nil { 1180 return nil, err 1181 } 1182 if !canFlush { 1183 continue 1184 } 1185 1186 flushable = append(flushable, blockResult.block) 1187 } 1188 return flushable, nil 1189 } 1190 1191 func (i *nsIndex) canFlushBlockWithRLock( 1192 infoFiles map[xtime.UnixNano][]fs.ReadIndexInfoFileResult, 1193 blockStart xtime.UnixNano, 1194 block index.Block, 1195 shards []databaseShard, 1196 flushType series.WriteType, 1197 ) (bool, error) { 1198 switch flushType { 1199 case series.WarmWrite: 1200 // NB(bodu): We should always attempt to warm flush sealed blocks to disk if 1201 // there doesn't already exist data on disk. We're checking this instead of 1202 // `block.NeedsMutableSegmentsEvicted()` since bootstrap writes for cold block starts 1203 // get marked as warm writes if there doesn't already exist data on disk and need to 1204 // properly go through the warm flush lifecycle. 1205 if !block.IsSealed() || i.hasIndexWarmFlushedToDisk(infoFiles, blockStart) { 1206 return false, nil 1207 } 1208 case series.ColdWrite: 1209 if !block.NeedsColdMutableSegmentsEvicted() { 1210 return false, nil 1211 } 1212 } 1213 1214 // Check all data files exist for the shards we own 1215 for _, shard := range shards { 1216 if !shard.IsBootstrapped() { 1217 i.logger. 1218 With(zap.Uint32("shard", shard.ID())). 1219 Debug("skipping index cold flush due to shard not bootstrapped yet") 1220 continue 1221 } 1222 1223 for _, t := range i.blockStartsFromIndexBlockStart(blockStart) { 1224 flushState, err := shard.FlushState(t) 1225 if err != nil { 1226 return false, err 1227 } 1228 1229 // Skip if the data flushing failed. Data flushing precedes index flushing. 1230 if flushState.WarmStatus.DataFlushed != fileOpSuccess { 1231 return false, nil 1232 } 1233 } 1234 } 1235 1236 return true, nil 1237 } 1238 1239 // blockStartsFromIndexBlockStart returns the possibly many blocksStarts that exist within 1240 // a given index block (since index block size >= data block size) 1241 func (i *nsIndex) blockStartsFromIndexBlockStart(blockStart xtime.UnixNano) []xtime.UnixNano { 1242 start := blockStart 1243 end := blockStart.Add(i.blockSize) 1244 dataBlockSize := i.nsMetadata.Options().RetentionOptions().BlockSize() 1245 blockStarts := make([]xtime.UnixNano, 0) 1246 for t := start; t.Before(end); t = t.Add(dataBlockSize) { 1247 blockStarts = append(blockStarts, t) 1248 } 1249 return blockStarts 1250 } 1251 1252 func (i *nsIndex) hasIndexWarmFlushedToDisk( 1253 infoFiles map[xtime.UnixNano][]fs.ReadIndexInfoFileResult, 1254 blockStart xtime.UnixNano, 1255 ) bool { 1256 // NB(bodu): We consider the block to have been warm flushed if there are any 1257 // filesets on disk. This is consistent with the "has warm flushed" check in the db shard. 1258 // Shard block starts are marked as having warm flushed if an info file is successfully read from disk. 1259 f, ok := infoFiles[blockStart] 1260 if !ok { 1261 return false 1262 } 1263 1264 for _, fileInfo := range f { 1265 indexVolumeType := idxpersist.DefaultIndexVolumeType 1266 if fileInfo.Info.IndexVolumeType != nil { 1267 indexVolumeType = idxpersist.IndexVolumeType(fileInfo.Info.IndexVolumeType.Value) 1268 } 1269 match := fileInfo.ID.BlockStart == blockStart && indexVolumeType == idxpersist.DefaultIndexVolumeType 1270 if match { 1271 return true 1272 } 1273 } 1274 return false 1275 } 1276 1277 func (i *nsIndex) flushBlock( 1278 flush persist.IndexFlush, 1279 indexBlock index.Block, 1280 shards []databaseShard, 1281 builder segment.DocumentsBuilder, 1282 ) ([]segment.Segment, error) { 1283 allShards := make(map[uint32]struct{}) 1284 for _, shard := range shards { 1285 // Populate all shards 1286 allShards[shard.ID()] = struct{}{} 1287 } 1288 1289 volumeIndex, err := i.opts.IndexClaimsManager().ClaimNextIndexFileSetVolumeIndex( 1290 i.nsMetadata, 1291 indexBlock.StartTime(), 1292 ) 1293 if err != nil { 1294 return nil, fmt.Errorf("failed to claim next index volume index: %w", err) 1295 } 1296 1297 preparedPersist, err := flush.PrepareIndex(persist.IndexPrepareOptions{ 1298 NamespaceMetadata: i.nsMetadata, 1299 BlockStart: indexBlock.StartTime(), 1300 FileSetType: persist.FileSetFlushType, 1301 Shards: allShards, 1302 // NB(bodu): By default, we always write to the "default" index volume type. 1303 IndexVolumeType: idxpersist.DefaultIndexVolumeType, 1304 VolumeIndex: volumeIndex, 1305 }) 1306 if err != nil { 1307 return nil, err 1308 } 1309 1310 var closed bool 1311 defer func() { 1312 if !closed { 1313 segments, _ := preparedPersist.Close() 1314 // NB(r): Safe to for over a nil array so disregard error here. 1315 for _, segment := range segments { 1316 segment.Close() 1317 } 1318 } 1319 }() 1320 1321 // Flush a single block segment. 1322 if err := i.flushBlockSegment(preparedPersist, indexBlock, shards, builder); err != nil { 1323 return nil, err 1324 } 1325 1326 closed = true 1327 1328 // Now return the immutable segments 1329 return preparedPersist.Close() 1330 } 1331 1332 func (i *nsIndex) flushBlockSegment( 1333 preparedPersist persist.PreparedIndexPersist, 1334 indexBlock index.Block, 1335 shards []databaseShard, 1336 builder segment.DocumentsBuilder, 1337 ) error { 1338 // Reset the builder 1339 builder.Reset() 1340 1341 var ( 1342 batch = m3ninxindex.Batch{AllowPartialUpdates: true} 1343 batchSize = defaultFlushDocsBatchSize 1344 ) 1345 ctx := i.opts.ContextPool().Get() 1346 defer ctx.Close() 1347 1348 for _, shard := range shards { 1349 var ( 1350 first = true 1351 pageToken PageToken 1352 ) 1353 for first || pageToken != nil { 1354 first = false 1355 1356 var ( 1357 opts = block.FetchBlocksMetadataOptions{ 1358 // NB(bodu): There is a lag between when data gets flushed 1359 // to disk and when it gets removed from memory during the next 1360 // Tick. In this case, the same series can exist both on disk 1361 // and in memory at the same time resulting in dupe series IDs. 1362 // Only read data from disk when flushing index segments. 1363 OnlyDisk: true, 1364 } 1365 limit = defaultFlushReadDataBlocksBatchSize 1366 results block.FetchBlocksMetadataResults 1367 err error 1368 ) 1369 ctx.Reset() 1370 results, pageToken, err = shard.FetchBlocksMetadataV2(ctx, 1371 indexBlock.StartTime(), indexBlock.EndTime(), 1372 limit, pageToken, opts) 1373 if err != nil { 1374 return err 1375 } 1376 1377 // Reset docs batch before use. 1378 batch.Docs = batch.Docs[:0] 1379 for _, result := range results.Results() { 1380 doc, exists, err := shard.DocRef(result.ID) 1381 if err != nil { 1382 return err 1383 } 1384 if !exists { 1385 doc, err = convert.FromSeriesIDAndTagIter(result.ID, result.Tags) 1386 if err != nil { 1387 return err 1388 } 1389 i.metrics.flushDocsNew.Inc(1) 1390 } else { 1391 i.metrics.flushDocsCached.Inc(1) 1392 } 1393 1394 batch.Docs = append(batch.Docs, doc) 1395 if len(batch.Docs) < batchSize { 1396 continue 1397 } 1398 1399 err = i.sanitizeAllowDuplicatesWriteError(builder.InsertBatch(batch)) 1400 if err != nil { 1401 return err 1402 } 1403 1404 // Reset docs after insertions. 1405 batch.Docs = batch.Docs[:0] 1406 } 1407 1408 // Add last batch if remaining. 1409 if len(batch.Docs) > 0 { 1410 err := i.sanitizeAllowDuplicatesWriteError(builder.InsertBatch(batch)) 1411 if err != nil { 1412 return err 1413 } 1414 } 1415 1416 results.Close() 1417 1418 // Use BlockingCloseReset so that we can reuse the context without 1419 // it going back to the pool. 1420 ctx.BlockingCloseReset() 1421 } 1422 } 1423 1424 // Finally flush this segment 1425 return preparedPersist.Persist(builder) 1426 } 1427 1428 func (i *nsIndex) sanitizeAllowDuplicatesWriteError(err error) error { 1429 if err == nil { 1430 return nil 1431 } 1432 1433 // NB: dropping duplicate id error messages from logs as they're expected when we see 1434 // repeated inserts. as long as a block has an ID, it's not an error so we don't need 1435 // to pollute the logs with these messages. 1436 if partialError, ok := err.(*m3ninxindex.BatchPartialError); ok { 1437 err = partialError.FilterDuplicateIDErrors() 1438 } 1439 1440 return err 1441 } 1442 1443 func (i *nsIndex) AssignShardSet(shardSet sharding.ShardSet) { 1444 // NB(r): Allocate the filter function once, it can be used outside 1445 // of locks as it depends on no internal state. 1446 set := bitset.NewBitSet(uint(shardSet.Max())) 1447 assigned := make(map[uint32]struct{}) 1448 for _, shardID := range shardSet.AllIDs() { 1449 set.Set(uint(shardID)) 1450 assigned[shardID] = struct{}{} 1451 } 1452 1453 i.state.Lock() 1454 i.state.shardsFilterID = func(id ident.ID) bool { 1455 // NB(r): Use a bitset for fast lookups. 1456 return set.Test(uint(shardSet.Lookup(id))) 1457 } 1458 1459 i.state.shardFilteredForID = func(id ident.ID) (uint32, bool) { 1460 shard := shardSet.Lookup(id) 1461 return shard, set.Test(uint(shard)) 1462 } 1463 1464 i.state.shardsAssigned = assigned 1465 i.state.Unlock() 1466 } 1467 1468 func (i *nsIndex) shardsFilterID() func(id ident.ID) bool { 1469 i.state.RLock() 1470 v := i.state.shardsFilterID 1471 i.state.RUnlock() 1472 return v 1473 } 1474 1475 func (i *nsIndex) shardForID() func(id ident.ID) (uint32, bool) { 1476 i.state.RLock() 1477 v := i.state.shardFilteredForID 1478 i.state.RUnlock() 1479 return v 1480 } 1481 1482 func (i *nsIndex) Query( 1483 ctx context.Context, 1484 query index.Query, 1485 opts index.QueryOptions, 1486 ) (index.QueryResult, error) { 1487 var logFields []opentracinglog.Field 1488 ctx, sp, sampled := ctx.StartSampledTraceSpan(tracepoint.NSIdxQuery) 1489 defer sp.Finish() 1490 if sampled { 1491 // Only allocate metadata such as query string if sampling trace. 1492 logFields = []opentracinglog.Field{ 1493 opentracinglog.String("query", query.String()), 1494 opentracinglog.String("namespace", i.nsMetadata.ID().String()), 1495 opentracinglog.Int("seriesLimit", opts.SeriesLimit), 1496 opentracinglog.Int("docsLimit", opts.DocsLimit), 1497 xopentracing.Time("queryStart", opts.StartInclusive.ToTime()), 1498 xopentracing.Time("queryEnd", opts.EndExclusive.ToTime()), 1499 } 1500 sp.LogFields(logFields...) 1501 } 1502 1503 // Get results and set the namespace ID and size limit. 1504 results := i.resultsPool.Get() 1505 results.Reset(i.nsMetadata.ID(), index.QueryResultsOptions{ 1506 SizeLimit: opts.SeriesLimit, 1507 FilterID: i.shardsFilterID(), 1508 }) 1509 ctx.RegisterFinalizer(results) 1510 queryRes, err := i.query(ctx, query, results, opts, i.execBlockQueryFn, 1511 i.newBlockQueryIterFn, logFields) 1512 if err != nil { 1513 sp.LogFields(opentracinglog.Error(err)) 1514 return index.QueryResult{}, err 1515 } 1516 1517 return index.QueryResult{ 1518 Results: results, 1519 Exhaustive: queryRes.exhaustive, 1520 Waited: queryRes.waited, 1521 }, nil 1522 } 1523 1524 func (i *nsIndex) AggregateQuery( 1525 ctx context.Context, 1526 query index.Query, 1527 opts index.AggregationOptions, 1528 ) (index.AggregateQueryResult, error) { 1529 id := i.nsMetadata.ID() 1530 logFields := []opentracinglog.Field{ 1531 opentracinglog.String("query", query.String()), 1532 opentracinglog.String("namespace", id.String()), 1533 opentracinglog.Int("seriesLimit", opts.SeriesLimit), 1534 opentracinglog.Int("docsLimit", opts.DocsLimit), 1535 xopentracing.Time("queryStart", opts.StartInclusive.ToTime()), 1536 xopentracing.Time("queryEnd", opts.EndExclusive.ToTime()), 1537 } 1538 1539 ctx, sp := ctx.StartTraceSpan(tracepoint.NSIdxAggregateQuery) 1540 sp.LogFields(logFields...) 1541 defer sp.Finish() 1542 1543 metrics := index.NewAggregateUsageMetrics(id, i.opts.InstrumentOptions()) 1544 // Get results and set the filters, namespace ID and size limit. 1545 results := i.aggregateResultsPool.Get() 1546 aopts := index.AggregateResultsOptions{ 1547 SizeLimit: opts.SeriesLimit, 1548 DocsLimit: opts.DocsLimit, 1549 FieldFilter: opts.FieldFilter, 1550 Type: opts.Type, 1551 AggregateUsageMetrics: metrics, 1552 } 1553 ctx.RegisterFinalizer(results) 1554 // use appropriate fn to query underlying blocks. 1555 // use block.Aggregate() for querying and set the query if required. 1556 fn := i.execBlockAggregateQueryFn 1557 isAllQuery := query.Equal(allQuery) 1558 if !isAllQuery { 1559 if field, isFieldQuery := idx.FieldQuery(query.Query); isFieldQuery { 1560 aopts.FieldFilter = aopts.FieldFilter.AddIfMissing(field) 1561 } else { 1562 // Need to actually restrict whether we should return a term or not 1563 // based on running the actual query to resolve a postings list and 1564 // then seeing if that intersects the aggregated term postings list 1565 // at all. 1566 aopts.RestrictByQuery = &query 1567 } 1568 } 1569 aopts.FieldFilter = aopts.FieldFilter.SortAndDedupe() 1570 results.Reset(id, aopts) 1571 queryRes, err := i.query(ctx, query, results, opts.QueryOptions, fn, 1572 i.newBlockAggregatorIterFn, logFields) 1573 if err != nil { 1574 return index.AggregateQueryResult{}, err 1575 } 1576 return index.AggregateQueryResult{ 1577 Results: results, 1578 Exhaustive: queryRes.exhaustive, 1579 Waited: queryRes.waited, 1580 }, nil 1581 } 1582 1583 type queryResult struct { 1584 exhaustive bool 1585 waited int 1586 } 1587 1588 func (i *nsIndex) query( 1589 ctx context.Context, 1590 query index.Query, 1591 results index.BaseResults, 1592 opts index.QueryOptions, 1593 execBlockFn execBlockQueryFn, 1594 newBlockIterFn newBlockIterFn, 1595 logFields []opentracinglog.Field, 1596 ) (queryResult, error) { 1597 ctx, sp, sampled := ctx.StartSampledTraceSpan(tracepoint.NSIdxQueryHelper) 1598 sp.LogFields(logFields...) 1599 defer sp.Finish() 1600 if sampled { 1601 // Only log fields if sampled. 1602 sp.LogFields(logFields...) 1603 } 1604 1605 queryRes, err := i.queryWithSpan(ctx, query, results, opts, execBlockFn, 1606 newBlockIterFn, sp, logFields) 1607 if err != nil { 1608 sp.LogFields(opentracinglog.Error(err)) 1609 1610 if queryRes.exhaustive { 1611 i.metrics.queryExhaustiveInternalError.Inc(1) 1612 } else { 1613 i.metrics.queryNonExhaustiveInternalError.Inc(1) 1614 } 1615 return queryRes, err 1616 } 1617 1618 if queryRes.exhaustive { 1619 i.metrics.queryExhaustiveSuccess.Inc(1) 1620 return queryRes, nil 1621 } 1622 1623 // If require exhaustive but not, return error. 1624 if opts.RequireExhaustive { 1625 seriesCount := results.Size() 1626 docsCount := results.TotalDocsCount() 1627 if opts.SeriesLimitExceeded(seriesCount) { 1628 i.metrics.queryNonExhaustiveSeriesLimitError.Inc(1) 1629 } else if opts.DocsLimitExceeded(docsCount) { 1630 i.metrics.queryNonExhaustiveDocsLimitError.Inc(1) 1631 } else { 1632 i.metrics.queryNonExhaustiveLimitError.Inc(1) 1633 } 1634 1635 // NB(r): Make sure error is not retried and returns as bad request. 1636 return queryRes, xerrors.NewInvalidParamsError(limits.NewQueryLimitExceededError(fmt.Sprintf( 1637 "query exceeded limit: require_exhaustive=%v, series_limit=%d, series_matched=%d, docs_limit=%d, docs_matched=%d", 1638 opts.RequireExhaustive, 1639 opts.SeriesLimit, 1640 seriesCount, 1641 opts.DocsLimit, 1642 docsCount, 1643 ))) 1644 } 1645 1646 // Otherwise non-exhaustive but not required to be. 1647 i.metrics.queryNonExhaustiveSuccess.Inc(1) 1648 return queryRes, nil 1649 } 1650 1651 // blockIter is a composite type to hold various state about a block while iterating over the results. 1652 type blockIter struct { 1653 iter index.ResultIterator 1654 iterCloser io.Closer 1655 block index.Block 1656 waitTime time.Duration 1657 processingTime time.Duration 1658 } 1659 1660 func (i *nsIndex) queryWithSpan( 1661 ctx context.Context, 1662 query index.Query, 1663 results index.BaseResults, 1664 opts index.QueryOptions, 1665 execBlockFn execBlockQueryFn, 1666 newBlockIterFn newBlockIterFn, 1667 span opentracing.Span, 1668 logFields []opentracinglog.Field, 1669 ) (queryResult, error) { 1670 i.state.RLock() 1671 if !i.isOpenWithRLock() { 1672 i.state.RUnlock() 1673 return queryResult{}, errDbIndexUnableToQueryClosed 1674 } 1675 1676 // Track this as an inflight query that needs to finish 1677 // when the index is closed. 1678 i.queriesWg.Add(1) 1679 defer i.queriesWg.Done() 1680 1681 // Enact overrides for query options 1682 opts = i.overriddenOptsForQueryWithRLock(opts) 1683 1684 // Retrieve blocks to query, then we can release lock. 1685 // NB(r): Important not to block ticking, and other tasks by 1686 // holding the RLock during a query. 1687 qryRange := xtime.NewRanges(xtime.Range{ 1688 Start: opts.StartInclusive, 1689 End: opts.EndExclusive, 1690 }) 1691 // NB(r): Safe to take ref to i.state.blocksDescOrderImmutable since it's 1692 // immutable and we only create an iterator over it. 1693 blocks := newBlocksIterStackAlloc(i.activeBlock, i.state.blocksDescOrderImmutable, qryRange) 1694 1695 // Can now release the lock and execute the query without holding the lock. 1696 i.state.RUnlock() 1697 1698 var ( 1699 // State contains concurrent mutable state for async execution below. 1700 state = &asyncQueryExecState{} 1701 wg sync.WaitGroup 1702 ) 1703 perms, err := i.permitsManager.NewPermits(ctx) 1704 if err != nil { 1705 return queryResult{}, err 1706 } 1707 defer perms.Close() 1708 1709 var blockIters []*blockIter 1710 for b, ok := blocks.Next(); ok; b, ok = b.Next() { 1711 block := b.Current() 1712 iter, err := newBlockIterFn(ctx, block, query, results) 1713 if err != nil { 1714 return queryResult{}, err 1715 } 1716 blockIters = append(blockIters, &blockIter{ 1717 iter: iter, 1718 iterCloser: x.NewSafeCloser(iter), 1719 block: block, 1720 }) 1721 } 1722 1723 defer func() { 1724 for _, iter := range blockIters { 1725 // safe to call Close multiple times, so it's fine to eagerly close in the loop below and here. 1726 _ = iter.iterCloser.Close() 1727 } 1728 }() 1729 1730 // queryCanceled returns true if the query has been canceled and the current iteration should terminate. 1731 queryCanceled := func() bool { 1732 return opts.LimitsExceeded(results.Size(), results.TotalDocsCount()) || state.hasErr() 1733 } 1734 // waitForPermit waits for a permit. returns non-nil if the permit was acquired and the wait time. 1735 waitForPermit := func() (permits.Permit, time.Duration) { 1736 // make sure the query hasn't been canceled before waiting for a permit. 1737 if queryCanceled() { 1738 return nil, 0 1739 } 1740 1741 startWait := time.Now() 1742 acquireResult, err := perms.Acquire(ctx) 1743 waitTime := time.Since(startWait) 1744 var success bool 1745 defer func() { 1746 // Note: ALWAYS release if we do not successfully return back 1747 // the permit and we checked one out. 1748 if !success && acquireResult.Permit != nil { 1749 perms.Release(acquireResult.Permit) 1750 } 1751 }() 1752 if acquireResult.Waited { 1753 // Potentially break an error if require no wait set. 1754 if err == nil && opts.RequireNoWait { 1755 // Fail iteration if request requires no waiting occurs. 1756 err = permits.ErrOperationWaitedOnRequireNoWait 1757 } 1758 state.incWaited(1) 1759 } 1760 if err != nil { 1761 state.addErr(err) 1762 return nil, waitTime 1763 } 1764 1765 // make sure the query hasn't been canceled while waiting for a permit. 1766 if queryCanceled() { 1767 return nil, waitTime 1768 } 1769 1770 success = true 1771 return acquireResult.Permit, waitTime 1772 } 1773 1774 // We're looping through all the blocks that we need to query and kicking 1775 // off parallel queries which are bounded by the permits maximum 1776 // concurrency. It's possible at this point that we've completed querying one or more blocks and already exhausted 1777 // the maximum number of results that we're allowed to return. If thats the case, there is no value in kicking off 1778 // more parallel queries, so we break out of the loop. 1779 for _, blockIter := range blockIters { 1780 // Capture for async query execution below. 1781 blockIter := blockIter 1782 1783 // acquire a permit before kicking off the goroutine to process the iterator. this limits the number of 1784 // concurrent goroutines to # of permits + large queries that needed multiple iterations to finish. 1785 permit, waitTime := waitForPermit() 1786 blockIter.waitTime += waitTime 1787 if permit == nil { 1788 break 1789 } 1790 1791 wg.Add(1) 1792 // kick off a go routine to process the entire iterator. 1793 go func() { 1794 defer wg.Done() 1795 first := true 1796 for !blockIter.iter.Done() { 1797 // if this is not the first iteration of the iterator, need to acquire another permit. 1798 if !first { 1799 permit, waitTime = waitForPermit() 1800 blockIter.waitTime += waitTime 1801 if permit == nil { 1802 break 1803 } 1804 } 1805 blockLogFields := append(logFields, xopentracing.Duration("permitWaitTime", waitTime)) 1806 first = false 1807 startProcessing := time.Now() 1808 execBlockFn(ctx, blockIter.block, permit, blockIter.iter, opts, state, results, blockLogFields) 1809 processingTime := time.Since(startProcessing) 1810 blockIter.processingTime += processingTime 1811 permit.Use(int64(processingTime)) 1812 perms.Release(permit) 1813 } 1814 if first { 1815 // this should never happen since a new iter cannot be Done, but just to be safe. 1816 perms.Release(permit) 1817 } 1818 1819 // close the iterator since it's no longer needed. it's safe to call Close multiple times, here and in the 1820 // defer when the function returns. 1821 if err := blockIter.iterCloser.Close(); err != nil { 1822 state.addErr(err) 1823 } 1824 }() 1825 } 1826 1827 // wait for all workers to finish. if the caller cancels the call, the workers will be interrupted and eventually 1828 // finish. 1829 wg.Wait() 1830 1831 i.metrics.loadedDocsPerQuery.RecordValue(float64(results.TotalDocsCount())) 1832 1833 exhaustive := opts.Exhaustive(results.Size(), results.TotalDocsCount()) 1834 // ok to read state without lock since all parallel queries are done. 1835 multiErr := state.multiErr 1836 err = multiErr.FinalError() 1837 1838 return queryResult{ 1839 exhaustive: exhaustive, 1840 waited: state.waited(), 1841 }, err 1842 } 1843 1844 func (i *nsIndex) newBlockQueryIterFn( 1845 ctx context.Context, 1846 block index.Block, 1847 query index.Query, 1848 _ index.BaseResults, 1849 ) (index.ResultIterator, error) { 1850 return block.QueryIter(ctx, query) 1851 } 1852 1853 //nolint: dupl 1854 func (i *nsIndex) execBlockQueryFn( 1855 ctx context.Context, 1856 block index.Block, 1857 permit permits.Permit, 1858 iter index.ResultIterator, 1859 opts index.QueryOptions, 1860 state *asyncQueryExecState, 1861 results index.BaseResults, 1862 logFields []opentracinglog.Field, 1863 ) { 1864 logFields = append(logFields, 1865 xopentracing.Time("blockStart", block.StartTime().ToTime()), 1866 xopentracing.Time("blockEnd", block.EndTime().ToTime()), 1867 ) 1868 1869 ctx, sp := ctx.StartTraceSpan(tracepoint.NSIdxBlockQuery) 1870 sp.LogFields(logFields...) 1871 defer sp.Finish() 1872 1873 docResults, ok := results.(index.DocumentResults) 1874 if !ok { // should never happen 1875 state.addErr(fmt.Errorf("unknown results type [%T] received during query", results)) 1876 return 1877 } 1878 queryIter, ok := iter.(index.QueryIterator) 1879 if !ok { // should never happen 1880 state.addErr(fmt.Errorf("unknown results type [%T] received during query", iter)) 1881 return 1882 } 1883 1884 deadline := time.Now().Add(time.Duration(permit.AllowedQuota())) 1885 err := block.QueryWithIter(ctx, opts, queryIter, docResults, deadline, logFields) 1886 if err == index.ErrUnableToQueryBlockClosed { 1887 // NB(r): Because we query this block outside of the results lock, it's 1888 // possible this block may get closed if it slides out of retention, in 1889 // that case those results are no longer considered valid and outside of 1890 // retention regardless, so this is a non-issue. 1891 err = nil 1892 } 1893 1894 if err != nil { 1895 sp.LogFields(opentracinglog.Error(err)) 1896 state.addErr(err) 1897 } 1898 } 1899 1900 func (i *nsIndex) newBlockAggregatorIterFn( 1901 ctx context.Context, 1902 block index.Block, 1903 _ index.Query, 1904 results index.BaseResults, 1905 ) (index.ResultIterator, error) { 1906 aggResults, ok := results.(index.AggregateResults) 1907 if !ok { // should never happen 1908 return nil, fmt.Errorf("unknown results type [%T] received during aggregation", results) 1909 } 1910 return block.AggregateIter(ctx, aggResults.AggregateResultsOptions()) 1911 } 1912 1913 func (i *nsIndex) execBlockAggregateQueryFn( 1914 ctx context.Context, 1915 block index.Block, 1916 permit permits.Permit, 1917 iter index.ResultIterator, 1918 opts index.QueryOptions, 1919 state *asyncQueryExecState, 1920 results index.BaseResults, 1921 logFields []opentracinglog.Field, 1922 ) { 1923 logFields = append(logFields, 1924 xopentracing.Time("blockStart", block.StartTime().ToTime()), 1925 xopentracing.Time("blockEnd", block.EndTime().ToTime()), 1926 ) 1927 1928 ctx, sp := ctx.StartTraceSpan(tracepoint.NSIdxBlockAggregateQuery) 1929 sp.LogFields(logFields...) 1930 defer sp.Finish() 1931 1932 aggResults, ok := results.(index.AggregateResults) 1933 if !ok { // should never happen 1934 state.addErr(fmt.Errorf("unknown results type [%T] received during aggregation", results)) 1935 return 1936 } 1937 aggIter, ok := iter.(index.AggregateIterator) 1938 if !ok { // should never happen 1939 state.addErr(fmt.Errorf("unknown results type [%T] received during query", iter)) 1940 return 1941 } 1942 1943 deadline := time.Now().Add(time.Duration(permit.AllowedQuota())) 1944 err := block.AggregateWithIter(ctx, aggIter, opts, aggResults, deadline, logFields) 1945 if err == index.ErrUnableToQueryBlockClosed { 1946 // NB(r): Because we query this block outside of the results lock, it's 1947 // possible this block may get closed if it slides out of retention, in 1948 // that case those results are no longer considered valid and outside of 1949 // retention regardless, so this is a non-issue. 1950 err = nil 1951 } 1952 1953 if err != nil { 1954 sp.LogFields(opentracinglog.Error(err)) 1955 state.addErr(err) 1956 } 1957 } 1958 1959 func (i *nsIndex) overriddenOptsForQueryWithRLock( 1960 opts index.QueryOptions, 1961 ) index.QueryOptions { 1962 // Override query response limits if needed. 1963 if i.state.runtimeOpts.maxQuerySeriesLimit > 0 && (opts.SeriesLimit == 0 || 1964 int64(opts.SeriesLimit) > i.state.runtimeOpts.maxQuerySeriesLimit) { 1965 i.logger.Debug("overriding query response series limit", 1966 zap.Int("requested", opts.SeriesLimit), 1967 zap.Int64("maxAllowed", i.state.runtimeOpts.maxQuerySeriesLimit)) // FOLLOWUP(prateek): log query too once it's serializable. 1968 opts.SeriesLimit = int(i.state.runtimeOpts.maxQuerySeriesLimit) 1969 } 1970 if i.state.runtimeOpts.maxQueryDocsLimit > 0 && (opts.DocsLimit == 0 || 1971 int64(opts.DocsLimit) > i.state.runtimeOpts.maxQueryDocsLimit) { 1972 i.logger.Debug("overriding query response docs limit", 1973 zap.Int("requested", opts.DocsLimit), 1974 zap.Int64("maxAllowed", i.state.runtimeOpts.maxQueryDocsLimit)) // FOLLOWUP(prateek): log query too once it's serializable. 1975 opts.DocsLimit = int(i.state.runtimeOpts.maxQueryDocsLimit) 1976 } 1977 return opts 1978 } 1979 1980 type blockPresentResult struct { 1981 block index.Block 1982 latest bool 1983 } 1984 1985 func (i *nsIndex) ensureBlockPresent(blockStart xtime.UnixNano) (blockPresentResult, error) { 1986 i.state.RLock() 1987 defer i.state.RUnlock() 1988 if !i.isOpenWithRLock() { 1989 return blockPresentResult{}, errDbIndexUnableToWriteClosed 1990 } 1991 return i.ensureBlockPresentWithRLock(blockStart) 1992 } 1993 1994 func (i *nsIndex) isLatestBlockWithRLock(blockStart xtime.UnixNano) bool { 1995 return i.state.latestBlock != nil && i.state.latestBlock.StartTime().Equal(blockStart) 1996 } 1997 1998 // ensureBlockPresentWithRLock guarantees an index.Block exists for the specified 1999 // blockStart, allocating one if it does not. It returns the desired block, or 2000 // error if it's unable to do so. 2001 func (i *nsIndex) ensureBlockPresentWithRLock(blockStart xtime.UnixNano) (blockPresentResult, error) { 2002 // check if the current latest block matches the required block, this 2003 // is the usual path and can short circuit the rest of the logic in this 2004 // function in most cases. 2005 if i.isLatestBlockWithRLock(blockStart) { 2006 return blockPresentResult{ 2007 block: i.state.latestBlock, 2008 latest: true, 2009 }, nil 2010 } 2011 2012 // check if exists in the map (this can happen if the latestBlock has not 2013 // been rotated yet). 2014 if block, ok := i.state.blocksByTime[blockStart]; ok { 2015 return blockPresentResult{block: block}, nil 2016 } 2017 2018 // i.e. block start does not exist, so we have to alloc. 2019 // we release the RLock (the function is called with this lock), and acquire 2020 // the write lock to do the extra allocation. 2021 i.state.RUnlock() 2022 i.state.Lock() 2023 2024 // need to guarantee all exit paths from the function leave with the RLock 2025 // so we release the write lock and re-acquire a read lock. 2026 defer func() { 2027 i.state.Unlock() 2028 i.state.RLock() 2029 }() 2030 2031 // re-check if exists in the map (another routine did the alloc) 2032 if block, ok := i.state.blocksByTime[blockStart]; ok { 2033 return blockPresentResult{ 2034 block: block, 2035 latest: i.isLatestBlockWithRLock(blockStart), 2036 }, nil 2037 } 2038 2039 // ok now we know for sure we have to alloc 2040 block, err := i.newBlockFn(blockStart, i.nsMetadata, 2041 index.BlockOptions{}, i.namespaceRuntimeOptsMgr, i.opts.IndexOptions()) 2042 if err != nil { // unable to allocate the block, should never happen. 2043 return blockPresentResult{}, i.unableToAllocBlockInvariantError(err) 2044 } 2045 2046 // NB(bodu): Use same time barrier as `Tick` to make sealing of cold index blocks consistent. 2047 // We need to seal cold blocks write away for cold writes. 2048 if !blockStart.After(i.lastSealableBlockStart(xtime.ToUnixNano(i.nowFn()))) { 2049 if err := block.Seal(); err != nil { 2050 return blockPresentResult{}, err 2051 } 2052 } 2053 2054 // add to tracked blocks map 2055 i.state.blocksByTime[blockStart] = block 2056 2057 // update ordered blockStarts slice, and latestBlock 2058 i.updateBlockStartsWithLock() 2059 2060 return blockPresentResult{ 2061 block: block, 2062 latest: i.isLatestBlockWithRLock(blockStart), 2063 }, nil 2064 } 2065 2066 func (i *nsIndex) lastSealableBlockStart(t xtime.UnixNano) xtime.UnixNano { 2067 return retention.FlushTimeEndForBlockSize(i.blockSize, t.Add(-i.bufferPast)) 2068 } 2069 2070 func (i *nsIndex) updateBlockStartsWithLock() { 2071 // update ordered blockStarts slice 2072 var ( 2073 latestBlockStart xtime.UnixNano 2074 latestBlock index.Block 2075 ) 2076 2077 blocks := make([]blockAndBlockStart, 0, len(i.state.blocksByTime)+1) 2078 for ts, block := range i.state.blocksByTime { 2079 if ts >= latestBlockStart { 2080 latestBlockStart = ts 2081 latestBlock = block 2082 } 2083 blocks = append(blocks, blockAndBlockStart{ 2084 block: block, 2085 blockStart: ts, 2086 }) 2087 } 2088 2089 // order in desc order (i.e. reverse chronological) 2090 sort.Slice(blocks, func(i, j int) bool { 2091 return blocks[i].blockStart > blocks[j].blockStart 2092 }) 2093 2094 // NB(r): Important not to modify this once set since we take reference 2095 // to this slice with an RLock, release with RUnlock and then loop over it 2096 // during query time so it must not be altered and stay immutable. 2097 // This is done to avoid allocating a copy of the slice at query time for 2098 // each query. 2099 i.state.blocksDescOrderImmutable = blocks 2100 2101 // rotate latestBlock 2102 i.state.latestBlock = latestBlock 2103 } 2104 2105 func (i *nsIndex) isOpenWithRLock() bool { 2106 return !i.state.closed 2107 } 2108 2109 func (i *nsIndex) CleanupExpiredFileSets(t xtime.UnixNano) error { 2110 // we only expire data on drive that we don't hold a reference to, and is 2111 // past the expiration period. the earliest data we have to retain is given 2112 // by the following computation: 2113 // Min(FIRST_EXPIRED_BLOCK, EARLIEST_RETAINED_BLOCK) 2114 i.state.RLock() 2115 defer i.state.RUnlock() 2116 if i.state.closed { 2117 return errDbIndexUnableToCleanupClosed 2118 } 2119 2120 // earliest block to retain based on retention period 2121 earliestBlockStartToRetain := retention.FlushTimeStartForRetentionPeriod(i.retentionPeriod, i.blockSize, t) 2122 2123 // now we loop through the blocks we hold, to ensure we don't delete any data for them. 2124 for t := range i.state.blocksByTime { 2125 if t.Before(earliestBlockStartToRetain) { 2126 earliestBlockStartToRetain = t 2127 } 2128 } 2129 2130 // know the earliest block to retain, find all blocks earlier than it 2131 var ( 2132 pathPrefix = i.opts.CommitLogOptions().FilesystemOptions().FilePathPrefix() 2133 nsID = i.nsMetadata.ID() 2134 ) 2135 filesets, err := i.indexFilesetsBeforeFn(pathPrefix, nsID, earliestBlockStartToRetain) 2136 if err != nil { 2137 return err 2138 } 2139 2140 // and delete them 2141 return i.deleteFilesFn(filesets) 2142 } 2143 2144 func (i *nsIndex) CleanupCorruptedFileSets() error { 2145 /* 2146 Corrupted index filesets can be safely cleaned up if its not 2147 the latest volume index per index volume type/block start combo. 2148 2149 We are guaranteed not to be actively writing to an index fileset once 2150 we're already writing to later volume indices. 2151 */ 2152 fsOpts := i.opts.CommitLogOptions().FilesystemOptions() 2153 infoFiles := i.readIndexInfoFilesFn(fs.ReadIndexInfoFilesOptions{ 2154 FilePathPrefix: fsOpts.FilePathPrefix(), 2155 Namespace: i.nsMetadata.ID(), 2156 ReaderBufferSize: fsOpts.InfoReaderBufferSize(), 2157 IncludeCorrupted: true, 2158 }) 2159 2160 if len(infoFiles) == 0 { 2161 return nil 2162 } 2163 2164 var ( 2165 toDelete []string 2166 begin = 0 // marks the beginning of a subslice that contains filesets with same block starts 2167 ) 2168 // It's expected that info files are ordered by block start and volume index 2169 for j := range infoFiles { 2170 if infoFiles[begin].ID.BlockStart.Before(infoFiles[j].ID.BlockStart) { 2171 files, err := i.getCorruptedVolumesForDeletion(infoFiles[begin:j]) 2172 if err != nil { 2173 return err 2174 } 2175 toDelete = append(toDelete, files...) 2176 begin = j 2177 } else if infoFiles[begin].ID.BlockStart.After(infoFiles[j].ID.BlockStart) { 2178 errorMessage := "filesets are expected to be ordered by block start" 2179 instrument.EmitAndLogInvariantViolation(i.opts.InstrumentOptions(), func(l *zap.Logger) { 2180 l.Error(errorMessage) 2181 }) 2182 return instrument.InvariantErrorf(errorMessage) 2183 } 2184 } 2185 2186 // Process the volumes in the last block, which are not covered by the loop. 2187 files, err := i.getCorruptedVolumesForDeletion(infoFiles[begin:]) 2188 if err != nil { 2189 return err 2190 } 2191 toDelete = append(toDelete, files...) 2192 2193 return i.deleteFilesFn(toDelete) 2194 } 2195 2196 func (i *nsIndex) getCorruptedVolumesForDeletion(filesets []fs.ReadIndexInfoFileResult) ([]string, error) { 2197 if len(filesets) <= 1 { 2198 return nil, nil 2199 } 2200 2201 // Check for invariants. 2202 for j := 1; j < len(filesets); j++ { 2203 if !filesets[j-1].ID.BlockStart.Equal(filesets[j].ID.BlockStart) { 2204 errorMessage := "all the filesets passed to this function should have the same block start" 2205 instrument.EmitAndLogInvariantViolation(i.opts.InstrumentOptions(), func(l *zap.Logger) { 2206 l.Error(errorMessage) 2207 }) 2208 return nil, instrument.InvariantErrorf(errorMessage) 2209 } else if filesets[j-1].ID.VolumeIndex >= filesets[j].ID.VolumeIndex { 2210 errorMessage := "filesets should be ordered by volume index in increasing order" 2211 instrument.EmitAndLogInvariantViolation(i.opts.InstrumentOptions(), func(l *zap.Logger) { 2212 l.Error(errorMessage) 2213 }) 2214 return nil, instrument.InvariantErrorf(errorMessage) 2215 } 2216 } 2217 2218 toDelete := make([]string, 0) 2219 hasMoreRecentVolumeOfType := make(map[idxpersist.IndexVolumeType]struct{}) 2220 // Iterate filesets in reverse order to process higher volume indexes first. 2221 for j := len(filesets) - 1; j >= 0; j-- { 2222 f := filesets[j] 2223 2224 // NB: If the fileset info fields contains inconsistent information (e.g. block start inside 2225 // info file doesn't match the block start extracted from the filename), it means that info file 2226 // is missing or corrupted. Thus we cannot trust the information of this fileset 2227 // and we cannot be sure what's the actual volume type of it. However, a part of corrupted 2228 // fileset cleanup logic depends on knowing the volume type. 2229 // 2230 // Such fileset is deleted, except when it is the most recent volume in the block. 2231 // 2232 // The most recent volume is excluded because it is more likely to be actively written to. 2233 // If info file writes are not atomic, due to timing readers might observe the file 2234 // to be corrupted, even though at that moment the file is being written/re-written. 2235 if f.Corrupted && !f.ID.BlockStart.Equal(xtime.UnixNano(f.Info.BlockStart)) { 2236 if j != len(filesets)-1 { 2237 toDelete = append(toDelete, f.AbsoluteFilePaths...) 2238 } 2239 continue 2240 } 2241 2242 volType := idxpersist.DefaultIndexVolumeType 2243 if f.Info.IndexVolumeType != nil { 2244 volType = idxpersist.IndexVolumeType(f.Info.IndexVolumeType.Value) 2245 } 2246 // Delete corrupted filesets if there are more recent volumes with the same volume type. 2247 if _, ok := hasMoreRecentVolumeOfType[volType]; !ok { 2248 hasMoreRecentVolumeOfType[volType] = struct{}{} 2249 } else if f.Corrupted { 2250 toDelete = append(toDelete, f.AbsoluteFilePaths...) 2251 } 2252 } 2253 return toDelete, nil 2254 } 2255 2256 func (i *nsIndex) CleanupDuplicateFileSets(activeShards []uint32) error { 2257 fsOpts := i.opts.CommitLogOptions().FilesystemOptions() 2258 infoFiles := i.readIndexInfoFilesFn(fs.ReadIndexInfoFilesOptions{ 2259 FilePathPrefix: fsOpts.FilePathPrefix(), 2260 Namespace: i.nsMetadata.ID(), 2261 ReaderBufferSize: fsOpts.InfoReaderBufferSize(), 2262 }) 2263 2264 segmentsOrderByVolumeIndexByVolumeTypeAndBlockStart := make(map[xtime.UnixNano]map[idxpersist.IndexVolumeType][]fs.Segments) 2265 for _, file := range infoFiles { 2266 seg := fs.NewSegments(file.Info, file.ID.VolumeIndex, file.AbsoluteFilePaths) 2267 blockStart := seg.BlockStart() 2268 segmentsOrderByVolumeIndexByVolumeType, ok := segmentsOrderByVolumeIndexByVolumeTypeAndBlockStart[blockStart] 2269 if !ok { 2270 segmentsOrderByVolumeIndexByVolumeType = make(map[idxpersist.IndexVolumeType][]fs.Segments) 2271 segmentsOrderByVolumeIndexByVolumeTypeAndBlockStart[blockStart] = segmentsOrderByVolumeIndexByVolumeType 2272 } 2273 2274 volumeType := seg.VolumeType() 2275 if _, ok := segmentsOrderByVolumeIndexByVolumeType[volumeType]; !ok { 2276 segmentsOrderByVolumeIndexByVolumeType[volumeType] = make([]fs.Segments, 0) 2277 } 2278 segmentsOrderByVolumeIndexByVolumeType[volumeType] = append(segmentsOrderByVolumeIndexByVolumeType[volumeType], seg) 2279 } 2280 2281 // Ensure that segments are sorted by volume index. 2282 for _, segmentsOrderByVolumeIndexByVolumeType := range segmentsOrderByVolumeIndexByVolumeTypeAndBlockStart { 2283 for _, segs := range segmentsOrderByVolumeIndexByVolumeType { 2284 sort.SliceStable(segs, func(i, j int) bool { 2285 return segs[i].VolumeIndex() < segs[j].VolumeIndex() 2286 }) 2287 } 2288 } 2289 2290 multiErr := xerrors.NewMultiError() 2291 // Check for dupes and remove. 2292 filesToDelete := make([]string, 0) 2293 for _, segmentsOrderByVolumeIndexByVolumeType := range segmentsOrderByVolumeIndexByVolumeTypeAndBlockStart { 2294 for _, segmentsOrderByVolumeIndex := range segmentsOrderByVolumeIndexByVolumeType { 2295 segmentsToKeep := make([]fs.Segments, 0) 2296 for _, seg := range segmentsOrderByVolumeIndex { 2297 for len(segmentsToKeep) > 0 { 2298 idx := len(segmentsToKeep) - 1 2299 if previous := segmentsToKeep[idx]; seg.ShardTimeRanges().IsSuperset( 2300 previous.ShardTimeRanges().FilterShards(activeShards)) { 2301 filesToDelete = append(filesToDelete, previous.AbsoluteFilePaths()...) 2302 segmentsToKeep = segmentsToKeep[:idx] 2303 } else { 2304 break 2305 } 2306 } 2307 segmentsToKeep = append(segmentsToKeep, seg) 2308 } 2309 } 2310 } 2311 multiErr = multiErr.Add(i.deleteFilesFn(filesToDelete)) 2312 return multiErr.FinalError() 2313 } 2314 2315 func (i *nsIndex) DebugMemorySegments(opts DebugMemorySegmentsOptions) error { 2316 i.state.RLock() 2317 defer i.state.RUnlock() 2318 if i.state.closed { 2319 return errDbIndexAlreadyClosed 2320 } 2321 2322 ctx := context.NewBackground() 2323 defer ctx.Close() 2324 2325 // Create a new set of file system options to output to new directory. 2326 fsOpts := i.opts.CommitLogOptions(). 2327 FilesystemOptions(). 2328 SetFilePathPrefix(opts.OutputDirectory) 2329 2330 for _, block := range i.state.blocksByTime { 2331 segmentsData, err := block.MemorySegmentsData(ctx) 2332 if err != nil { 2333 return err 2334 } 2335 2336 for numSegment, segmentData := range segmentsData { 2337 indexWriter, err := fs.NewIndexWriter(fsOpts) 2338 if err != nil { 2339 return err 2340 } 2341 2342 fileSetID := fs.FileSetFileIdentifier{ 2343 FileSetContentType: persist.FileSetIndexContentType, 2344 Namespace: i.nsMetadata.ID(), 2345 BlockStart: block.StartTime(), 2346 VolumeIndex: numSegment, 2347 } 2348 openOpts := fs.IndexWriterOpenOptions{ 2349 Identifier: fileSetID, 2350 BlockSize: i.blockSize, 2351 FileSetType: persist.FileSetFlushType, 2352 Shards: i.state.shardsAssigned, 2353 IndexVolumeType: idxpersist.DefaultIndexVolumeType, 2354 } 2355 if err := indexWriter.Open(openOpts); err != nil { 2356 return err 2357 } 2358 2359 segWriter, err := idxpersist.NewFSTSegmentDataFileSetWriter(segmentData) 2360 if err != nil { 2361 return err 2362 } 2363 2364 if err := indexWriter.WriteSegmentFileSet(segWriter); err != nil { 2365 return err 2366 } 2367 2368 if err := indexWriter.Close(); err != nil { 2369 return err 2370 } 2371 } 2372 } 2373 2374 return nil 2375 } 2376 2377 func (i *nsIndex) Close() error { 2378 i.state.Lock() 2379 if !i.isOpenWithRLock() { 2380 i.state.Unlock() 2381 return errDbIndexAlreadyClosed 2382 } 2383 2384 i.state.closed = true 2385 close(i.state.closeCh) 2386 2387 var multiErr xerrors.MultiError 2388 multiErr = multiErr.Add(i.state.insertQueue.Stop()) 2389 2390 blocks := make([]index.Block, 0, len(i.state.blocksByTime)+1) 2391 for _, block := range i.state.blocksByTime { 2392 blocks = append(blocks, block) 2393 } 2394 blocks = append(blocks, i.activeBlock) 2395 2396 i.activeBlock = nil 2397 i.state.latestBlock = nil 2398 i.state.blocksByTime = nil 2399 i.state.blocksDescOrderImmutable = nil 2400 2401 if i.runtimeOptsListener != nil { 2402 i.runtimeOptsListener.Close() 2403 i.runtimeOptsListener = nil 2404 } 2405 2406 if i.runtimeNsOptsListener != nil { 2407 i.runtimeNsOptsListener.Close() 2408 i.runtimeNsOptsListener = nil 2409 } 2410 2411 // Can now unlock after collecting blocks to close and setting closed state. 2412 i.state.Unlock() 2413 2414 // Wait for inflight queries to finish before closing blocks, do this 2415 // outside of lock in case an inflight query needs to acquire a read lock 2416 // to finish but can't acquire it because close was holding the lock waiting 2417 // for queries to drain first. 2418 i.queriesWg.Wait() 2419 2420 for _, block := range blocks { 2421 multiErr = multiErr.Add(block.Close()) 2422 } 2423 2424 return multiErr.FinalError() 2425 } 2426 2427 func (i *nsIndex) unableToAllocBlockInvariantError(err error) error { 2428 ierr := fmt.Errorf("index unable to allocate block: %v", err) 2429 instrument.EmitAndLogInvariantViolation(i.opts.InstrumentOptions(), func(l *zap.Logger) { 2430 l.Error(ierr.Error()) 2431 }) 2432 return ierr 2433 } 2434 2435 type nsIndexMetrics struct { 2436 tick tally.Counter 2437 2438 asyncInsertAttemptTotal tally.Counter 2439 asyncInsertAttemptSkip tally.Counter 2440 asyncInsertAttemptWrite tally.Counter 2441 2442 asyncInsertSuccess tally.Counter 2443 asyncInsertErrors tally.Counter 2444 insertAfterClose tally.Counter 2445 queryAfterClose tally.Counter 2446 forwardIndexHits tally.Counter 2447 forwardIndexMisses tally.Counter 2448 forwardIndexCounter tally.Counter 2449 insertEndToEndLatency tally.Timer 2450 blocksEvictedMutableSegments tally.Counter 2451 blockMetrics nsIndexBlocksMetrics 2452 indexingConcurrencyMin tally.Gauge 2453 indexingConcurrencyMax tally.Gauge 2454 indexingConcurrencyAvg tally.Gauge 2455 flushIndexingConcurrency tally.Gauge 2456 flushDocsNew tally.Counter 2457 flushDocsCached tally.Counter 2458 latestBlockNumSegmentsForeground tally.Gauge 2459 latestBlockNumDocsForeground tally.Gauge 2460 latestBlockNumSegmentsBackground tally.Gauge 2461 latestBlockNumDocsBackground tally.Gauge 2462 2463 loadedDocsPerQuery tally.Histogram 2464 queryExhaustiveSuccess tally.Counter 2465 queryExhaustiveInternalError tally.Counter 2466 queryNonExhaustiveSuccess tally.Counter 2467 queryNonExhaustiveInternalError tally.Counter 2468 queryNonExhaustiveLimitError tally.Counter 2469 queryNonExhaustiveSeriesLimitError tally.Counter 2470 queryNonExhaustiveDocsLimitError tally.Counter 2471 } 2472 2473 func newNamespaceIndexMetrics( 2474 opts index.Options, 2475 iopts instrument.Options, 2476 ) nsIndexMetrics { 2477 const ( 2478 indexAttemptName = "index-attempt" 2479 forwardIndexName = "forward-index" 2480 indexingConcurrency = "indexing-concurrency" 2481 flushIndexingConcurrency = "flush-indexing-concurrency" 2482 ) 2483 scope := iopts.MetricsScope() 2484 blocksScope := scope.SubScope("blocks") 2485 m := nsIndexMetrics{ 2486 tick: scope.Counter("index-tick"), 2487 asyncInsertAttemptTotal: scope.Tagged(map[string]string{ 2488 "stage": "process", 2489 }).Counter(indexAttemptName), 2490 asyncInsertAttemptSkip: scope.Tagged(map[string]string{ 2491 "stage": "skip", 2492 }).Counter(indexAttemptName), 2493 asyncInsertAttemptWrite: scope.Tagged(map[string]string{ 2494 "stage": "write", 2495 }).Counter(indexAttemptName), 2496 asyncInsertSuccess: scope.Counter("index-success"), 2497 asyncInsertErrors: scope.Tagged(map[string]string{ 2498 "error_type": "async-insert", 2499 }).Counter("index-error"), 2500 insertAfterClose: scope.Tagged(map[string]string{ 2501 "error_type": "insert-closed", 2502 }).Counter("insert-after-close"), 2503 queryAfterClose: scope.Tagged(map[string]string{ 2504 "error_type": "query-closed", 2505 }).Counter("query-after-error"), 2506 forwardIndexHits: scope.Tagged(map[string]string{ 2507 "status": "hit", 2508 }).Counter(forwardIndexName), 2509 forwardIndexMisses: scope.Tagged(map[string]string{ 2510 "status": "miss", 2511 }).Counter(forwardIndexName), 2512 forwardIndexCounter: scope.Tagged(map[string]string{ 2513 "status": "count", 2514 }).Counter(forwardIndexName), 2515 insertEndToEndLatency: instrument.NewTimer(scope, 2516 "insert-end-to-end-latency", iopts.TimerOptions()), 2517 blocksEvictedMutableSegments: scope.Counter("blocks-evicted-mutable-segments"), 2518 blockMetrics: newNamespaceIndexBlocksMetrics(opts, blocksScope), 2519 indexingConcurrencyMin: scope.Tagged(map[string]string{ 2520 "stat": "min", 2521 }).Gauge(indexingConcurrency), 2522 indexingConcurrencyMax: scope.Tagged(map[string]string{ 2523 "stat": "max", 2524 }).Gauge(indexingConcurrency), 2525 indexingConcurrencyAvg: scope.Tagged(map[string]string{ 2526 "stat": "avg", 2527 }).Gauge(indexingConcurrency), 2528 flushIndexingConcurrency: scope.Gauge(flushIndexingConcurrency), 2529 flushDocsNew: scope.Tagged(map[string]string{ 2530 "status": "new", 2531 }).Counter("flush-docs"), 2532 flushDocsCached: scope.Tagged(map[string]string{ 2533 "status": "cached", 2534 }).Counter("flush-docs"), 2535 latestBlockNumSegmentsForeground: scope.Tagged(map[string]string{ 2536 "segment_type": "foreground", 2537 }).Gauge("latest-block-num-segments"), 2538 latestBlockNumDocsForeground: scope.Tagged(map[string]string{ 2539 "segment_type": "foreground", 2540 }).Gauge("latest-block-num-docs"), 2541 latestBlockNumSegmentsBackground: scope.Tagged(map[string]string{ 2542 "segment_type": "background", 2543 }).Gauge("latest-block-num-segments"), 2544 latestBlockNumDocsBackground: scope.Tagged(map[string]string{ 2545 "segment_type": "background", 2546 }).Gauge("latest-block-num-docs"), 2547 loadedDocsPerQuery: scope.Histogram( 2548 "loaded-docs-per-query", 2549 tally.MustMakeExponentialValueBuckets(10, 2, 16), 2550 ), 2551 queryExhaustiveSuccess: scope.Tagged(map[string]string{ 2552 "exhaustive": "true", 2553 "result": "success", 2554 }).Counter("query"), 2555 queryExhaustiveInternalError: scope.Tagged(map[string]string{ 2556 "exhaustive": "true", 2557 "result": "error_internal", 2558 }).Counter("query"), 2559 queryNonExhaustiveSuccess: scope.Tagged(map[string]string{ 2560 "exhaustive": "false", 2561 "result": "success", 2562 }).Counter("query"), 2563 queryNonExhaustiveInternalError: scope.Tagged(map[string]string{ 2564 "exhaustive": "false", 2565 "result": "error_internal", 2566 }).Counter("query"), 2567 queryNonExhaustiveLimitError: scope.Tagged(map[string]string{ 2568 "exhaustive": "false", 2569 "result": "error_require_exhaustive", 2570 }).Counter("query"), 2571 queryNonExhaustiveSeriesLimitError: scope.Tagged(map[string]string{ 2572 "exhaustive": "false", 2573 "result": "error_series_require_exhaustive", 2574 }).Counter("query"), 2575 queryNonExhaustiveDocsLimitError: scope.Tagged(map[string]string{ 2576 "exhaustive": "false", 2577 "result": "error_docs_require_exhaustive", 2578 }).Counter("query"), 2579 } 2580 2581 // Initialize gauges that should default to zero before 2582 // returning results so that they are exported with an 2583 // explicit zero value at process startup. 2584 m.flushIndexingConcurrency.Update(0) 2585 2586 return m 2587 } 2588 2589 type nsIndexBlocksMetrics struct { 2590 ForegroundSegments nsIndexBlocksSegmentsMetrics 2591 BackgroundSegments nsIndexBlocksSegmentsMetrics 2592 FlushedSegments nsIndexBlocksSegmentsMetrics 2593 } 2594 2595 func newNamespaceIndexBlocksMetrics( 2596 opts index.Options, 2597 scope tally.Scope, 2598 ) nsIndexBlocksMetrics { 2599 return nsIndexBlocksMetrics{ 2600 ForegroundSegments: newNamespaceIndexBlocksSegmentsMetrics( 2601 opts.ForegroundCompactionPlannerOptions(), 2602 scope.Tagged(map[string]string{ 2603 "segment-type": "foreground", 2604 })), 2605 BackgroundSegments: newNamespaceIndexBlocksSegmentsMetrics( 2606 opts.BackgroundCompactionPlannerOptions(), 2607 scope.Tagged(map[string]string{ 2608 "segment-type": "background", 2609 })), 2610 FlushedSegments: newNamespaceIndexBlocksSegmentsMetrics( 2611 opts.BackgroundCompactionPlannerOptions(), 2612 scope.Tagged(map[string]string{ 2613 "segment-type": "flushed", 2614 })), 2615 } 2616 } 2617 2618 type nsIndexBlocksSegmentsMetrics struct { 2619 Levels []nsIndexBlocksSegmentsLevelMetrics 2620 } 2621 2622 type nsIndexBlocksSegmentsLevelMetrics struct { 2623 MinSizeInclusive int64 2624 MaxSizeExclusive int64 2625 NumSegments tally.Gauge 2626 NumTotalDocs tally.Gauge 2627 SegmentsAge tally.Timer 2628 } 2629 2630 func newNamespaceIndexBlocksSegmentsMetrics( 2631 compactionOpts compaction.PlannerOptions, 2632 scope tally.Scope, 2633 ) nsIndexBlocksSegmentsMetrics { 2634 segmentLevelsScope := scope.SubScope("segment-levels") 2635 levels := make([]nsIndexBlocksSegmentsLevelMetrics, 0, len(compactionOpts.Levels)) 2636 for _, level := range compactionOpts.Levels { 2637 subScope := segmentLevelsScope.Tagged(map[string]string{ 2638 "level-min-size": strconv.Itoa(int(level.MinSizeInclusive)), 2639 "level-max-size": strconv.Itoa(int(level.MaxSizeExclusive)), 2640 }) 2641 levels = append(levels, nsIndexBlocksSegmentsLevelMetrics{ 2642 MinSizeInclusive: level.MinSizeInclusive, 2643 MaxSizeExclusive: level.MaxSizeExclusive, 2644 NumSegments: subScope.Gauge("num-segments"), 2645 NumTotalDocs: subScope.Gauge("num-total-docs"), 2646 SegmentsAge: subScope.Timer("segments-age"), 2647 }) 2648 } 2649 2650 return nsIndexBlocksSegmentsMetrics{ 2651 Levels: levels, 2652 } 2653 } 2654 2655 type dbShards []databaseShard 2656 2657 func (shards dbShards) IDs() []uint32 { 2658 ids := make([]uint32, 0, len(shards)) 2659 for _, s := range shards { 2660 ids = append(ids, s.ID()) 2661 } 2662 return ids 2663 } 2664 2665 // blocksIterStackAlloc is a stack allocated block iterator, ensuring no 2666 // allocations per query. 2667 type blocksIterStackAlloc struct { 2668 activeBlock index.Block 2669 blocks []blockAndBlockStart 2670 queryRanges xtime.Ranges 2671 idx int 2672 } 2673 2674 func newBlocksIterStackAlloc( 2675 activeBlock index.Block, 2676 blocks []blockAndBlockStart, 2677 queryRanges xtime.Ranges, 2678 ) blocksIterStackAlloc { 2679 return blocksIterStackAlloc{ 2680 activeBlock: activeBlock, 2681 blocks: blocks, 2682 queryRanges: queryRanges, 2683 idx: -2, 2684 } 2685 } 2686 2687 func (i blocksIterStackAlloc) Next() (blocksIterStackAlloc, bool) { 2688 iter := i 2689 2690 for { 2691 iter.idx++ 2692 if iter.idx == -1 { 2693 // This will return the active block. 2694 return iter, true 2695 } 2696 2697 // No more ranges to query, perform this second so that 2698 // the in memory block always returns results. 2699 if i.queryRanges.IsEmpty() { 2700 return iter, false 2701 } 2702 2703 if iter.idx >= len(i.blocks) { 2704 return iter, false 2705 } 2706 2707 block := i.blocks[iter.idx].block 2708 2709 // Ensure the block has data requested by the query. 2710 blockRange := xtime.Range{ 2711 Start: block.StartTime(), 2712 End: block.EndTime(), 2713 } 2714 if !i.queryRanges.Overlaps(blockRange) { 2715 continue 2716 } 2717 2718 // Remove this range from the query range. 2719 i.queryRanges.RemoveRange(blockRange) 2720 2721 return iter, true 2722 } 2723 } 2724 2725 func (i blocksIterStackAlloc) Current() index.Block { 2726 if i.idx == -1 { 2727 return i.activeBlock 2728 } 2729 return i.blocks[i.idx].block 2730 }