github.com/m3db/m3@v1.5.1-0.20231129193456-75a402aa583b/src/dbnode/storage/index.go (about) 1 // Copyright (c) 2020 Uber Technologies, Inc. 2 // 3 // Permission is hereby granted, free of charge, to any person obtaining a copy 4 // of this software and associated documentation files (the "Software"), to deal 5 // in the Software without restriction, including without limitation the rights 6 // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 // copies of the Software, and to permit persons to whom the Software is 8 // furnished to do so, subject to the following conditions: 9 // 10 // The above copyright notice and this permission notice shall be included in 11 // all copies or substantial portions of the Software. 12 // 13 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 // THE SOFTWARE. 20 21 package storage 22 23 import ( 24 "bytes" 25 "errors" 26 "fmt" 27 "io" 28 "math" 29 goruntime "runtime" 30 "sort" 31 "strconv" 32 "sync" 33 "time" 34 35 "github.com/m3db/m3/src/dbnode/namespace" 36 "github.com/m3db/m3/src/dbnode/persist" 37 "github.com/m3db/m3/src/dbnode/persist/fs" 38 "github.com/m3db/m3/src/dbnode/retention" 39 "github.com/m3db/m3/src/dbnode/runtime" 40 "github.com/m3db/m3/src/dbnode/sharding" 41 "github.com/m3db/m3/src/dbnode/storage/block" 42 "github.com/m3db/m3/src/dbnode/storage/bootstrap/result" 43 m3dberrors "github.com/m3db/m3/src/dbnode/storage/errors" 44 "github.com/m3db/m3/src/dbnode/storage/index" 45 "github.com/m3db/m3/src/dbnode/storage/index/compaction" 46 "github.com/m3db/m3/src/dbnode/storage/index/convert" 47 "github.com/m3db/m3/src/dbnode/storage/limits" 48 "github.com/m3db/m3/src/dbnode/storage/limits/permits" 49 "github.com/m3db/m3/src/dbnode/storage/series" 50 "github.com/m3db/m3/src/dbnode/tracepoint" 51 "github.com/m3db/m3/src/dbnode/ts/writes" 52 "github.com/m3db/m3/src/m3ninx/doc" 53 "github.com/m3db/m3/src/m3ninx/idx" 54 m3ninxindex "github.com/m3db/m3/src/m3ninx/index" 55 "github.com/m3db/m3/src/m3ninx/index/segment" 56 "github.com/m3db/m3/src/m3ninx/index/segment/builder" 57 idxpersist "github.com/m3db/m3/src/m3ninx/persist" 58 "github.com/m3db/m3/src/m3ninx/x" 59 "github.com/m3db/m3/src/x/clock" 60 "github.com/m3db/m3/src/x/context" 61 xerrors "github.com/m3db/m3/src/x/errors" 62 "github.com/m3db/m3/src/x/ident" 63 "github.com/m3db/m3/src/x/instrument" 64 xopentracing "github.com/m3db/m3/src/x/opentracing" 65 xresource "github.com/m3db/m3/src/x/resource" 66 xtime "github.com/m3db/m3/src/x/time" 67 68 "github.com/m3db/bitset" 69 "github.com/opentracing/opentracing-go" 70 opentracinglog "github.com/opentracing/opentracing-go/log" 71 "github.com/uber-go/tally" 72 "go.uber.org/atomic" 73 "go.uber.org/zap" 74 ) 75 76 var ( 77 errDbIndexAlreadyClosed = errors.New("database index has already been closed") 78 errDbIndexUnableToWriteClosed = errors.New("unable to write to database index, already closed") 79 errDbIndexUnableToQueryClosed = errors.New("unable to query database index, already closed") 80 errDbIndexUnableToFlushClosed = errors.New("unable to flush database index, already closed") 81 errDbIndexUnableToCleanupClosed = errors.New("unable to cleanup database index, already closed") 82 errDbIndexTerminatingTickCancellation = errors.New("terminating tick early due to cancellation") 83 errDbIndexIsBootstrapping = errors.New("index is already bootstrapping") 84 errDbIndexDoNotIndexSeries = errors.New("series matched do not index fields") 85 ) 86 87 const ( 88 defaultFlushReadDataBlocksBatchSize = int64(4096) 89 nsIndexReportStatsInterval = 10 * time.Second 90 91 defaultFlushDocsBatchSize = 8192 92 ) 93 94 var allQuery = idx.NewAllQuery() 95 96 // nolint: maligned 97 type nsIndex struct { 98 state nsIndexState 99 100 // all the vars below this line are not modified past the ctor 101 // and don't require a lock when being accessed. 102 nowFn clock.NowFn 103 blockSize time.Duration 104 retentionPeriod time.Duration 105 futureRetentionPeriod time.Duration 106 bufferPast time.Duration 107 bufferFuture time.Duration 108 coldWritesEnabled bool 109 110 namespaceRuntimeOptsMgr namespace.RuntimeOptionsManager 111 indexFilesetsBeforeFn indexFilesetsBeforeFn 112 deleteFilesFn deleteFilesFn 113 readIndexInfoFilesFn readIndexInfoFilesFn 114 115 newBlockFn index.NewBlockFn 116 logger *zap.Logger 117 opts Options 118 nsMetadata namespace.Metadata 119 runtimeOptsListener xresource.SimpleCloser 120 runtimeNsOptsListener xresource.SimpleCloser 121 122 resultsPool index.QueryResultsPool 123 aggregateResultsPool index.AggregateResultsPool 124 125 permitsManager permits.Manager 126 127 // queriesWg tracks outstanding queries to ensure 128 // we wait for all queries to complete before actually closing 129 // blocks and other cleanup tasks on index close 130 queriesWg sync.WaitGroup 131 132 metrics nsIndexMetrics 133 134 // forwardIndexDice determines if an incoming index write should be dual 135 // written to the next block. 136 forwardIndexDice forwardIndexDice 137 138 doNotIndexWithFields []doc.Field 139 140 activeBlock index.Block 141 } 142 143 type nsIndexState struct { 144 sync.RWMutex // NB: guards all variables in this struct 145 146 closed bool 147 closeCh chan struct{} 148 bootstrapState BootstrapState 149 150 runtimeOpts nsIndexRuntimeOptions 151 152 insertQueue namespaceIndexInsertQueue 153 154 // NB: `latestBlock` v `blocksByTime`: blocksByTime contains all the blocks known to `nsIndex`. 155 // `latestBlock` refers to the block with greatest StartTime within blocksByTime. We do this 156 // to skip accessing the map blocksByTime in the vast majority of write/query requests. It's 157 // lazily updated, so it can point to an older element until a Tick()/write rotates it. 158 blocksByTime map[xtime.UnixNano]index.Block 159 latestBlock index.Block 160 161 // NB: `blockStartsDescOrder` contains the keys from the map `blocksByTime` in reverse 162 // chronological order. This is used at query time to enforce determinism about results 163 // returned. 164 // NB(r): Reference to this slice can be safely taken for iteration purposes 165 // for Query(..) since it is rebuilt each time and immutable once built. 166 blocksDescOrderImmutable []blockAndBlockStart 167 168 // shardsFilterID is set every time the shards change to correctly 169 // only return IDs that this node owns. 170 shardsFilterID func(ident.ID) bool 171 172 // shardFilteredForID is set every time the shards change to correctly 173 // only return IDs that this node owns, and the shard responsible for that ID. 174 shardFilteredForID func(id ident.ID) (uint32, bool) 175 176 shardsAssigned map[uint32]struct{} 177 } 178 179 type blockAndBlockStart struct { 180 block index.Block 181 blockStart xtime.UnixNano 182 } 183 184 // NB: nsIndexRuntimeOptions does not contain its own mutex as some of the variables 185 // are needed for each index write which already at least acquires read lock from 186 // nsIndex mutex, so to keep the lock acquisitions to a minimum these are protected 187 // under the same nsIndex mutex. 188 type nsIndexRuntimeOptions struct { 189 insertMode index.InsertMode 190 maxQuerySeriesLimit int64 191 maxQueryDocsLimit int64 192 } 193 194 // NB(prateek): the returned filesets are strictly before the given time, i.e. they 195 // live in the period (-infinity, exclusiveTime). 196 type indexFilesetsBeforeFn func(dir string, 197 nsID ident.ID, 198 exclusiveTime xtime.UnixNano, 199 ) ([]string, error) 200 201 type readIndexInfoFilesFn func(opts fs.ReadIndexInfoFilesOptions) []fs.ReadIndexInfoFileResult 202 203 type newNamespaceIndexOpts struct { 204 md namespace.Metadata 205 namespaceRuntimeOptsMgr namespace.RuntimeOptionsManager 206 shardSet sharding.ShardSet 207 opts Options 208 newIndexQueueFn newNamespaceIndexInsertQueueFn 209 newBlockFn index.NewBlockFn 210 } 211 212 // execBlockQueryFn executes a query against the given block whilst tracking state. 213 type execBlockQueryFn func( 214 ctx context.Context, 215 block index.Block, 216 permit permits.Permit, 217 iter index.ResultIterator, 218 opts index.QueryOptions, 219 state *asyncQueryExecState, 220 results index.BaseResults, 221 logFields []opentracinglog.Field, 222 ) 223 224 // newBlockIterFn returns a new ResultIterator for the query. 225 type newBlockIterFn func( 226 ctx context.Context, 227 block index.Block, 228 query index.Query, 229 results index.BaseResults, 230 ) (index.ResultIterator, error) 231 232 // asyncQueryExecState tracks the async execution errors for a query. 233 type asyncQueryExecState struct { 234 sync.RWMutex 235 multiErr xerrors.MultiError 236 waitCount atomic.Uint64 237 } 238 239 func (s *asyncQueryExecState) hasErr() bool { 240 s.RLock() 241 defer s.RUnlock() 242 return s.multiErr.NumErrors() > 0 243 } 244 245 func (s *asyncQueryExecState) addErr(err error) { 246 s.Lock() 247 s.multiErr = s.multiErr.Add(err) 248 s.Unlock() 249 } 250 251 func (s *asyncQueryExecState) incWaited(i int) { 252 s.waitCount.Add(uint64(i)) 253 } 254 255 func (s *asyncQueryExecState) waited() int { 256 return int(s.waitCount.Load()) 257 } 258 259 // newNamespaceIndex returns a new namespaceIndex for the provided namespace. 260 func newNamespaceIndex( 261 nsMD namespace.Metadata, 262 namespaceRuntimeOptsMgr namespace.RuntimeOptionsManager, 263 shardSet sharding.ShardSet, 264 opts Options, 265 ) (NamespaceIndex, error) { 266 return newNamespaceIndexWithOptions(newNamespaceIndexOpts{ 267 md: nsMD, 268 namespaceRuntimeOptsMgr: namespaceRuntimeOptsMgr, 269 shardSet: shardSet, 270 opts: opts, 271 newIndexQueueFn: newNamespaceIndexInsertQueue, 272 newBlockFn: index.NewBlock, 273 }) 274 } 275 276 // newNamespaceIndexWithInsertQueueFn is a ctor used in tests to override the insert queue. 277 func newNamespaceIndexWithInsertQueueFn( 278 nsMD namespace.Metadata, 279 namespaceRuntimeOptsMgr namespace.RuntimeOptionsManager, 280 shardSet sharding.ShardSet, 281 newIndexQueueFn newNamespaceIndexInsertQueueFn, 282 opts Options, 283 ) (NamespaceIndex, error) { 284 return newNamespaceIndexWithOptions(newNamespaceIndexOpts{ 285 md: nsMD, 286 namespaceRuntimeOptsMgr: namespaceRuntimeOptsMgr, 287 shardSet: shardSet, 288 opts: opts, 289 newIndexQueueFn: newIndexQueueFn, 290 newBlockFn: index.NewBlock, 291 }) 292 } 293 294 // newNamespaceIndexWithNewBlockFn is a ctor used in tests to inject blocks. 295 func newNamespaceIndexWithNewBlockFn( 296 nsMD namespace.Metadata, 297 namespaceRuntimeOptsMgr namespace.RuntimeOptionsManager, 298 shardSet sharding.ShardSet, 299 newBlockFn index.NewBlockFn, 300 opts Options, 301 ) (NamespaceIndex, error) { 302 return newNamespaceIndexWithOptions(newNamespaceIndexOpts{ 303 md: nsMD, 304 namespaceRuntimeOptsMgr: namespaceRuntimeOptsMgr, 305 shardSet: shardSet, 306 opts: opts, 307 newIndexQueueFn: newNamespaceIndexInsertQueue, 308 newBlockFn: newBlockFn, 309 }) 310 } 311 312 // newNamespaceIndexWithOptions returns a new namespaceIndex with the provided configuration options. 313 func newNamespaceIndexWithOptions( 314 newIndexOpts newNamespaceIndexOpts, 315 ) (NamespaceIndex, error) { 316 var ( 317 nsMD = newIndexOpts.md 318 shardSet = newIndexOpts.shardSet 319 indexOpts = newIndexOpts.opts.IndexOptions() 320 instrumentOpts = newIndexOpts.opts.InstrumentOptions() 321 newIndexQueueFn = newIndexOpts.newIndexQueueFn 322 newBlockFn = newIndexOpts.newBlockFn 323 coreFn = newIndexOpts.opts.CoreFn() 324 runtimeOptsMgr = newIndexOpts.opts.RuntimeOptionsManager() 325 ) 326 if err := indexOpts.Validate(); err != nil { 327 return nil, err 328 } 329 330 scope := instrumentOpts.MetricsScope(). 331 SubScope("dbindex"). 332 Tagged(map[string]string{ 333 "namespace": nsMD.ID().String(), 334 }) 335 instrumentOpts = instrumentOpts.SetMetricsScope(scope) 336 indexOpts = indexOpts.SetInstrumentOptions(instrumentOpts) 337 338 nowFn := indexOpts.ClockOptions().NowFn() 339 logger := indexOpts.InstrumentOptions().Logger() 340 341 var doNotIndexWithFields []doc.Field 342 if m := newIndexOpts.opts.DoNotIndexWithFieldsMap(); m != nil && len(m) != 0 { 343 for k, v := range m { 344 doNotIndexWithFields = append(doNotIndexWithFields, doc.Field{ 345 Name: []byte(k), 346 Value: []byte(v), 347 }) 348 } 349 } 350 351 idx := &nsIndex{ 352 state: nsIndexState{ 353 closeCh: make(chan struct{}), 354 runtimeOpts: nsIndexRuntimeOptions{ 355 insertMode: indexOpts.InsertMode(), // FOLLOWUP(prateek): wire to allow this to be tweaked at runtime 356 }, 357 blocksByTime: make(map[xtime.UnixNano]index.Block), 358 shardsAssigned: make(map[uint32]struct{}), 359 }, 360 361 nowFn: nowFn, 362 blockSize: nsMD.Options().IndexOptions().BlockSize(), 363 retentionPeriod: nsMD.Options().RetentionOptions().RetentionPeriod(), 364 futureRetentionPeriod: nsMD.Options().RetentionOptions().FutureRetentionPeriod(), 365 bufferPast: nsMD.Options().RetentionOptions().BufferPast(), 366 bufferFuture: nsMD.Options().RetentionOptions().BufferFuture(), 367 coldWritesEnabled: nsMD.Options().ColdWritesEnabled(), 368 369 namespaceRuntimeOptsMgr: newIndexOpts.namespaceRuntimeOptsMgr, 370 indexFilesetsBeforeFn: fs.IndexFileSetsBefore, 371 readIndexInfoFilesFn: fs.ReadIndexInfoFiles, 372 deleteFilesFn: fs.DeleteFiles, 373 374 newBlockFn: newBlockFn, 375 opts: newIndexOpts.opts, 376 logger: logger, 377 nsMetadata: nsMD, 378 379 resultsPool: indexOpts.QueryResultsPool(), 380 aggregateResultsPool: indexOpts.AggregateResultsPool(), 381 382 permitsManager: newIndexOpts.opts.PermitsOptions().IndexQueryPermitsManager(), 383 metrics: newNamespaceIndexMetrics(indexOpts, instrumentOpts), 384 385 doNotIndexWithFields: doNotIndexWithFields, 386 } 387 388 activeBlock, err := idx.newBlockFn(xtime.UnixNano(0), idx.nsMetadata, 389 index.BlockOptions{ActiveBlock: true}, idx.namespaceRuntimeOptsMgr, 390 idx.opts.IndexOptions()) 391 if err != nil { 392 return nil, idx.unableToAllocBlockInvariantError(err) 393 } 394 395 idx.activeBlock = activeBlock 396 397 // Assign shard set upfront. 398 idx.AssignShardSet(shardSet) 399 400 idx.runtimeOptsListener = runtimeOptsMgr.RegisterListener(idx) 401 idx.runtimeNsOptsListener = idx.namespaceRuntimeOptsMgr.RegisterListener(idx) 402 403 // set up forward index dice. 404 dice, err := newForwardIndexDice(newIndexOpts.opts) 405 if err != nil { 406 return nil, err 407 } 408 409 if dice.enabled { 410 logger.Info("namespace forward indexing configured", 411 zap.Stringer("namespace", nsMD.ID()), 412 zap.Bool("enabled", dice.enabled), 413 zap.Duration("threshold", dice.forwardIndexThreshold), 414 zap.Float64("rate", dice.forwardIndexDice.Rate())) 415 } else { 416 idxOpts := newIndexOpts.opts.IndexOptions() 417 logger.Info("namespace forward indexing not enabled", 418 zap.Stringer("namespace", nsMD.ID()), 419 zap.Bool("enabled", false), 420 zap.Float64("threshold", idxOpts.ForwardIndexThreshold()), 421 zap.Float64("probability", idxOpts.ForwardIndexProbability())) 422 } 423 424 idx.forwardIndexDice = dice 425 426 // allocate indexing queue and start it up. 427 queue := newIndexQueueFn(idx.writeBatches, nsMD, nowFn, coreFn, scope) 428 if err := queue.Start(); err != nil { 429 return nil, err 430 } 431 idx.state.insertQueue = queue 432 433 // allocate the current block to ensure we're able to index as soon as we return 434 currentBlock := xtime.ToUnixNano(nowFn()).Truncate(idx.blockSize) 435 idx.state.RLock() 436 _, err = idx.ensureBlockPresentWithRLock(currentBlock) 437 idx.state.RUnlock() 438 if err != nil { 439 return nil, err 440 } 441 442 // Report stats 443 go idx.reportStatsUntilClosed() 444 445 return idx, nil 446 } 447 448 func (i *nsIndex) SetRuntimeOptions(runtime.Options) { 449 } 450 451 func (i *nsIndex) SetNamespaceRuntimeOptions(opts namespace.RuntimeOptions) { 452 // We don't like to log from every single index segment that has 453 // settings updated so we log the changes here. 454 i.logger.Info("set namespace runtime index options", 455 zap.Stringer("namespace", i.nsMetadata.ID()), 456 zap.Any("writeIndexingPerCPUConcurrency", opts.WriteIndexingPerCPUConcurrency()), 457 zap.Any("flushIndexingPerCPUConcurrency", opts.FlushIndexingPerCPUConcurrency())) 458 } 459 460 func (i *nsIndex) reportStatsUntilClosed() { 461 ticker := time.NewTicker(nsIndexReportStatsInterval) 462 defer ticker.Stop() 463 464 for { 465 select { 466 case <-ticker.C: 467 err := i.reportStats() 468 if err != nil { 469 i.logger.Warn("could not report index stats", zap.Error(err)) 470 } 471 case <-i.state.closeCh: 472 return 473 } 474 } 475 } 476 477 type nsIndexCompactionLevelStats struct { 478 numSegments int64 479 numTotalDocs int64 480 } 481 482 func (i *nsIndex) reportStats() error { 483 i.state.RLock() 484 defer i.state.RUnlock() 485 486 foregroundLevels := i.metrics.blockMetrics.ForegroundSegments.Levels 487 foregroundLevelStats := make([]nsIndexCompactionLevelStats, len(foregroundLevels)) 488 489 backgroundLevels := i.metrics.blockMetrics.BackgroundSegments.Levels 490 backgroundLevelStats := make([]nsIndexCompactionLevelStats, len(backgroundLevels)) 491 492 flushedLevels := i.metrics.blockMetrics.FlushedSegments.Levels 493 flushedLevelStats := make([]nsIndexCompactionLevelStats, len(flushedLevels)) 494 495 minIndexConcurrency := 0 496 maxIndexConcurrency := 0 497 sumIndexConcurrency := 0 498 numIndexingStats := 0 499 reporter := index.NewBlockStatsReporter( 500 func(s index.BlockSegmentStats) { 501 var ( 502 levels []nsIndexBlocksSegmentsLevelMetrics 503 levelStats []nsIndexCompactionLevelStats 504 ) 505 switch s.Type { 506 case index.ActiveForegroundSegment: 507 levels = foregroundLevels 508 levelStats = foregroundLevelStats 509 case index.ActiveBackgroundSegment: 510 levels = backgroundLevels 511 levelStats = backgroundLevelStats 512 case index.FlushedSegment: 513 levels = flushedLevels 514 levelStats = flushedLevelStats 515 } 516 517 for i, l := range levels { 518 contained := s.Size >= l.MinSizeInclusive && s.Size < l.MaxSizeExclusive 519 if !contained { 520 continue 521 } 522 523 l.SegmentsAge.Record(s.Age) 524 levelStats[i].numSegments++ 525 levelStats[i].numTotalDocs += s.Size 526 527 break 528 } 529 }, 530 func(s index.BlockIndexingStats) { 531 first := numIndexingStats == 0 532 numIndexingStats++ 533 534 if first { 535 minIndexConcurrency = s.IndexConcurrency 536 maxIndexConcurrency = s.IndexConcurrency 537 sumIndexConcurrency = s.IndexConcurrency 538 return 539 } 540 541 if v := s.IndexConcurrency; v < minIndexConcurrency { 542 minIndexConcurrency = v 543 } 544 if v := s.IndexConcurrency; v > maxIndexConcurrency { 545 maxIndexConcurrency = v 546 } 547 sumIndexConcurrency += s.IndexConcurrency 548 }) 549 550 // iterate known blocks in a defined order of time (newest first) 551 // for debug log ordering 552 for _, b := range i.state.blocksDescOrderImmutable { 553 err := b.block.Stats(reporter) 554 if err == index.ErrUnableReportStatsBlockClosed { 555 // Closed blocks are temporarily in the list still 556 continue 557 } 558 if err != nil { 559 return err 560 } 561 } 562 // Active block should always be open. 563 if err := i.activeBlock.Stats(reporter); err != nil { 564 return err 565 } 566 567 // Update level stats. 568 for _, elem := range []struct { 569 levels []nsIndexBlocksSegmentsLevelMetrics 570 levelStats []nsIndexCompactionLevelStats 571 }{ 572 {foregroundLevels, foregroundLevelStats}, 573 {backgroundLevels, backgroundLevelStats}, 574 } { 575 for i, v := range elem.levelStats { 576 elem.levels[i].NumSegments.Update(float64(v.numSegments)) 577 elem.levels[i].NumTotalDocs.Update(float64(v.numTotalDocs)) 578 } 579 } 580 581 // Update the indexing stats. 582 i.metrics.indexingConcurrencyMin.Update(float64(minIndexConcurrency)) 583 i.metrics.indexingConcurrencyMax.Update(float64(maxIndexConcurrency)) 584 avgIndexConcurrency := float64(sumIndexConcurrency) / float64(numIndexingStats) 585 i.metrics.indexingConcurrencyAvg.Update(avgIndexConcurrency) 586 587 return nil 588 } 589 590 func (i *nsIndex) BlockStartForWriteTime(writeTime xtime.UnixNano) xtime.UnixNano { 591 return writeTime.Truncate(i.blockSize) 592 } 593 594 func (i *nsIndex) BlockForBlockStart(blockStart xtime.UnixNano) (index.Block, error) { 595 result, err := i.ensureBlockPresent(blockStart) 596 if err != nil { 597 return nil, err 598 } 599 return result.block, nil 600 } 601 602 // NB(prateek): including the call chains leading to this point: 603 // 604 // - For new entry (previously unseen in the shard): 605 // shard.WriteTagged() 606 // => shard.insertSeriesAsyncBatched() 607 // => shardInsertQueue.Insert() 608 // => shard.writeBatch() 609 // => index.WriteBatch() 610 // => indexQueue.Insert() 611 // => index.writeBatch() 612 // 613 // - For entry which exists in the shard, but needs indexing (either past 614 // the TTL or the last indexing hasn't happened/failed): 615 // shard.WriteTagged() 616 // => shard.insertSeriesForIndexingAsyncBatched() 617 // => shardInsertQueue.Insert() 618 // => shard.writeBatch() 619 // => index.Write() 620 // => indexQueue.Insert() 621 // => index.writeBatch() 622 623 func (i *nsIndex) WriteBatch( 624 batch *index.WriteBatch, 625 ) error { 626 // Filter anything with a pending index out before acquiring lock. 627 batch.MarkUnmarkedIfAlreadyIndexedSuccessAndFinalize() 628 if !batch.PendingAny() { 629 return nil 630 } 631 632 i.state.RLock() 633 if !i.isOpenWithRLock() { 634 i.state.RUnlock() 635 i.metrics.insertAfterClose.Inc(1) 636 err := errDbIndexUnableToWriteClosed 637 batch.MarkUnmarkedEntriesError(err) 638 return err 639 } 640 641 // NB(prateek): retrieving insertMode here while we have the RLock. 642 insertMode := i.state.runtimeOpts.insertMode 643 wg, err := i.state.insertQueue.InsertBatch(batch) 644 645 // release the lock because we don't need it past this point. 646 i.state.RUnlock() 647 648 // if we're unable to index, we still have to finalize the reference we hold. 649 if err != nil { 650 batch.MarkUnmarkedEntriesError(err) 651 return err 652 } 653 // once the write has been queued in the indexInsertQueue, it assumes 654 // responsibility for calling the resource hooks. 655 656 // wait/terminate depending on if we are indexing synchronously or not. 657 if insertMode != index.InsertAsync { 658 wg.Wait() 659 660 // Re-sort the batch by initial enqueue order 661 if numErrs := batch.NumErrs(); numErrs > 0 { 662 // Restore the sort order from when enqueued for the caller. 663 batch.SortByEnqueued() 664 return fmt.Errorf("check batch: %d insert errors", numErrs) 665 } 666 } 667 668 return nil 669 } 670 671 func (i *nsIndex) WritePending( 672 pending []writes.PendingIndexInsert, 673 ) error { 674 // Filter anything with a pending index out before acquiring lock. 675 incoming := pending 676 pending = pending[:0] 677 for j := range incoming { 678 t := i.BlockStartForWriteTime(incoming[j].Entry.Timestamp) 679 if incoming[j].Entry.OnIndexSeries.IfAlreadyIndexedMarkIndexSuccessAndFinalize(t) { 680 continue 681 } 682 // Continue to add this element. 683 pending = append(pending, incoming[j]) 684 } 685 if len(pending) == 0 { 686 return nil 687 } 688 689 i.state.RLock() 690 if !i.isOpenWithRLock() { 691 i.state.RUnlock() 692 i.metrics.insertAfterClose.Inc(1) 693 return errDbIndexUnableToWriteClosed 694 } 695 _, err := i.state.insertQueue.InsertPending(pending) 696 // release the lock because we don't need it past this point. 697 i.state.RUnlock() 698 699 return err 700 } 701 702 // WriteBatches is called by the indexInsertQueue. 703 func (i *nsIndex) writeBatches( 704 batch *index.WriteBatch, 705 ) { 706 // NB(prateek): we use a read lock to guard against mutation of the 707 // indexBlocks, mutations within the underlying blocks are guarded 708 // by primitives internal to it. 709 i.state.RLock() 710 if !i.isOpenWithRLock() { 711 i.state.RUnlock() 712 // NB(prateek): deliberately skip calling any of the `OnIndexFinalize` methods 713 // on the provided inserts to terminate quicker during shutdown. 714 return 715 } 716 var ( 717 now = xtime.ToUnixNano(i.nowFn()) 718 blockSize = i.blockSize 719 futureLimit = now.Add(1 * i.bufferFuture) 720 pastLimit = now.Add(-1 * i.bufferPast) 721 earliestBlockStartToRetain = retention.FlushTimeStartForRetentionPeriod(i.retentionPeriod, i.blockSize, now) 722 batchOptions = batch.Options() 723 forwardIndexDice = i.forwardIndexDice 724 forwardIndexEnabled = forwardIndexDice.enabled 725 total int 726 notSkipped int 727 forwardIndexHits int 728 forwardIndexMiss int 729 730 forwardIndexBatch *index.WriteBatch 731 ) 732 // NB(r): Release lock early to avoid writing batches impacting ticking 733 // speed, etc. 734 // Sometimes foreground compaction can take a long time during heavy inserts. 735 // Each lookup to ensureBlockPresent checks that index is still open, etc. 736 i.state.RUnlock() 737 738 if forwardIndexEnabled { 739 // NB(arnikola): Don't initialize forward index batch if forward indexing 740 // is not enabled. 741 forwardIndexBatch = index.NewWriteBatch(batchOptions) 742 } 743 744 // Ensure timestamp is not too old/new based on retention policies and that 745 // doc is valid. Add potential forward writes to the forwardWriteBatch. 746 batch.ForEach( 747 func(idx int, entry index.WriteBatchEntry, 748 d doc.Metadata, _ index.WriteBatchEntryResult) { 749 total++ 750 751 if len(i.doNotIndexWithFields) != 0 { 752 // This feature rarely used, do not optimize and just do n*m checks. 753 drop := true 754 for _, matchField := range i.doNotIndexWithFields { 755 matchedField := false 756 for _, actualField := range d.Fields { 757 if bytes.Equal(actualField.Name, matchField.Name) { 758 matchedField = bytes.Equal(actualField.Value, matchField.Value) 759 break 760 } 761 } 762 if !matchedField { 763 drop = false 764 break 765 } 766 } 767 if drop { 768 batch.MarkUnmarkedEntryError(errDbIndexDoNotIndexSeries, idx) 769 return 770 } 771 } 772 773 ts := entry.Timestamp 774 // NB(bodu): Always check first to see if the write is within retention. 775 if !ts.After(earliestBlockStartToRetain) { 776 batch.MarkUnmarkedEntryError(m3dberrors.ErrTooPast, idx) 777 return 778 } 779 780 if !futureLimit.After(ts) { 781 batch.MarkUnmarkedEntryError(m3dberrors.ErrTooFuture, idx) 782 return 783 } 784 785 if ts.Before(pastLimit) && !i.coldWritesEnabled { 786 // NB(bodu): We only mark entries as too far in the past if 787 // cold writes are not enabled. 788 batch.MarkUnmarkedEntryError(m3dberrors.ErrTooPast, idx) 789 return 790 } 791 792 if forwardIndexEnabled { 793 if forwardIndexDice.roll(ts) { 794 forwardIndexHits++ 795 forwardEntryTimestamp := ts.Truncate(blockSize).Add(blockSize) 796 if entry.OnIndexSeries.NeedsIndexUpdate(forwardEntryTimestamp) { 797 forwardIndexEntry := entry 798 forwardIndexEntry.Timestamp = forwardEntryTimestamp 799 t := i.BlockStartForWriteTime(forwardEntryTimestamp) 800 forwardIndexEntry.OnIndexSeries.OnIndexPrepare(t) 801 forwardIndexBatch.Append(forwardIndexEntry, d) 802 } 803 } else { 804 forwardIndexMiss++ 805 } 806 } 807 808 notSkipped++ 809 }) 810 811 if forwardIndexEnabled && forwardIndexBatch.Len() > 0 { 812 i.metrics.forwardIndexCounter.Inc(int64(forwardIndexBatch.Len())) 813 batch.AppendAll(forwardIndexBatch) 814 } 815 816 // Sort the inserts by which block they're applicable for, and do the inserts 817 // for each block, making sure to not try to insert any entries already marked 818 // with a result. 819 batch.ForEachUnmarkedBatchByBlockStart(i.writeBatchForBlockStart) 820 821 // Track index insertions. 822 // Note: attemptTotal should = attemptSkip + attemptWrite. 823 i.metrics.asyncInsertAttemptTotal.Inc(int64(total)) 824 i.metrics.asyncInsertAttemptSkip.Inc(int64(total - notSkipped)) 825 i.metrics.forwardIndexHits.Inc(int64(forwardIndexHits)) 826 i.metrics.forwardIndexMisses.Inc(int64(forwardIndexMiss)) 827 } 828 829 func (i *nsIndex) writeBatchForBlockStart( 830 blockStart xtime.UnixNano, batch *index.WriteBatch, 831 ) { 832 // NB(r): Capture pending entries so we can emit the latencies 833 pending := batch.PendingEntries() 834 numPending := len(pending) 835 836 // Track attempted write. 837 // Note: attemptTotal should = attemptSkip + attemptWrite. 838 i.metrics.asyncInsertAttemptWrite.Inc(int64(numPending)) 839 840 // i.e. we have the block and the inserts, perform the writes. 841 result, err := i.activeBlock.WriteBatch(batch) 842 843 // Record the end to end indexing latency. 844 now := i.nowFn() 845 for idx := range pending { 846 took := now.Sub(pending[idx].EnqueuedAt) 847 i.metrics.insertEndToEndLatency.Record(took) 848 } 849 850 // NB: we don't need to do anything to the OnIndexSeries refs in `inserts` at this point, 851 // the index.Block WriteBatch assumes responsibility for calling the appropriate methods. 852 if n := result.NumSuccess; n > 0 { 853 i.metrics.asyncInsertSuccess.Inc(n) 854 } 855 856 // Record mutable segments count foreground/background if latest block. 857 if stats := result.MutableSegmentsStats; !stats.Empty() { 858 i.metrics.latestBlockNumSegmentsForeground.Update(float64(stats.Foreground.NumSegments)) 859 i.metrics.latestBlockNumDocsForeground.Update(float64(stats.Foreground.NumDocs)) 860 i.metrics.latestBlockNumSegmentsBackground.Update(float64(stats.Background.NumSegments)) 861 i.metrics.latestBlockNumDocsBackground.Update(float64(stats.Background.NumDocs)) 862 } 863 864 // Allow for duplicate write errors since due to re-indexing races 865 // we may try to re-index a series more than once. 866 if err := i.sanitizeAllowDuplicatesWriteError(err); err != nil { 867 numErrors := numPending - int(result.NumSuccess) 868 if partialError, ok := err.(*m3ninxindex.BatchPartialError); ok { 869 // If it was a batch partial error we know exactly how many failed 870 // after filtering out for duplicate ID errors. 871 numErrors = len(partialError.Errs()) 872 } 873 i.metrics.asyncInsertErrors.Inc(int64(numErrors)) 874 i.logger.Error("error writing to index block", zap.Error(err)) 875 } 876 } 877 878 // Bootstrap bootstraps the index with the provided blocks. 879 func (i *nsIndex) Bootstrap( 880 bootstrapResults result.IndexResults, 881 ) error { 882 i.state.Lock() 883 if i.state.bootstrapState == Bootstrapping { 884 i.state.Unlock() 885 return errDbIndexIsBootstrapping 886 } 887 i.state.bootstrapState = Bootstrapping 888 i.state.Unlock() 889 890 i.state.RLock() 891 defer func() { 892 i.state.RUnlock() 893 i.state.Lock() 894 i.state.bootstrapState = Bootstrapped 895 i.state.Unlock() 896 }() 897 898 var multiErr xerrors.MultiError 899 for blockStart, blockResults := range bootstrapResults { 900 blockResult, err := i.ensureBlockPresentWithRLock(blockStart) 901 if err != nil { // should never happen 902 multiErr = multiErr.Add(i.unableToAllocBlockInvariantError(err)) 903 continue 904 } 905 if err := blockResult.block.AddResults(blockResults); err != nil { 906 multiErr = multiErr.Add(err) 907 } 908 } 909 910 return multiErr.FinalError() 911 } 912 913 func (i *nsIndex) Bootstrapped() bool { 914 i.state.RLock() 915 result := i.state.bootstrapState == Bootstrapped 916 i.state.RUnlock() 917 return result 918 } 919 920 func (i *nsIndex) Tick( 921 c context.Cancellable, 922 startTime xtime.UnixNano, 923 ) (namespaceIndexTickResult, error) { 924 var result namespaceIndexTickResult 925 926 // First collect blocks and acquire lock to remove those that need removing 927 // but then release lock so can Tick and do other expensive tasks 928 // such as notify of sealed blocks. 929 tickingBlocks, multiErr := i.tickingBlocks(startTime) 930 931 result.NumBlocks = int64(tickingBlocks.totalBlocks) 932 for _, block := range tickingBlocks.tickingBlocks { 933 if c.IsCancelled() { 934 multiErr = multiErr.Add(errDbIndexTerminatingTickCancellation) 935 return result, multiErr.FinalError() 936 } 937 938 blockTickResult, tickErr := block.Tick(c) 939 multiErr = multiErr.Add(tickErr) 940 result.NumSegments += blockTickResult.NumSegments 941 result.NumSegmentsBootstrapped += blockTickResult.NumSegmentsBootstrapped 942 result.NumSegmentsMutable += blockTickResult.NumSegmentsMutable 943 result.NumTotalDocs += blockTickResult.NumDocs 944 result.FreeMmap += blockTickResult.FreeMmap 945 } 946 947 blockTickResult, tickErr := tickingBlocks.activeBlock.Tick(c) 948 multiErr = multiErr.Add(tickErr) 949 result.NumSegments += blockTickResult.NumSegments 950 result.NumSegmentsBootstrapped += blockTickResult.NumSegmentsBootstrapped 951 result.NumSegmentsMutable += blockTickResult.NumSegmentsMutable 952 result.NumTotalDocs += blockTickResult.NumDocs 953 result.FreeMmap += blockTickResult.FreeMmap 954 955 i.metrics.tick.Inc(1) 956 957 return result, multiErr.FinalError() 958 } 959 960 type tickingBlocksResult struct { 961 totalBlocks int 962 activeBlock index.Block 963 tickingBlocks []index.Block 964 } 965 966 func (i *nsIndex) tickingBlocks( 967 startTime xtime.UnixNano, 968 ) (tickingBlocksResult, xerrors.MultiError) { 969 multiErr := xerrors.NewMultiError() 970 earliestBlockStartToRetain := retention.FlushTimeStartForRetentionPeriod( 971 i.retentionPeriod, i.blockSize, startTime) 972 973 i.state.Lock() 974 activeBlock := i.activeBlock 975 tickingBlocks := make([]index.Block, 0, len(i.state.blocksByTime)) 976 defer func() { 977 i.updateBlockStartsWithLock() 978 i.state.Unlock() 979 }() 980 981 for blockStart, block := range i.state.blocksByTime { 982 // Drop any blocks past the retention period. 983 if blockStart.Before(earliestBlockStartToRetain) { 984 multiErr = multiErr.Add(block.Close()) 985 delete(i.state.blocksByTime, blockStart) 986 continue 987 } 988 989 // Tick any blocks we're going to retain, but don't tick inline here 990 // we'll do this out of the block. 991 tickingBlocks = append(tickingBlocks, block) 992 993 // Seal any blocks that are sealable while holding lock (seal is fast). 994 if !blockStart.After(i.lastSealableBlockStart(startTime)) && !block.IsSealed() { 995 multiErr = multiErr.Add(block.Seal()) 996 } 997 } 998 999 return tickingBlocksResult{ 1000 totalBlocks: len(i.state.blocksByTime), 1001 activeBlock: activeBlock, 1002 tickingBlocks: tickingBlocks, 1003 }, multiErr 1004 } 1005 1006 func (i *nsIndex) WarmFlush( 1007 flush persist.IndexFlush, 1008 shards []databaseShard, 1009 ) error { 1010 if len(shards) == 0 { 1011 // No-op if no shards currently owned. 1012 return nil 1013 } 1014 1015 flushable, err := i.flushableBlocks(shards, series.WarmWrite) 1016 if err != nil { 1017 return err 1018 } 1019 1020 // Determine the current flush indexing concurrency. 1021 namespaceRuntimeOpts := i.namespaceRuntimeOptsMgr.Get() 1022 perCPUFraction := namespaceRuntimeOpts.FlushIndexingPerCPUConcurrencyOrDefault() 1023 cpus := math.Ceil(perCPUFraction * float64(goruntime.GOMAXPROCS(0))) 1024 concurrency := int(math.Max(1, cpus)) 1025 1026 builderOpts := i.opts.IndexOptions().SegmentBuilderOptions(). 1027 SetConcurrency(concurrency) 1028 1029 builder, err := builder.NewBuilderFromDocuments(builderOpts) 1030 if err != nil { 1031 return err 1032 } 1033 defer builder.Close() 1034 1035 // Emit concurrency, then reset gauge to zero to show time 1036 // active during flushing broken down per namespace. 1037 i.metrics.flushIndexingConcurrency.Update(float64(concurrency)) 1038 defer i.metrics.flushIndexingConcurrency.Update(0) 1039 1040 var evicted int 1041 for _, block := range flushable { 1042 immutableSegments, err := i.flushBlock(flush, block, shards, builder) 1043 if err != nil { 1044 return err 1045 } 1046 // Make a result that covers the entire time ranges for the 1047 // block for each shard 1048 fulfilled := result.NewShardTimeRangesFromRange(block.StartTime(), block.EndTime(), 1049 dbShards(shards).IDs()...) 1050 1051 // Add the results to the block. 1052 persistedSegments := make([]result.Segment, 0, len(immutableSegments)) 1053 for _, elem := range immutableSegments { 1054 persistedSegment := result.NewSegment(elem, true) 1055 persistedSegments = append(persistedSegments, persistedSegment) 1056 } 1057 blockResult := result.NewIndexBlock(persistedSegments, fulfilled) 1058 results := result.NewIndexBlockByVolumeType(block.StartTime()) 1059 results.SetBlock(idxpersist.DefaultIndexVolumeType, blockResult) 1060 if err := block.AddResults(results); err != nil { 1061 return err 1062 } 1063 1064 evicted++ 1065 1066 // It's now safe to remove the mutable segments as anything the block 1067 // held is covered by the owned shards we just read 1068 if err := block.EvictMutableSegments(); err != nil { 1069 // deliberately choosing to not mark this as an error as we have successfully 1070 // flushed any mutable data. 1071 i.logger.Warn("encountered error while evicting mutable segments for index block", 1072 zap.Error(err), 1073 zap.Time("blockStart", block.StartTime().ToTime()), 1074 ) 1075 } 1076 1077 for _, t := range i.blockStartsFromIndexBlockStart(block.StartTime()) { 1078 for _, s := range shards { 1079 s.MarkWarmIndexFlushStateSuccessOrError(t, err) 1080 } 1081 } 1082 } 1083 i.metrics.blocksEvictedMutableSegments.Inc(int64(evicted)) 1084 return nil 1085 } 1086 1087 func (i *nsIndex) ColdFlush(shards []databaseShard) (OnColdFlushDone, error) { 1088 if len(shards) == 0 { 1089 // No-op if no shards currently owned. 1090 return func() error { return nil }, nil 1091 } 1092 1093 flushable, err := i.flushableBlocks(shards, series.ColdWrite) 1094 if err != nil { 1095 return nil, err 1096 } 1097 // We only rotate cold mutable segments in phase I of cold flushing. 1098 for _, block := range flushable { 1099 if err := block.RotateColdMutableSegments(); err != nil { 1100 return nil, err 1101 } 1102 } 1103 // We can't immediately evict cold mutable segments so we return a callback to do so 1104 // when cold flush finishes. 1105 return func() error { 1106 multiErr := xerrors.NewMultiError() 1107 for _, block := range flushable { 1108 multiErr = multiErr.Add(block.EvictColdMutableSegments()) 1109 } 1110 return multiErr.FinalError() 1111 }, nil 1112 } 1113 1114 // WarmFlushBlockStarts returns all index blockStarts which have been flushed to disk. 1115 func (i *nsIndex) WarmFlushBlockStarts() []xtime.UnixNano { 1116 flushed := make([]xtime.UnixNano, 0) 1117 infoFiles := i.readInfoFilesAsMap() 1118 1119 for blockStart := range infoFiles { 1120 if i.hasIndexWarmFlushedToDisk(infoFiles, blockStart) { 1121 flushed = append(flushed, blockStart) 1122 } 1123 } 1124 return flushed 1125 } 1126 1127 // BackgroundCompact background compacts eligible segments. 1128 func (i *nsIndex) BackgroundCompact() { 1129 if i.activeBlock != nil { 1130 i.activeBlock.BackgroundCompact() 1131 } 1132 for _, b := range i.state.blocksByTime { 1133 b.BackgroundCompact() 1134 } 1135 } 1136 1137 func (i *nsIndex) readInfoFilesAsMap() map[xtime.UnixNano][]fs.ReadIndexInfoFileResult { 1138 fsOpts := i.opts.CommitLogOptions().FilesystemOptions() 1139 infoFiles := i.readIndexInfoFilesFn(fs.ReadIndexInfoFilesOptions{ 1140 FilePathPrefix: fsOpts.FilePathPrefix(), 1141 Namespace: i.nsMetadata.ID(), 1142 ReaderBufferSize: fsOpts.InfoReaderBufferSize(), 1143 }) 1144 result := make(map[xtime.UnixNano][]fs.ReadIndexInfoFileResult) 1145 for _, infoFile := range infoFiles { 1146 t := xtime.UnixNano(infoFile.Info.BlockStart) 1147 files := result[t] 1148 result[t] = append(files, infoFile) 1149 } 1150 return result 1151 } 1152 1153 func (i *nsIndex) flushableBlocks( 1154 shards []databaseShard, 1155 flushType series.WriteType, 1156 ) ([]index.Block, error) { 1157 i.state.RLock() 1158 defer i.state.RUnlock() 1159 if !i.isOpenWithRLock() { 1160 return nil, errDbIndexUnableToFlushClosed 1161 } 1162 // NB(bodu): We read index info files once here to avoid re-reading all of them 1163 // for each block. 1164 infoFiles := i.readInfoFilesAsMap() 1165 flushable := make([]index.Block, 0, len(i.state.blocksByTime)) 1166 1167 now := xtime.ToUnixNano(i.nowFn()) 1168 earliestBlockStartToRetain := retention.FlushTimeStartForRetentionPeriod(i.retentionPeriod, i.blockSize, now) 1169 currentBlockStart := now.Truncate(i.blockSize) 1170 // Check for flushable blocks by iterating through all block starts w/in retention. 1171 for blockStart := earliestBlockStartToRetain; blockStart.Before(currentBlockStart); blockStart = blockStart.Add(i.blockSize) { 1172 blockResult, err := i.ensureBlockPresentWithRLock(blockStart) 1173 if err != nil { 1174 return nil, err 1175 } 1176 1177 canFlush, err := i.canFlushBlockWithRLock(infoFiles, blockStart, 1178 blockResult.block, shards, flushType) 1179 if err != nil { 1180 return nil, err 1181 } 1182 if !canFlush { 1183 continue 1184 } 1185 1186 flushable = append(flushable, blockResult.block) 1187 } 1188 return flushable, nil 1189 } 1190 1191 func (i *nsIndex) canFlushBlockWithRLock( 1192 infoFiles map[xtime.UnixNano][]fs.ReadIndexInfoFileResult, 1193 blockStart xtime.UnixNano, 1194 block index.Block, 1195 shards []databaseShard, 1196 flushType series.WriteType, 1197 ) (bool, error) { 1198 switch flushType { 1199 case series.WarmWrite: 1200 // NB(bodu): We should always attempt to warm flush sealed blocks to disk if 1201 // there doesn't already exist data on disk. We're checking this instead of 1202 // `block.NeedsMutableSegmentsEvicted()` since bootstrap writes for cold block starts 1203 // get marked as warm writes if there doesn't already exist data on disk and need to 1204 // properly go through the warm flush lifecycle. 1205 if !block.IsSealed() || i.hasIndexWarmFlushedToDisk(infoFiles, blockStart) { 1206 return false, nil 1207 } 1208 case series.ColdWrite: 1209 if !block.NeedsColdMutableSegmentsEvicted() { 1210 return false, nil 1211 } 1212 } 1213 1214 // Check all data files exist for the shards we own 1215 for _, shard := range shards { 1216 if !shard.IsBootstrapped() { 1217 i.logger. 1218 With(zap.Uint32("shard", shard.ID())). 1219 Debug("skipping index cold flush due to shard not bootstrapped yet") 1220 continue 1221 } 1222 1223 for _, t := range i.blockStartsFromIndexBlockStart(blockStart) { 1224 flushState, err := shard.FlushState(t) 1225 if err != nil { 1226 return false, err 1227 } 1228 1229 // Skip if the data flushing failed. Data flushing precedes index flushing. 1230 if flushState.WarmStatus.DataFlushed != fileOpSuccess { 1231 return false, nil 1232 } 1233 } 1234 } 1235 1236 return true, nil 1237 } 1238 1239 // blockStartsFromIndexBlockStart returns the possibly many blocksStarts that exist within 1240 // a given index block (since index block size >= data block size) 1241 func (i *nsIndex) blockStartsFromIndexBlockStart(blockStart xtime.UnixNano) []xtime.UnixNano { 1242 start := blockStart 1243 end := blockStart.Add(i.blockSize) 1244 dataBlockSize := i.nsMetadata.Options().RetentionOptions().BlockSize() 1245 blockStarts := make([]xtime.UnixNano, 0) 1246 for t := start; t.Before(end); t = t.Add(dataBlockSize) { 1247 blockStarts = append(blockStarts, t) 1248 } 1249 return blockStarts 1250 } 1251 1252 func (i *nsIndex) hasIndexWarmFlushedToDisk( 1253 infoFiles map[xtime.UnixNano][]fs.ReadIndexInfoFileResult, 1254 blockStart xtime.UnixNano, 1255 ) bool { 1256 // NB(bodu): We consider the block to have been warm flushed if there are any 1257 // filesets on disk. This is consistent with the "has warm flushed" check in the db shard. 1258 // Shard block starts are marked as having warm flushed if an info file is successfully read from disk. 1259 f, ok := infoFiles[blockStart] 1260 if !ok { 1261 return false 1262 } 1263 1264 for _, fileInfo := range f { 1265 indexVolumeType := idxpersist.DefaultIndexVolumeType 1266 if fileInfo.Info.IndexVolumeType != nil { 1267 indexVolumeType = idxpersist.IndexVolumeType(fileInfo.Info.IndexVolumeType.Value) 1268 } 1269 match := fileInfo.ID.BlockStart == blockStart && indexVolumeType == idxpersist.DefaultIndexVolumeType 1270 if match { 1271 return true 1272 } 1273 } 1274 return false 1275 } 1276 1277 func (i *nsIndex) flushBlock( 1278 flush persist.IndexFlush, 1279 indexBlock index.Block, 1280 shards []databaseShard, 1281 builder segment.DocumentsBuilder, 1282 ) ([]segment.Segment, error) { 1283 allShards := make(map[uint32]struct{}) 1284 for _, shard := range shards { 1285 // Populate all shards 1286 allShards[shard.ID()] = struct{}{} 1287 } 1288 1289 volumeIndex, err := i.opts.IndexClaimsManager().ClaimNextIndexFileSetVolumeIndex( 1290 i.nsMetadata, 1291 indexBlock.StartTime(), 1292 ) 1293 if err != nil { 1294 return nil, fmt.Errorf("failed to claim next index volume index: %w", err) 1295 } 1296 1297 preparedPersist, err := flush.PrepareIndex(persist.IndexPrepareOptions{ 1298 NamespaceMetadata: i.nsMetadata, 1299 BlockStart: indexBlock.StartTime(), 1300 FileSetType: persist.FileSetFlushType, 1301 Shards: allShards, 1302 // NB(bodu): By default, we always write to the "default" index volume type. 1303 IndexVolumeType: idxpersist.DefaultIndexVolumeType, 1304 VolumeIndex: volumeIndex, 1305 }) 1306 if err != nil { 1307 return nil, err 1308 } 1309 1310 var closed bool 1311 defer func() { 1312 if !closed { 1313 segments, _ := preparedPersist.Close() 1314 // NB(r): Safe to for over a nil array so disregard error here. 1315 for _, segment := range segments { 1316 segment.Close() 1317 } 1318 } 1319 }() 1320 1321 // Flush a single block segment. 1322 if err := i.flushBlockSegment(preparedPersist, indexBlock, shards, builder); err != nil { 1323 return nil, err 1324 } 1325 1326 closed = true 1327 1328 // Now return the immutable segments 1329 return preparedPersist.Close() 1330 } 1331 1332 func (i *nsIndex) flushBlockSegment( 1333 preparedPersist persist.PreparedIndexPersist, 1334 indexBlock index.Block, 1335 shards []databaseShard, 1336 builder segment.DocumentsBuilder, 1337 ) error { 1338 // Reset the builder 1339 builder.Reset() 1340 1341 var ( 1342 batch = m3ninxindex.Batch{AllowPartialUpdates: true} 1343 batchSize = defaultFlushDocsBatchSize 1344 ) 1345 ctx := i.opts.ContextPool().Get() 1346 defer ctx.Close() 1347 1348 for _, shard := range shards { 1349 var ( 1350 first = true 1351 pageToken PageToken 1352 ) 1353 for first || pageToken != nil { 1354 first = false 1355 1356 var ( 1357 opts = block.FetchBlocksMetadataOptions{ 1358 // NB(bodu): There is a lag between when data gets flushed 1359 // to disk and when it gets removed from memory during the next 1360 // Tick. In this case, the same series can exist both on disk 1361 // and in memory at the same time resulting in dupe series IDs. 1362 // Only read data from disk when flushing index segments. 1363 OnlyDisk: true, 1364 } 1365 limit = defaultFlushReadDataBlocksBatchSize 1366 results block.FetchBlocksMetadataResults 1367 err error 1368 ) 1369 ctx.Reset() 1370 results, pageToken, err = shard.FetchBlocksMetadataV2(ctx, 1371 indexBlock.StartTime(), indexBlock.EndTime(), 1372 limit, pageToken, opts) 1373 if err != nil { 1374 return err 1375 } 1376 1377 // Reset docs batch before use. 1378 batch.Docs = batch.Docs[:0] 1379 for _, result := range results.Results() { 1380 doc, exists, err := shard.DocRef(result.ID) 1381 if err != nil { 1382 return err 1383 } 1384 if !exists { 1385 doc, err = convert.FromSeriesIDAndTagIter(result.ID, result.Tags) 1386 if err != nil { 1387 return err 1388 } 1389 i.metrics.flushDocsNew.Inc(1) 1390 } else { 1391 i.metrics.flushDocsCached.Inc(1) 1392 } 1393 1394 batch.Docs = append(batch.Docs, doc) 1395 if len(batch.Docs) < batchSize { 1396 continue 1397 } 1398 1399 err = i.sanitizeAllowDuplicatesWriteError(builder.InsertBatch(batch)) 1400 if err != nil { 1401 return err 1402 } 1403 1404 // Reset docs after insertions. 1405 batch.Docs = batch.Docs[:0] 1406 } 1407 1408 // Add last batch if remaining. 1409 if len(batch.Docs) > 0 { 1410 err := i.sanitizeAllowDuplicatesWriteError(builder.InsertBatch(batch)) 1411 if err != nil { 1412 return err 1413 } 1414 } 1415 1416 results.Close() 1417 1418 // Use BlockingCloseReset so that we can reuse the context without 1419 // it going back to the pool. 1420 ctx.BlockingCloseReset() 1421 } 1422 } 1423 1424 // Finally flush this segment 1425 return preparedPersist.Persist(builder) 1426 } 1427 1428 func (i *nsIndex) sanitizeAllowDuplicatesWriteError(err error) error { 1429 if err == nil { 1430 return nil 1431 } 1432 1433 // NB: dropping duplicate id error messages from logs as they're expected when we see 1434 // repeated inserts. as long as a block has an ID, it's not an error so we don't need 1435 // to pollute the logs with these messages. 1436 if partialError, ok := err.(*m3ninxindex.BatchPartialError); ok { 1437 err = partialError.FilterDuplicateIDErrors() 1438 } 1439 1440 return err 1441 } 1442 1443 func (i *nsIndex) AssignShardSet(shardSet sharding.ShardSet) { 1444 // NB(r): Allocate the filter function once, it can be used outside 1445 // of locks as it depends on no internal state. 1446 set := bitset.NewBitSet(uint(shardSet.Max())) 1447 assigned := make(map[uint32]struct{}) 1448 for _, shardID := range shardSet.AllIDs() { 1449 set.Set(uint(shardID)) 1450 assigned[shardID] = struct{}{} 1451 } 1452 1453 i.state.Lock() 1454 i.state.shardsFilterID = func(id ident.ID) bool { 1455 // NB(r): Use a bitset for fast lookups. 1456 return set.Test(uint(shardSet.Lookup(id))) 1457 } 1458 1459 i.state.shardFilteredForID = func(id ident.ID) (uint32, bool) { 1460 shard := shardSet.Lookup(id) 1461 return shard, set.Test(uint(shard)) 1462 } 1463 1464 i.state.shardsAssigned = assigned 1465 i.state.Unlock() 1466 } 1467 1468 func (i *nsIndex) shardsFilterID() func(id ident.ID) bool { 1469 i.state.RLock() 1470 v := i.state.shardsFilterID 1471 i.state.RUnlock() 1472 return v 1473 } 1474 1475 func (i *nsIndex) shardForID() func(id ident.ID) (uint32, bool) { 1476 i.state.RLock() 1477 v := i.state.shardFilteredForID 1478 i.state.RUnlock() 1479 return v 1480 } 1481 1482 func (i *nsIndex) Query( 1483 ctx context.Context, 1484 query index.Query, 1485 opts index.QueryOptions, 1486 ) (index.QueryResult, error) { 1487 var logFields []opentracinglog.Field 1488 ctx, sp, sampled := ctx.StartSampledTraceSpan(tracepoint.NSIdxQuery) 1489 defer sp.Finish() 1490 if sampled { 1491 // Only allocate metadata such as query string if sampling trace. 1492 logFields = []opentracinglog.Field{ 1493 opentracinglog.String("query", query.String()), 1494 opentracinglog.String("namespace", i.nsMetadata.ID().String()), 1495 opentracinglog.Int("seriesLimit", opts.SeriesLimit), 1496 opentracinglog.Int("docsLimit", opts.DocsLimit), 1497 xopentracing.Time("queryStart", opts.StartInclusive.ToTime()), 1498 xopentracing.Time("queryEnd", opts.EndExclusive.ToTime()), 1499 } 1500 sp.LogFields(logFields...) 1501 } 1502 1503 // Get results and set the namespace ID and size limit. 1504 results := i.resultsPool.Get() 1505 results.Reset(i.nsMetadata.ID(), index.QueryResultsOptions{ 1506 SizeLimit: opts.SeriesLimit, 1507 FilterID: i.shardsFilterID(), 1508 }) 1509 ctx.RegisterFinalizer(results) 1510 queryRes, err := i.query(ctx, query, results, opts, i.execBlockQueryFn, 1511 i.newBlockQueryIterFn, logFields) 1512 if err != nil { 1513 sp.LogFields(opentracinglog.Error(err)) 1514 return index.QueryResult{}, err 1515 } 1516 1517 return index.QueryResult{ 1518 Results: results, 1519 Exhaustive: queryRes.exhaustive, 1520 Waited: queryRes.waited, 1521 }, nil 1522 } 1523 1524 func (i *nsIndex) AggregateQuery( 1525 ctx context.Context, 1526 query index.Query, 1527 opts index.AggregationOptions, 1528 ) (index.AggregateQueryResult, error) { 1529 id := i.nsMetadata.ID() 1530 logFields := []opentracinglog.Field{ 1531 opentracinglog.String("query", query.String()), 1532 opentracinglog.String("namespace", id.String()), 1533 opentracinglog.Int("seriesLimit", opts.SeriesLimit), 1534 opentracinglog.Int("docsLimit", opts.DocsLimit), 1535 xopentracing.Time("queryStart", opts.StartInclusive.ToTime()), 1536 xopentracing.Time("queryEnd", opts.EndExclusive.ToTime()), 1537 } 1538 1539 ctx, sp := ctx.StartTraceSpan(tracepoint.NSIdxAggregateQuery) 1540 sp.LogFields(logFields...) 1541 defer sp.Finish() 1542 1543 metrics := index.NewAggregateUsageMetrics(id, i.opts.InstrumentOptions()) 1544 // Get results and set the filters, namespace ID and size limit. 1545 results := i.aggregateResultsPool.Get() 1546 aopts := index.AggregateResultsOptions{ 1547 SizeLimit: opts.SeriesLimit, 1548 DocsLimit: opts.DocsLimit, 1549 FieldFilter: opts.FieldFilter, 1550 Type: opts.Type, 1551 AggregateUsageMetrics: metrics, 1552 } 1553 ctx.RegisterFinalizer(results) 1554 // use appropriate fn to query underlying blocks. 1555 // use block.Aggregate() for querying and set the query if required. 1556 fn := i.execBlockAggregateQueryFn 1557 isAllQuery := query.Equal(allQuery) 1558 if !isAllQuery { 1559 if field, isFieldQuery := idx.FieldQuery(query.Query); isFieldQuery { 1560 aopts.FieldFilter = aopts.FieldFilter.AddIfMissing(field) 1561 } else { 1562 // Need to actually restrict whether we should return a term or not 1563 // based on running the actual query to resolve a postings list and 1564 // then seeing if that intersects the aggregated term postings list 1565 // at all. 1566 aopts.RestrictByQuery = &query 1567 } 1568 } 1569 aopts.FieldFilter = aopts.FieldFilter.SortAndDedupe() 1570 results.Reset(id, aopts) 1571 queryRes, err := i.query(ctx, query, results, opts.QueryOptions, fn, 1572 i.newBlockAggregatorIterFn, logFields) 1573 if err != nil { 1574 return index.AggregateQueryResult{}, err 1575 } 1576 return index.AggregateQueryResult{ 1577 Results: results, 1578 Exhaustive: queryRes.exhaustive, 1579 Waited: queryRes.waited, 1580 }, nil 1581 } 1582 1583 type queryResult struct { 1584 exhaustive bool 1585 waited int 1586 } 1587 1588 func (i *nsIndex) query( 1589 ctx context.Context, 1590 query index.Query, 1591 results index.BaseResults, 1592 opts index.QueryOptions, 1593 execBlockFn execBlockQueryFn, 1594 newBlockIterFn newBlockIterFn, 1595 logFields []opentracinglog.Field, 1596 ) (queryResult, error) { 1597 ctx, sp, sampled := ctx.StartSampledTraceSpan(tracepoint.NSIdxQueryHelper) 1598 sp.LogFields(logFields...) 1599 defer sp.Finish() 1600 if sampled { 1601 // Only log fields if sampled. 1602 sp.LogFields(logFields...) 1603 } 1604 1605 queryRes, err := i.queryWithSpan(ctx, query, results, opts, execBlockFn, 1606 newBlockIterFn, sp, logFields) 1607 if err != nil { 1608 sp.LogFields(opentracinglog.Error(err)) 1609 1610 if queryRes.exhaustive { 1611 i.metrics.queryExhaustiveInternalError.Inc(1) 1612 } else { 1613 i.metrics.queryNonExhaustiveInternalError.Inc(1) 1614 } 1615 return queryRes, err 1616 } 1617 1618 if queryRes.exhaustive { 1619 i.metrics.queryExhaustiveSuccess.Inc(1) 1620 return queryRes, nil 1621 } 1622 1623 // If require exhaustive but not, return error. 1624 if opts.RequireExhaustive { 1625 seriesCount := results.Size() 1626 docsCount := results.TotalDocsCount() 1627 if opts.SeriesLimitExceeded(seriesCount) { 1628 i.metrics.queryNonExhaustiveSeriesLimitError.Inc(1) 1629 } else if opts.DocsLimitExceeded(docsCount) { 1630 i.metrics.queryNonExhaustiveDocsLimitError.Inc(1) 1631 } else { 1632 i.metrics.queryNonExhaustiveLimitError.Inc(1) 1633 } 1634 1635 // NB(r): Make sure error is not retried and returns as bad request. 1636 return queryRes, xerrors.NewInvalidParamsError(limits.NewQueryLimitExceededError(fmt.Sprintf( 1637 "query exceeded limit: require_exhaustive=%v, series_limit=%d, series_matched=%d, docs_limit=%d, docs_matched=%d", 1638 opts.RequireExhaustive, 1639 opts.SeriesLimit, 1640 seriesCount, 1641 opts.DocsLimit, 1642 docsCount, 1643 ))) 1644 } 1645 1646 // Otherwise non-exhaustive but not required to be. 1647 i.metrics.queryNonExhaustiveSuccess.Inc(1) 1648 return queryRes, nil 1649 } 1650 1651 // blockIter is a composite type to hold various state about a block while iterating over the results. 1652 type blockIter struct { 1653 iter index.ResultIterator 1654 iterCloser io.Closer 1655 block index.Block 1656 waitTime time.Duration 1657 processingTime time.Duration 1658 } 1659 1660 func (i *nsIndex) queryWithSpan( 1661 ctx context.Context, 1662 query index.Query, 1663 results index.BaseResults, 1664 opts index.QueryOptions, 1665 execBlockFn execBlockQueryFn, 1666 newBlockIterFn newBlockIterFn, 1667 span opentracing.Span, 1668 logFields []opentracinglog.Field, 1669 ) (queryResult, error) { 1670 i.state.RLock() 1671 if !i.isOpenWithRLock() { 1672 i.state.RUnlock() 1673 return queryResult{}, errDbIndexUnableToQueryClosed 1674 } 1675 1676 // Track this as an inflight query that needs to finish 1677 // when the index is closed. 1678 i.queriesWg.Add(1) 1679 defer i.queriesWg.Done() 1680 1681 // Enact overrides for query options 1682 opts = i.overriddenOptsForQueryWithRLock(opts) 1683 1684 // Retrieve blocks to query, then we can release lock. 1685 // NB(r): Important not to block ticking, and other tasks by 1686 // holding the RLock during a query. 1687 qryRange := xtime.NewRanges(xtime.Range{ 1688 Start: opts.StartInclusive, 1689 End: opts.EndExclusive, 1690 }) 1691 // NB(r): Safe to take ref to i.state.blocksDescOrderImmutable since it's 1692 // immutable and we only create an iterator over it. 1693 blocks := newBlocksIterStackAlloc(i.activeBlock, i.state.blocksDescOrderImmutable, qryRange) 1694 1695 // Can now release the lock and execute the query without holding the lock. 1696 i.state.RUnlock() 1697 1698 var ( 1699 // State contains concurrent mutable state for async execution below. 1700 state = &asyncQueryExecState{} 1701 wg sync.WaitGroup 1702 ) 1703 perms, err := i.permitsManager.NewPermits(ctx) 1704 if err != nil { 1705 return queryResult{}, err 1706 } 1707 defer perms.Close() 1708 1709 var blockIters []*blockIter 1710 for b, ok := blocks.Next(); ok; b, ok = b.Next() { 1711 block := b.Current() 1712 iter, err := newBlockIterFn(ctx, block, query, results) 1713 if err != nil { 1714 return queryResult{}, err 1715 } 1716 blockIters = append(blockIters, &blockIter{ 1717 iter: iter, 1718 iterCloser: x.NewSafeCloser(iter), 1719 block: block, 1720 }) 1721 } 1722 1723 defer func() { 1724 for _, iter := range blockIters { 1725 // safe to call Close multiple times, so it's fine to eagerly close in the loop below and here. 1726 _ = iter.iterCloser.Close() 1727 } 1728 }() 1729 1730 // queryCanceled returns true if the query has been canceled and the current iteration should terminate. 1731 queryCanceled := func() bool { 1732 return opts.LimitsExceeded(results.Size(), results.TotalDocsCount()) || state.hasErr() 1733 } 1734 // waitForPermit waits for a permit. returns non-nil if the permit was acquired and the wait time. 1735 waitForPermit := func() (permits.Permit, time.Duration) { 1736 // make sure the query hasn't been canceled before waiting for a permit. 1737 if queryCanceled() { 1738 return nil, 0 1739 } 1740 1741 startWait := time.Now() 1742 acquireResult, err := perms.Acquire(ctx) 1743 waitTime := time.Since(startWait) 1744 var success bool 1745 defer func() { 1746 // Note: ALWAYS release if we do not successfully return back 1747 // the permit and we checked one out. 1748 if !success && acquireResult.Permit != nil { 1749 perms.Release(acquireResult.Permit) 1750 } 1751 }() 1752 if acquireResult.Waited { 1753 // Potentially break an error if require no wait set. 1754 if err == nil && opts.RequireNoWait { 1755 // Fail iteration if request requires no waiting occurs. 1756 err = permits.ErrOperationWaitedOnRequireNoWait 1757 } 1758 state.incWaited(1) 1759 } 1760 if err != nil { 1761 state.addErr(err) 1762 return nil, waitTime 1763 } 1764 1765 // make sure the query hasn't been canceled while waiting for a permit. 1766 if queryCanceled() { 1767 return nil, waitTime 1768 } 1769 1770 success = true 1771 return acquireResult.Permit, waitTime 1772 } 1773 1774 // We're looping through all the blocks that we need to query and kicking 1775 // off parallel queries which are bounded by the permits maximum 1776 // concurrency. It's possible at this point that we've completed querying one or more blocks and already exhausted 1777 // the maximum number of results that we're allowed to return. If thats the case, there is no value in kicking off 1778 // more parallel queries, so we break out of the loop. 1779 for _, blockIter := range blockIters { 1780 // Capture for async query execution below. 1781 blockIter := blockIter 1782 1783 // acquire a permit before kicking off the goroutine to process the iterator. this limits the number of 1784 // concurrent goroutines to # of permits + large queries that needed multiple iterations to finish. 1785 permit, waitTime := waitForPermit() 1786 blockIter.waitTime += waitTime 1787 if permit == nil { 1788 break 1789 } 1790 1791 // must not reuse logField slice as the last field will be mutated by concurrent goroutines. 1792 blockLogFields := make([]opentracinglog.Field, 0, len(logFields)+1) 1793 blockLogFields = append(blockLogFields, logFields...) 1794 1795 wg.Add(1) 1796 // kick off a go routine to process the entire iterator. 1797 go func() { 1798 defer wg.Done() 1799 first := true 1800 for !blockIter.iter.Done() { 1801 // if this is not the first iteration of the iterator, need to acquire another permit. 1802 if !first { 1803 permit, waitTime = waitForPermit() 1804 blockIter.waitTime += waitTime 1805 if permit == nil { 1806 break 1807 } 1808 } 1809 blockLogFields = append(blockLogFields, xopentracing.Duration("permitWaitTime", waitTime)) 1810 first = false 1811 startProcessing := time.Now() 1812 execBlockFn(ctx, blockIter.block, permit, blockIter.iter, opts, state, results, blockLogFields) 1813 processingTime := time.Since(startProcessing) 1814 blockIter.processingTime += processingTime 1815 permit.Use(int64(processingTime)) 1816 perms.Release(permit) 1817 } 1818 if first { 1819 // this should never happen since a new iter cannot be Done, but just to be safe. 1820 perms.Release(permit) 1821 } 1822 1823 // close the iterator since it's no longer needed. it's safe to call Close multiple times, here and in the 1824 // defer when the function returns. 1825 if err := blockIter.iterCloser.Close(); err != nil { 1826 state.addErr(err) 1827 } 1828 }() 1829 } 1830 1831 // wait for all workers to finish. if the caller cancels the call, the workers will be interrupted and eventually 1832 // finish. 1833 wg.Wait() 1834 1835 i.metrics.loadedDocsPerQuery.RecordValue(float64(results.TotalDocsCount())) 1836 1837 exhaustive := opts.Exhaustive(results.Size(), results.TotalDocsCount()) 1838 // ok to read state without lock since all parallel queries are done. 1839 multiErr := state.multiErr 1840 err = multiErr.FinalError() 1841 1842 return queryResult{ 1843 exhaustive: exhaustive, 1844 waited: state.waited(), 1845 }, err 1846 } 1847 1848 func (i *nsIndex) newBlockQueryIterFn( 1849 ctx context.Context, 1850 block index.Block, 1851 query index.Query, 1852 _ index.BaseResults, 1853 ) (index.ResultIterator, error) { 1854 return block.QueryIter(ctx, query) 1855 } 1856 1857 //nolint: dupl 1858 func (i *nsIndex) execBlockQueryFn( 1859 ctx context.Context, 1860 block index.Block, 1861 permit permits.Permit, 1862 iter index.ResultIterator, 1863 opts index.QueryOptions, 1864 state *asyncQueryExecState, 1865 results index.BaseResults, 1866 logFields []opentracinglog.Field, 1867 ) { 1868 logFields = append(logFields, 1869 xopentracing.Time("blockStart", block.StartTime().ToTime()), 1870 xopentracing.Time("blockEnd", block.EndTime().ToTime()), 1871 ) 1872 1873 ctx, sp := ctx.StartTraceSpan(tracepoint.NSIdxBlockQuery) 1874 sp.LogFields(logFields...) 1875 defer sp.Finish() 1876 1877 docResults, ok := results.(index.DocumentResults) 1878 if !ok { // should never happen 1879 state.addErr(fmt.Errorf("unknown results type [%T] received during query", results)) 1880 return 1881 } 1882 queryIter, ok := iter.(index.QueryIterator) 1883 if !ok { // should never happen 1884 state.addErr(fmt.Errorf("unknown results type [%T] received during query", iter)) 1885 return 1886 } 1887 1888 deadline := time.Now().Add(time.Duration(permit.AllowedQuota())) 1889 err := block.QueryWithIter(ctx, opts, queryIter, docResults, deadline, logFields) 1890 if err == index.ErrUnableToQueryBlockClosed { 1891 // NB(r): Because we query this block outside of the results lock, it's 1892 // possible this block may get closed if it slides out of retention, in 1893 // that case those results are no longer considered valid and outside of 1894 // retention regardless, so this is a non-issue. 1895 err = nil 1896 } 1897 1898 if err != nil { 1899 sp.LogFields(opentracinglog.Error(err)) 1900 state.addErr(err) 1901 } 1902 } 1903 1904 func (i *nsIndex) newBlockAggregatorIterFn( 1905 ctx context.Context, 1906 block index.Block, 1907 _ index.Query, 1908 results index.BaseResults, 1909 ) (index.ResultIterator, error) { 1910 aggResults, ok := results.(index.AggregateResults) 1911 if !ok { // should never happen 1912 return nil, fmt.Errorf("unknown results type [%T] received during aggregation", results) 1913 } 1914 return block.AggregateIter(ctx, aggResults.AggregateResultsOptions()) 1915 } 1916 1917 func (i *nsIndex) execBlockAggregateQueryFn( 1918 ctx context.Context, 1919 block index.Block, 1920 permit permits.Permit, 1921 iter index.ResultIterator, 1922 opts index.QueryOptions, 1923 state *asyncQueryExecState, 1924 results index.BaseResults, 1925 logFields []opentracinglog.Field, 1926 ) { 1927 logFields = append(logFields, 1928 xopentracing.Time("blockStart", block.StartTime().ToTime()), 1929 xopentracing.Time("blockEnd", block.EndTime().ToTime()), 1930 ) 1931 1932 ctx, sp := ctx.StartTraceSpan(tracepoint.NSIdxBlockAggregateQuery) 1933 sp.LogFields(logFields...) 1934 defer sp.Finish() 1935 1936 aggResults, ok := results.(index.AggregateResults) 1937 if !ok { // should never happen 1938 state.addErr(fmt.Errorf("unknown results type [%T] received during aggregation", results)) 1939 return 1940 } 1941 aggIter, ok := iter.(index.AggregateIterator) 1942 if !ok { // should never happen 1943 state.addErr(fmt.Errorf("unknown results type [%T] received during query", iter)) 1944 return 1945 } 1946 1947 deadline := time.Now().Add(time.Duration(permit.AllowedQuota())) 1948 err := block.AggregateWithIter(ctx, aggIter, opts, aggResults, deadline, logFields) 1949 if err == index.ErrUnableToQueryBlockClosed { 1950 // NB(r): Because we query this block outside of the results lock, it's 1951 // possible this block may get closed if it slides out of retention, in 1952 // that case those results are no longer considered valid and outside of 1953 // retention regardless, so this is a non-issue. 1954 err = nil 1955 } 1956 1957 if err != nil { 1958 sp.LogFields(opentracinglog.Error(err)) 1959 state.addErr(err) 1960 } 1961 } 1962 1963 func (i *nsIndex) overriddenOptsForQueryWithRLock( 1964 opts index.QueryOptions, 1965 ) index.QueryOptions { 1966 // Override query response limits if needed. 1967 if i.state.runtimeOpts.maxQuerySeriesLimit > 0 && (opts.SeriesLimit == 0 || 1968 int64(opts.SeriesLimit) > i.state.runtimeOpts.maxQuerySeriesLimit) { 1969 i.logger.Debug("overriding query response series limit", 1970 zap.Int("requested", opts.SeriesLimit), 1971 zap.Int64("maxAllowed", i.state.runtimeOpts.maxQuerySeriesLimit)) // FOLLOWUP(prateek): log query too once it's serializable. 1972 opts.SeriesLimit = int(i.state.runtimeOpts.maxQuerySeriesLimit) 1973 } 1974 if i.state.runtimeOpts.maxQueryDocsLimit > 0 && (opts.DocsLimit == 0 || 1975 int64(opts.DocsLimit) > i.state.runtimeOpts.maxQueryDocsLimit) { 1976 i.logger.Debug("overriding query response docs limit", 1977 zap.Int("requested", opts.DocsLimit), 1978 zap.Int64("maxAllowed", i.state.runtimeOpts.maxQueryDocsLimit)) // FOLLOWUP(prateek): log query too once it's serializable. 1979 opts.DocsLimit = int(i.state.runtimeOpts.maxQueryDocsLimit) 1980 } 1981 return opts 1982 } 1983 1984 type blockPresentResult struct { 1985 block index.Block 1986 latest bool 1987 } 1988 1989 func (i *nsIndex) ensureBlockPresent(blockStart xtime.UnixNano) (blockPresentResult, error) { 1990 i.state.RLock() 1991 defer i.state.RUnlock() 1992 if !i.isOpenWithRLock() { 1993 return blockPresentResult{}, errDbIndexUnableToWriteClosed 1994 } 1995 return i.ensureBlockPresentWithRLock(blockStart) 1996 } 1997 1998 func (i *nsIndex) isLatestBlockWithRLock(blockStart xtime.UnixNano) bool { 1999 return i.state.latestBlock != nil && i.state.latestBlock.StartTime().Equal(blockStart) 2000 } 2001 2002 // ensureBlockPresentWithRLock guarantees an index.Block exists for the specified 2003 // blockStart, allocating one if it does not. It returns the desired block, or 2004 // error if it's unable to do so. 2005 func (i *nsIndex) ensureBlockPresentWithRLock(blockStart xtime.UnixNano) (blockPresentResult, error) { 2006 // check if the current latest block matches the required block, this 2007 // is the usual path and can short circuit the rest of the logic in this 2008 // function in most cases. 2009 if i.isLatestBlockWithRLock(blockStart) { 2010 return blockPresentResult{ 2011 block: i.state.latestBlock, 2012 latest: true, 2013 }, nil 2014 } 2015 2016 // check if exists in the map (this can happen if the latestBlock has not 2017 // been rotated yet). 2018 if block, ok := i.state.blocksByTime[blockStart]; ok { 2019 return blockPresentResult{block: block}, nil 2020 } 2021 2022 // i.e. block start does not exist, so we have to alloc. 2023 // we release the RLock (the function is called with this lock), and acquire 2024 // the write lock to do the extra allocation. 2025 i.state.RUnlock() 2026 i.state.Lock() 2027 2028 // need to guarantee all exit paths from the function leave with the RLock 2029 // so we release the write lock and re-acquire a read lock. 2030 defer func() { 2031 i.state.Unlock() 2032 i.state.RLock() 2033 }() 2034 2035 // re-check if exists in the map (another routine did the alloc) 2036 if block, ok := i.state.blocksByTime[blockStart]; ok { 2037 return blockPresentResult{ 2038 block: block, 2039 latest: i.isLatestBlockWithRLock(blockStart), 2040 }, nil 2041 } 2042 2043 // ok now we know for sure we have to alloc 2044 block, err := i.newBlockFn(blockStart, i.nsMetadata, 2045 index.BlockOptions{}, i.namespaceRuntimeOptsMgr, i.opts.IndexOptions()) 2046 if err != nil { // unable to allocate the block, should never happen. 2047 return blockPresentResult{}, i.unableToAllocBlockInvariantError(err) 2048 } 2049 2050 // NB(bodu): Use same time barrier as `Tick` to make sealing of cold index blocks consistent. 2051 // We need to seal cold blocks write away for cold writes. 2052 if !blockStart.After(i.lastSealableBlockStart(xtime.ToUnixNano(i.nowFn()))) { 2053 if err := block.Seal(); err != nil { 2054 return blockPresentResult{}, err 2055 } 2056 } 2057 2058 // add to tracked blocks map 2059 i.state.blocksByTime[blockStart] = block 2060 2061 // update ordered blockStarts slice, and latestBlock 2062 i.updateBlockStartsWithLock() 2063 2064 return blockPresentResult{ 2065 block: block, 2066 latest: i.isLatestBlockWithRLock(blockStart), 2067 }, nil 2068 } 2069 2070 func (i *nsIndex) lastSealableBlockStart(t xtime.UnixNano) xtime.UnixNano { 2071 return retention.FlushTimeEndForBlockSize(i.blockSize, t.Add(-i.bufferPast)) 2072 } 2073 2074 func (i *nsIndex) updateBlockStartsWithLock() { 2075 // update ordered blockStarts slice 2076 var ( 2077 latestBlockStart xtime.UnixNano 2078 latestBlock index.Block 2079 ) 2080 2081 blocks := make([]blockAndBlockStart, 0, len(i.state.blocksByTime)+1) 2082 for ts, block := range i.state.blocksByTime { 2083 if ts >= latestBlockStart { 2084 latestBlockStart = ts 2085 latestBlock = block 2086 } 2087 blocks = append(blocks, blockAndBlockStart{ 2088 block: block, 2089 blockStart: ts, 2090 }) 2091 } 2092 2093 // order in desc order (i.e. reverse chronological) 2094 sort.Slice(blocks, func(i, j int) bool { 2095 return blocks[i].blockStart > blocks[j].blockStart 2096 }) 2097 2098 // NB(r): Important not to modify this once set since we take reference 2099 // to this slice with an RLock, release with RUnlock and then loop over it 2100 // during query time so it must not be altered and stay immutable. 2101 // This is done to avoid allocating a copy of the slice at query time for 2102 // each query. 2103 i.state.blocksDescOrderImmutable = blocks 2104 2105 // rotate latestBlock 2106 i.state.latestBlock = latestBlock 2107 } 2108 2109 func (i *nsIndex) isOpenWithRLock() bool { 2110 return !i.state.closed 2111 } 2112 2113 func (i *nsIndex) CleanupExpiredFileSets(t xtime.UnixNano) error { 2114 // we only expire data on drive that we don't hold a reference to, and is 2115 // past the expiration period. the earliest data we have to retain is given 2116 // by the following computation: 2117 // Min(FIRST_EXPIRED_BLOCK, EARLIEST_RETAINED_BLOCK) 2118 i.state.RLock() 2119 defer i.state.RUnlock() 2120 if i.state.closed { 2121 return errDbIndexUnableToCleanupClosed 2122 } 2123 2124 // earliest block to retain based on retention period 2125 earliestBlockStartToRetain := retention.FlushTimeStartForRetentionPeriod(i.retentionPeriod, i.blockSize, t) 2126 2127 // now we loop through the blocks we hold, to ensure we don't delete any data for them. 2128 for t := range i.state.blocksByTime { 2129 if t.Before(earliestBlockStartToRetain) { 2130 earliestBlockStartToRetain = t 2131 } 2132 } 2133 2134 // know the earliest block to retain, find all blocks earlier than it 2135 var ( 2136 pathPrefix = i.opts.CommitLogOptions().FilesystemOptions().FilePathPrefix() 2137 nsID = i.nsMetadata.ID() 2138 ) 2139 filesets, err := i.indexFilesetsBeforeFn(pathPrefix, nsID, earliestBlockStartToRetain) 2140 if err != nil { 2141 return err 2142 } 2143 2144 // and delete them 2145 return i.deleteFilesFn(filesets) 2146 } 2147 2148 func (i *nsIndex) CleanupCorruptedFileSets() error { 2149 /* 2150 Corrupted index filesets can be safely cleaned up if its not 2151 the latest volume index per index volume type/block start combo. 2152 2153 We are guaranteed not to be actively writing to an index fileset once 2154 we're already writing to later volume indices. 2155 */ 2156 fsOpts := i.opts.CommitLogOptions().FilesystemOptions() 2157 infoFiles := i.readIndexInfoFilesFn(fs.ReadIndexInfoFilesOptions{ 2158 FilePathPrefix: fsOpts.FilePathPrefix(), 2159 Namespace: i.nsMetadata.ID(), 2160 ReaderBufferSize: fsOpts.InfoReaderBufferSize(), 2161 IncludeCorrupted: true, 2162 }) 2163 2164 if len(infoFiles) == 0 { 2165 return nil 2166 } 2167 2168 var ( 2169 toDelete []string 2170 begin = 0 // marks the beginning of a subslice that contains filesets with same block starts 2171 ) 2172 // It's expected that info files are ordered by block start and volume index 2173 for j := range infoFiles { 2174 if infoFiles[begin].ID.BlockStart.Before(infoFiles[j].ID.BlockStart) { 2175 files, err := i.getCorruptedVolumesForDeletion(infoFiles[begin:j]) 2176 if err != nil { 2177 return err 2178 } 2179 toDelete = append(toDelete, files...) 2180 begin = j 2181 } else if infoFiles[begin].ID.BlockStart.After(infoFiles[j].ID.BlockStart) { 2182 errorMessage := "filesets are expected to be ordered by block start" 2183 instrument.EmitAndLogInvariantViolation(i.opts.InstrumentOptions(), func(l *zap.Logger) { 2184 l.Error(errorMessage) 2185 }) 2186 return instrument.InvariantErrorf(errorMessage) 2187 } 2188 } 2189 2190 // Process the volumes in the last block, which are not covered by the loop. 2191 files, err := i.getCorruptedVolumesForDeletion(infoFiles[begin:]) 2192 if err != nil { 2193 return err 2194 } 2195 toDelete = append(toDelete, files...) 2196 2197 return i.deleteFilesFn(toDelete) 2198 } 2199 2200 func (i *nsIndex) getCorruptedVolumesForDeletion(filesets []fs.ReadIndexInfoFileResult) ([]string, error) { 2201 if len(filesets) <= 1 { 2202 return nil, nil 2203 } 2204 2205 // Check for invariants. 2206 for j := 1; j < len(filesets); j++ { 2207 if !filesets[j-1].ID.BlockStart.Equal(filesets[j].ID.BlockStart) { 2208 errorMessage := "all the filesets passed to this function should have the same block start" 2209 instrument.EmitAndLogInvariantViolation(i.opts.InstrumentOptions(), func(l *zap.Logger) { 2210 l.Error(errorMessage) 2211 }) 2212 return nil, instrument.InvariantErrorf(errorMessage) 2213 } else if filesets[j-1].ID.VolumeIndex >= filesets[j].ID.VolumeIndex { 2214 errorMessage := "filesets should be ordered by volume index in increasing order" 2215 instrument.EmitAndLogInvariantViolation(i.opts.InstrumentOptions(), func(l *zap.Logger) { 2216 l.Error(errorMessage) 2217 }) 2218 return nil, instrument.InvariantErrorf(errorMessage) 2219 } 2220 } 2221 2222 toDelete := make([]string, 0) 2223 hasMoreRecentVolumeOfType := make(map[idxpersist.IndexVolumeType]struct{}) 2224 // Iterate filesets in reverse order to process higher volume indexes first. 2225 for j := len(filesets) - 1; j >= 0; j-- { 2226 f := filesets[j] 2227 2228 // NB: If the fileset info fields contains inconsistent information (e.g. block start inside 2229 // info file doesn't match the block start extracted from the filename), it means that info file 2230 // is missing or corrupted. Thus we cannot trust the information of this fileset 2231 // and we cannot be sure what's the actual volume type of it. However, a part of corrupted 2232 // fileset cleanup logic depends on knowing the volume type. 2233 // 2234 // Such fileset is deleted, except when it is the most recent volume in the block. 2235 // 2236 // The most recent volume is excluded because it is more likely to be actively written to. 2237 // If info file writes are not atomic, due to timing readers might observe the file 2238 // to be corrupted, even though at that moment the file is being written/re-written. 2239 if f.Corrupted && !f.ID.BlockStart.Equal(xtime.UnixNano(f.Info.BlockStart)) { 2240 if j != len(filesets)-1 { 2241 toDelete = append(toDelete, f.AbsoluteFilePaths...) 2242 } 2243 continue 2244 } 2245 2246 volType := idxpersist.DefaultIndexVolumeType 2247 if f.Info.IndexVolumeType != nil { 2248 volType = idxpersist.IndexVolumeType(f.Info.IndexVolumeType.Value) 2249 } 2250 // Delete corrupted filesets if there are more recent volumes with the same volume type. 2251 if _, ok := hasMoreRecentVolumeOfType[volType]; !ok { 2252 hasMoreRecentVolumeOfType[volType] = struct{}{} 2253 } else if f.Corrupted { 2254 toDelete = append(toDelete, f.AbsoluteFilePaths...) 2255 } 2256 } 2257 return toDelete, nil 2258 } 2259 2260 func (i *nsIndex) CleanupDuplicateFileSets(activeShards []uint32) error { 2261 fsOpts := i.opts.CommitLogOptions().FilesystemOptions() 2262 infoFiles := i.readIndexInfoFilesFn(fs.ReadIndexInfoFilesOptions{ 2263 FilePathPrefix: fsOpts.FilePathPrefix(), 2264 Namespace: i.nsMetadata.ID(), 2265 ReaderBufferSize: fsOpts.InfoReaderBufferSize(), 2266 }) 2267 2268 segmentsOrderByVolumeIndexByVolumeTypeAndBlockStart := make(map[xtime.UnixNano]map[idxpersist.IndexVolumeType][]fs.Segments) 2269 for _, file := range infoFiles { 2270 seg := fs.NewSegments(file.Info, file.ID.VolumeIndex, file.AbsoluteFilePaths) 2271 blockStart := seg.BlockStart() 2272 segmentsOrderByVolumeIndexByVolumeType, ok := segmentsOrderByVolumeIndexByVolumeTypeAndBlockStart[blockStart] 2273 if !ok { 2274 segmentsOrderByVolumeIndexByVolumeType = make(map[idxpersist.IndexVolumeType][]fs.Segments) 2275 segmentsOrderByVolumeIndexByVolumeTypeAndBlockStart[blockStart] = segmentsOrderByVolumeIndexByVolumeType 2276 } 2277 2278 volumeType := seg.VolumeType() 2279 if _, ok := segmentsOrderByVolumeIndexByVolumeType[volumeType]; !ok { 2280 segmentsOrderByVolumeIndexByVolumeType[volumeType] = make([]fs.Segments, 0) 2281 } 2282 segmentsOrderByVolumeIndexByVolumeType[volumeType] = append(segmentsOrderByVolumeIndexByVolumeType[volumeType], seg) 2283 } 2284 2285 // Ensure that segments are sorted by volume index. 2286 for _, segmentsOrderByVolumeIndexByVolumeType := range segmentsOrderByVolumeIndexByVolumeTypeAndBlockStart { 2287 for _, segs := range segmentsOrderByVolumeIndexByVolumeType { 2288 sort.SliceStable(segs, func(i, j int) bool { 2289 return segs[i].VolumeIndex() < segs[j].VolumeIndex() 2290 }) 2291 } 2292 } 2293 2294 multiErr := xerrors.NewMultiError() 2295 // Check for dupes and remove. 2296 filesToDelete := make([]string, 0) 2297 for _, segmentsOrderByVolumeIndexByVolumeType := range segmentsOrderByVolumeIndexByVolumeTypeAndBlockStart { 2298 for _, segmentsOrderByVolumeIndex := range segmentsOrderByVolumeIndexByVolumeType { 2299 segmentsToKeep := make([]fs.Segments, 0) 2300 for _, seg := range segmentsOrderByVolumeIndex { 2301 for len(segmentsToKeep) > 0 { 2302 idx := len(segmentsToKeep) - 1 2303 if previous := segmentsToKeep[idx]; seg.ShardTimeRanges().IsSuperset( 2304 previous.ShardTimeRanges().FilterShards(activeShards)) { 2305 filesToDelete = append(filesToDelete, previous.AbsoluteFilePaths()...) 2306 segmentsToKeep = segmentsToKeep[:idx] 2307 } else { 2308 break 2309 } 2310 } 2311 segmentsToKeep = append(segmentsToKeep, seg) 2312 } 2313 } 2314 } 2315 multiErr = multiErr.Add(i.deleteFilesFn(filesToDelete)) 2316 return multiErr.FinalError() 2317 } 2318 2319 func (i *nsIndex) DebugMemorySegments(opts DebugMemorySegmentsOptions) error { 2320 i.state.RLock() 2321 defer i.state.RUnlock() 2322 if i.state.closed { 2323 return errDbIndexAlreadyClosed 2324 } 2325 2326 ctx := context.NewBackground() 2327 defer ctx.Close() 2328 2329 // Create a new set of file system options to output to new directory. 2330 fsOpts := i.opts.CommitLogOptions(). 2331 FilesystemOptions(). 2332 SetFilePathPrefix(opts.OutputDirectory) 2333 2334 for _, block := range i.state.blocksByTime { 2335 segmentsData, err := block.MemorySegmentsData(ctx) 2336 if err != nil { 2337 return err 2338 } 2339 2340 for numSegment, segmentData := range segmentsData { 2341 indexWriter, err := fs.NewIndexWriter(fsOpts) 2342 if err != nil { 2343 return err 2344 } 2345 2346 fileSetID := fs.FileSetFileIdentifier{ 2347 FileSetContentType: persist.FileSetIndexContentType, 2348 Namespace: i.nsMetadata.ID(), 2349 BlockStart: block.StartTime(), 2350 VolumeIndex: numSegment, 2351 } 2352 openOpts := fs.IndexWriterOpenOptions{ 2353 Identifier: fileSetID, 2354 BlockSize: i.blockSize, 2355 FileSetType: persist.FileSetFlushType, 2356 Shards: i.state.shardsAssigned, 2357 IndexVolumeType: idxpersist.DefaultIndexVolumeType, 2358 } 2359 if err := indexWriter.Open(openOpts); err != nil { 2360 return err 2361 } 2362 2363 segWriter, err := idxpersist.NewFSTSegmentDataFileSetWriter(segmentData) 2364 if err != nil { 2365 return err 2366 } 2367 2368 if err := indexWriter.WriteSegmentFileSet(segWriter); err != nil { 2369 return err 2370 } 2371 2372 if err := indexWriter.Close(); err != nil { 2373 return err 2374 } 2375 } 2376 } 2377 2378 return nil 2379 } 2380 2381 func (i *nsIndex) Close() error { 2382 i.state.Lock() 2383 if !i.isOpenWithRLock() { 2384 i.state.Unlock() 2385 return errDbIndexAlreadyClosed 2386 } 2387 2388 i.state.closed = true 2389 close(i.state.closeCh) 2390 2391 var multiErr xerrors.MultiError 2392 multiErr = multiErr.Add(i.state.insertQueue.Stop()) 2393 2394 blocks := make([]index.Block, 0, len(i.state.blocksByTime)+1) 2395 for _, block := range i.state.blocksByTime { 2396 blocks = append(blocks, block) 2397 } 2398 blocks = append(blocks, i.activeBlock) 2399 2400 i.activeBlock = nil 2401 i.state.latestBlock = nil 2402 i.state.blocksByTime = nil 2403 i.state.blocksDescOrderImmutable = nil 2404 2405 if i.runtimeOptsListener != nil { 2406 i.runtimeOptsListener.Close() 2407 i.runtimeOptsListener = nil 2408 } 2409 2410 if i.runtimeNsOptsListener != nil { 2411 i.runtimeNsOptsListener.Close() 2412 i.runtimeNsOptsListener = nil 2413 } 2414 2415 // Can now unlock after collecting blocks to close and setting closed state. 2416 i.state.Unlock() 2417 2418 // Wait for inflight queries to finish before closing blocks, do this 2419 // outside of lock in case an inflight query needs to acquire a read lock 2420 // to finish but can't acquire it because close was holding the lock waiting 2421 // for queries to drain first. 2422 i.queriesWg.Wait() 2423 2424 for _, block := range blocks { 2425 multiErr = multiErr.Add(block.Close()) 2426 } 2427 2428 return multiErr.FinalError() 2429 } 2430 2431 func (i *nsIndex) unableToAllocBlockInvariantError(err error) error { 2432 ierr := fmt.Errorf("index unable to allocate block: %v", err) 2433 instrument.EmitAndLogInvariantViolation(i.opts.InstrumentOptions(), func(l *zap.Logger) { 2434 l.Error(ierr.Error()) 2435 }) 2436 return ierr 2437 } 2438 2439 type nsIndexMetrics struct { 2440 tick tally.Counter 2441 2442 asyncInsertAttemptTotal tally.Counter 2443 asyncInsertAttemptSkip tally.Counter 2444 asyncInsertAttemptWrite tally.Counter 2445 2446 asyncInsertSuccess tally.Counter 2447 asyncInsertErrors tally.Counter 2448 insertAfterClose tally.Counter 2449 queryAfterClose tally.Counter 2450 forwardIndexHits tally.Counter 2451 forwardIndexMisses tally.Counter 2452 forwardIndexCounter tally.Counter 2453 insertEndToEndLatency tally.Timer 2454 blocksEvictedMutableSegments tally.Counter 2455 blockMetrics nsIndexBlocksMetrics 2456 indexingConcurrencyMin tally.Gauge 2457 indexingConcurrencyMax tally.Gauge 2458 indexingConcurrencyAvg tally.Gauge 2459 flushIndexingConcurrency tally.Gauge 2460 flushDocsNew tally.Counter 2461 flushDocsCached tally.Counter 2462 latestBlockNumSegmentsForeground tally.Gauge 2463 latestBlockNumDocsForeground tally.Gauge 2464 latestBlockNumSegmentsBackground tally.Gauge 2465 latestBlockNumDocsBackground tally.Gauge 2466 2467 loadedDocsPerQuery tally.Histogram 2468 queryExhaustiveSuccess tally.Counter 2469 queryExhaustiveInternalError tally.Counter 2470 queryNonExhaustiveSuccess tally.Counter 2471 queryNonExhaustiveInternalError tally.Counter 2472 queryNonExhaustiveLimitError tally.Counter 2473 queryNonExhaustiveSeriesLimitError tally.Counter 2474 queryNonExhaustiveDocsLimitError tally.Counter 2475 } 2476 2477 func newNamespaceIndexMetrics( 2478 opts index.Options, 2479 iopts instrument.Options, 2480 ) nsIndexMetrics { 2481 const ( 2482 indexAttemptName = "index-attempt" 2483 forwardIndexName = "forward-index" 2484 indexingConcurrency = "indexing-concurrency" 2485 flushIndexingConcurrency = "flush-indexing-concurrency" 2486 ) 2487 scope := iopts.MetricsScope() 2488 blocksScope := scope.SubScope("blocks") 2489 m := nsIndexMetrics{ 2490 tick: scope.Counter("index-tick"), 2491 asyncInsertAttemptTotal: scope.Tagged(map[string]string{ 2492 "stage": "process", 2493 }).Counter(indexAttemptName), 2494 asyncInsertAttemptSkip: scope.Tagged(map[string]string{ 2495 "stage": "skip", 2496 }).Counter(indexAttemptName), 2497 asyncInsertAttemptWrite: scope.Tagged(map[string]string{ 2498 "stage": "write", 2499 }).Counter(indexAttemptName), 2500 asyncInsertSuccess: scope.Counter("index-success"), 2501 asyncInsertErrors: scope.Tagged(map[string]string{ 2502 "error_type": "async-insert", 2503 }).Counter("index-error"), 2504 insertAfterClose: scope.Tagged(map[string]string{ 2505 "error_type": "insert-closed", 2506 }).Counter("insert-after-close"), 2507 queryAfterClose: scope.Tagged(map[string]string{ 2508 "error_type": "query-closed", 2509 }).Counter("query-after-error"), 2510 forwardIndexHits: scope.Tagged(map[string]string{ 2511 "status": "hit", 2512 }).Counter(forwardIndexName), 2513 forwardIndexMisses: scope.Tagged(map[string]string{ 2514 "status": "miss", 2515 }).Counter(forwardIndexName), 2516 forwardIndexCounter: scope.Tagged(map[string]string{ 2517 "status": "count", 2518 }).Counter(forwardIndexName), 2519 insertEndToEndLatency: instrument.NewTimer(scope, 2520 "insert-end-to-end-latency", iopts.TimerOptions()), 2521 blocksEvictedMutableSegments: scope.Counter("blocks-evicted-mutable-segments"), 2522 blockMetrics: newNamespaceIndexBlocksMetrics(opts, blocksScope), 2523 indexingConcurrencyMin: scope.Tagged(map[string]string{ 2524 "stat": "min", 2525 }).Gauge(indexingConcurrency), 2526 indexingConcurrencyMax: scope.Tagged(map[string]string{ 2527 "stat": "max", 2528 }).Gauge(indexingConcurrency), 2529 indexingConcurrencyAvg: scope.Tagged(map[string]string{ 2530 "stat": "avg", 2531 }).Gauge(indexingConcurrency), 2532 flushIndexingConcurrency: scope.Gauge(flushIndexingConcurrency), 2533 flushDocsNew: scope.Tagged(map[string]string{ 2534 "status": "new", 2535 }).Counter("flush-docs"), 2536 flushDocsCached: scope.Tagged(map[string]string{ 2537 "status": "cached", 2538 }).Counter("flush-docs"), 2539 latestBlockNumSegmentsForeground: scope.Tagged(map[string]string{ 2540 "segment_type": "foreground", 2541 }).Gauge("latest-block-num-segments"), 2542 latestBlockNumDocsForeground: scope.Tagged(map[string]string{ 2543 "segment_type": "foreground", 2544 }).Gauge("latest-block-num-docs"), 2545 latestBlockNumSegmentsBackground: scope.Tagged(map[string]string{ 2546 "segment_type": "background", 2547 }).Gauge("latest-block-num-segments"), 2548 latestBlockNumDocsBackground: scope.Tagged(map[string]string{ 2549 "segment_type": "background", 2550 }).Gauge("latest-block-num-docs"), 2551 loadedDocsPerQuery: scope.Histogram( 2552 "loaded-docs-per-query", 2553 tally.MustMakeExponentialValueBuckets(10, 2, 16), 2554 ), 2555 queryExhaustiveSuccess: scope.Tagged(map[string]string{ 2556 "exhaustive": "true", 2557 "result": "success", 2558 }).Counter("query"), 2559 queryExhaustiveInternalError: scope.Tagged(map[string]string{ 2560 "exhaustive": "true", 2561 "result": "error_internal", 2562 }).Counter("query"), 2563 queryNonExhaustiveSuccess: scope.Tagged(map[string]string{ 2564 "exhaustive": "false", 2565 "result": "success", 2566 }).Counter("query"), 2567 queryNonExhaustiveInternalError: scope.Tagged(map[string]string{ 2568 "exhaustive": "false", 2569 "result": "error_internal", 2570 }).Counter("query"), 2571 queryNonExhaustiveLimitError: scope.Tagged(map[string]string{ 2572 "exhaustive": "false", 2573 "result": "error_require_exhaustive", 2574 }).Counter("query"), 2575 queryNonExhaustiveSeriesLimitError: scope.Tagged(map[string]string{ 2576 "exhaustive": "false", 2577 "result": "error_series_require_exhaustive", 2578 }).Counter("query"), 2579 queryNonExhaustiveDocsLimitError: scope.Tagged(map[string]string{ 2580 "exhaustive": "false", 2581 "result": "error_docs_require_exhaustive", 2582 }).Counter("query"), 2583 } 2584 2585 // Initialize gauges that should default to zero before 2586 // returning results so that they are exported with an 2587 // explicit zero value at process startup. 2588 m.flushIndexingConcurrency.Update(0) 2589 2590 return m 2591 } 2592 2593 type nsIndexBlocksMetrics struct { 2594 ForegroundSegments nsIndexBlocksSegmentsMetrics 2595 BackgroundSegments nsIndexBlocksSegmentsMetrics 2596 FlushedSegments nsIndexBlocksSegmentsMetrics 2597 } 2598 2599 func newNamespaceIndexBlocksMetrics( 2600 opts index.Options, 2601 scope tally.Scope, 2602 ) nsIndexBlocksMetrics { 2603 return nsIndexBlocksMetrics{ 2604 ForegroundSegments: newNamespaceIndexBlocksSegmentsMetrics( 2605 opts.ForegroundCompactionPlannerOptions(), 2606 scope.Tagged(map[string]string{ 2607 "segment-type": "foreground", 2608 })), 2609 BackgroundSegments: newNamespaceIndexBlocksSegmentsMetrics( 2610 opts.BackgroundCompactionPlannerOptions(), 2611 scope.Tagged(map[string]string{ 2612 "segment-type": "background", 2613 })), 2614 FlushedSegments: newNamespaceIndexBlocksSegmentsMetrics( 2615 opts.BackgroundCompactionPlannerOptions(), 2616 scope.Tagged(map[string]string{ 2617 "segment-type": "flushed", 2618 })), 2619 } 2620 } 2621 2622 type nsIndexBlocksSegmentsMetrics struct { 2623 Levels []nsIndexBlocksSegmentsLevelMetrics 2624 } 2625 2626 type nsIndexBlocksSegmentsLevelMetrics struct { 2627 MinSizeInclusive int64 2628 MaxSizeExclusive int64 2629 NumSegments tally.Gauge 2630 NumTotalDocs tally.Gauge 2631 SegmentsAge tally.Timer 2632 } 2633 2634 func newNamespaceIndexBlocksSegmentsMetrics( 2635 compactionOpts compaction.PlannerOptions, 2636 scope tally.Scope, 2637 ) nsIndexBlocksSegmentsMetrics { 2638 segmentLevelsScope := scope.SubScope("segment-levels") 2639 levels := make([]nsIndexBlocksSegmentsLevelMetrics, 0, len(compactionOpts.Levels)) 2640 for _, level := range compactionOpts.Levels { 2641 subScope := segmentLevelsScope.Tagged(map[string]string{ 2642 "level-min-size": strconv.Itoa(int(level.MinSizeInclusive)), 2643 "level-max-size": strconv.Itoa(int(level.MaxSizeExclusive)), 2644 }) 2645 levels = append(levels, nsIndexBlocksSegmentsLevelMetrics{ 2646 MinSizeInclusive: level.MinSizeInclusive, 2647 MaxSizeExclusive: level.MaxSizeExclusive, 2648 NumSegments: subScope.Gauge("num-segments"), 2649 NumTotalDocs: subScope.Gauge("num-total-docs"), 2650 SegmentsAge: subScope.Timer("segments-age"), 2651 }) 2652 } 2653 2654 return nsIndexBlocksSegmentsMetrics{ 2655 Levels: levels, 2656 } 2657 } 2658 2659 type dbShards []databaseShard 2660 2661 func (shards dbShards) IDs() []uint32 { 2662 ids := make([]uint32, 0, len(shards)) 2663 for _, s := range shards { 2664 ids = append(ids, s.ID()) 2665 } 2666 return ids 2667 } 2668 2669 // blocksIterStackAlloc is a stack allocated block iterator, ensuring no 2670 // allocations per query. 2671 type blocksIterStackAlloc struct { 2672 activeBlock index.Block 2673 blocks []blockAndBlockStart 2674 queryRanges xtime.Ranges 2675 idx int 2676 } 2677 2678 func newBlocksIterStackAlloc( 2679 activeBlock index.Block, 2680 blocks []blockAndBlockStart, 2681 queryRanges xtime.Ranges, 2682 ) blocksIterStackAlloc { 2683 return blocksIterStackAlloc{ 2684 activeBlock: activeBlock, 2685 blocks: blocks, 2686 queryRanges: queryRanges, 2687 idx: -2, 2688 } 2689 } 2690 2691 func (i blocksIterStackAlloc) Next() (blocksIterStackAlloc, bool) { 2692 iter := i 2693 2694 for { 2695 iter.idx++ 2696 if iter.idx == -1 { 2697 // This will return the active block. 2698 return iter, true 2699 } 2700 2701 // No more ranges to query, perform this second so that 2702 // the in memory block always returns results. 2703 if i.queryRanges.IsEmpty() { 2704 return iter, false 2705 } 2706 2707 if iter.idx >= len(i.blocks) { 2708 return iter, false 2709 } 2710 2711 block := i.blocks[iter.idx].block 2712 2713 // Ensure the block has data requested by the query. 2714 blockRange := xtime.Range{ 2715 Start: block.StartTime(), 2716 End: block.EndTime(), 2717 } 2718 if !i.queryRanges.Overlaps(blockRange) { 2719 continue 2720 } 2721 2722 // Remove this range from the query range. 2723 i.queryRanges.RemoveRange(blockRange) 2724 2725 return iter, true 2726 } 2727 } 2728 2729 func (i blocksIterStackAlloc) Current() index.Block { 2730 if i.idx == -1 { 2731 return i.activeBlock 2732 } 2733 return i.blocks[i.idx].block 2734 }