github.com/m3db/m3@v1.5.1-0.20231129193456-75a402aa583b/src/dbnode/storage/namespace.go (about) 1 // Copyright (c) 2016 Uber Technologies, Inc. 2 // 3 // Permission is hereby granted, free of charge, to any person obtaining a copy 4 // of this software and associated documentation files (the "Software"), to deal 5 // in the Software without restriction, including without limitation the rights 6 // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 // copies of the Software, and to permit persons to whom the Software is 8 // furnished to do so, subject to the following conditions: 9 // 10 // The above copyright notice and this permission notice shall be included in 11 // all copies or substantial portions of the Software. 12 // 13 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 // THE SOFTWARE. 20 21 package storage 22 23 import ( 24 "errors" 25 "fmt" 26 iofs "io/fs" 27 "math" 28 "runtime" 29 "sync" 30 "time" 31 32 "github.com/m3db/m3/src/dbnode/namespace" 33 "github.com/m3db/m3/src/dbnode/persist" 34 "github.com/m3db/m3/src/dbnode/persist/fs" 35 "github.com/m3db/m3/src/dbnode/sharding" 36 "github.com/m3db/m3/src/dbnode/storage/block" 37 "github.com/m3db/m3/src/dbnode/storage/bootstrap" 38 "github.com/m3db/m3/src/dbnode/storage/bootstrap/result" 39 "github.com/m3db/m3/src/dbnode/storage/index" 40 "github.com/m3db/m3/src/dbnode/storage/index/convert" 41 "github.com/m3db/m3/src/dbnode/storage/repair" 42 "github.com/m3db/m3/src/dbnode/storage/series" 43 "github.com/m3db/m3/src/dbnode/tracepoint" 44 "github.com/m3db/m3/src/dbnode/ts" 45 "github.com/m3db/m3/src/dbnode/ts/writes" 46 "github.com/m3db/m3/src/m3ninx/doc" 47 idxpersist "github.com/m3db/m3/src/m3ninx/persist" 48 "github.com/m3db/m3/src/x/clock" 49 "github.com/m3db/m3/src/x/context" 50 xerrors "github.com/m3db/m3/src/x/errors" 51 "github.com/m3db/m3/src/x/ident" 52 "github.com/m3db/m3/src/x/instrument" 53 xopentracing "github.com/m3db/m3/src/x/opentracing" 54 xresource "github.com/m3db/m3/src/x/resource" 55 xsync "github.com/m3db/m3/src/x/sync" 56 xtime "github.com/m3db/m3/src/x/time" 57 58 opentracinglog "github.com/opentracing/opentracing-go/log" 59 "github.com/uber-go/tally" 60 "go.uber.org/zap" 61 ) 62 63 var ( 64 errNamespaceAlreadyClosed = errors.New("namespace already closed") 65 errNamespaceIndexingDisabled = errors.New("namespace indexing is disabled") 66 errNamespaceReadOnly = errors.New("cannot write to a read only namespace") 67 ) 68 69 type commitLogWriter interface { 70 Write( 71 ctx context.Context, 72 series ts.Series, 73 datapoint ts.Datapoint, 74 unit xtime.Unit, 75 annotation ts.Annotation, 76 ) error 77 } 78 79 type commitLogWriterFn func( 80 ctx context.Context, 81 series ts.Series, 82 datapoint ts.Datapoint, 83 unit xtime.Unit, 84 annotation ts.Annotation, 85 ) error 86 87 func (fn commitLogWriterFn) Write( 88 ctx context.Context, 89 series ts.Series, 90 datapoint ts.Datapoint, 91 unit xtime.Unit, 92 annotation ts.Annotation, 93 ) error { 94 return fn(ctx, series, datapoint, unit, annotation) 95 } 96 97 var commitLogWriteNoOp = 
commitLogWriter(commitLogWriterFn(func( 98 ctx context.Context, 99 series ts.Series, 100 datapoint ts.Datapoint, 101 unit xtime.Unit, 102 annotation ts.Annotation, 103 ) error { 104 return nil 105 })) 106 107 type createEmptyWarmIndexIfNotExistsFn func(blockStart xtime.UnixNano) error 108 109 type dbNamespace struct { 110 sync.RWMutex 111 112 closed bool 113 readOnly bool 114 shutdownCh chan struct{} 115 id ident.ID 116 shardSet sharding.ShardSet 117 blockRetriever block.DatabaseBlockRetriever 118 namespaceReaderMgr databaseNamespaceReaderManager 119 opts Options 120 metadata namespace.Metadata 121 nopts namespace.Options 122 seriesOpts series.Options 123 nowFn clock.NowFn 124 snapshotFilesFn snapshotFilesFn 125 log *zap.Logger 126 bootstrapState BootstrapState 127 repairsAny bool 128 129 // schemaDescr caches the latest schema for the namespace. 130 // schemaDescr is updated whenever schema registry is updated. 131 schemaListener xresource.SimpleCloser 132 schemaDescr namespace.SchemaDescr 133 134 // Contains an entry to all shards for fast shard lookup, an 135 // entry will be nil when this shard does not belong to current database 136 shards []databaseShard 137 138 increasingIndex increasingIndex 139 commitLogWriter commitLogWriter 140 reverseIndex NamespaceIndex 141 142 createEmptyWarmIndexIfNotExistsFn createEmptyWarmIndexIfNotExistsFn 143 144 tickWorkers xsync.WorkerPool 145 tickWorkersConcurrency int 146 statsLastTick databaseNamespaceStatsLastTick 147 148 metrics databaseNamespaceMetrics 149 } 150 151 type databaseNamespaceStatsLastTick struct { 152 sync.RWMutex 153 activeSeries int64 154 activeBlocks int64 155 index databaseNamespaceIndexStatsLastTick 156 } 157 158 type databaseNamespaceIndexStatsLastTick struct { 159 numDocs int64 160 numBlocks int64 161 numSegments int64 162 } 163 164 type databaseNamespaceMetrics struct { 165 bootstrap instrument.MethodMetrics 166 flushWarmData instrument.MethodMetrics 167 flushColdData instrument.MethodMetrics 168 flushIndex instrument.MethodMetrics 169 snapshot instrument.MethodMetrics 170 write instrument.MethodMetrics 171 writeTagged instrument.MethodMetrics 172 read instrument.MethodMetrics 173 fetchBlocks instrument.MethodMetrics 174 fetchBlocksMetadata instrument.MethodMetrics 175 queryIDs instrument.MethodMetrics 176 aggregateQuery instrument.MethodMetrics 177 178 unfulfilled tally.Counter 179 bootstrapStart tally.Counter 180 bootstrapEnd tally.Counter 181 snapshotSeriesPersist tally.Counter 182 writesWithoutAnnotation tally.Counter 183 184 shards databaseNamespaceShardMetrics 185 tick databaseNamespaceTickMetrics 186 status databaseNamespaceStatusMetrics 187 188 repairDifferingPercent tally.Gauge 189 repairComparedBlocks tally.Counter 190 repairDifferingBlocks tally.Counter 191 repairMismatchBlocks tally.Counter 192 repairMissingBlocks tally.Counter 193 repairExtraBlocks tally.Counter 194 } 195 196 type databaseNamespaceShardMetrics struct { 197 add tally.Counter 198 close tally.Counter 199 closeErrors tally.Counter 200 } 201 202 type databaseNamespaceTickMetrics struct { 203 activeSeries tally.Gauge 204 expiredSeries tally.Counter 205 activeBlocks tally.Gauge 206 wiredBlocks tally.Gauge 207 unwiredBlocks tally.Gauge 208 pendingMergeBlocks tally.Gauge 209 madeUnwiredBlocks tally.Counter 210 madeExpiredBlocks tally.Counter 211 mergedOutOfOrderBlocks tally.Counter 212 errors tally.Counter 213 index databaseNamespaceIndexTickMetrics 214 evictedBuckets tally.Counter 215 } 216 217 type databaseNamespaceIndexTickMetrics struct { 218 
numBlocks tally.Gauge 219 numDocs tally.Gauge 220 numSegments tally.Gauge 221 numBlocksSealed tally.Counter 222 numBlocksEvicted tally.Counter 223 } 224 225 // databaseNamespaceStatusMetrics are metrics emitted at a fixed interval 226 // so that summing the value of gauges across hosts when graphed summarizing 227 // values at the same fixed intervals can show meaningful results (vs variably 228 // emitted values that can be aggregated across hosts to see a snapshot). 229 type databaseNamespaceStatusMetrics struct { 230 activeSeries tally.Gauge 231 activeBlocks tally.Gauge 232 index databaseNamespaceIndexStatusMetrics 233 } 234 235 type databaseNamespaceIndexStatusMetrics struct { 236 numDocs tally.Gauge 237 numBlocks tally.Gauge 238 numSegments tally.Gauge 239 } 240 241 func newDatabaseNamespaceMetrics( 242 scope tally.Scope, 243 opts instrument.TimerOptions, 244 ) databaseNamespaceMetrics { 245 shardsScope := scope.SubScope("dbnamespace").SubScope("shards") 246 tickScope := scope.SubScope("tick") 247 indexTickScope := tickScope.SubScope("index") 248 statusScope := scope.SubScope("status") 249 indexStatusScope := statusScope.SubScope("index") 250 bootstrapScope := scope.SubScope("bootstrap") 251 snapshotScope := scope.SubScope("snapshot") 252 repairScope := scope.SubScope("repair") 253 return databaseNamespaceMetrics{ 254 bootstrap: instrument.NewMethodMetrics(scope, "bootstrap", opts), 255 flushWarmData: instrument.NewMethodMetrics(scope, "flushWarmData", opts), 256 flushColdData: instrument.NewMethodMetrics(scope, "flushColdData", opts), 257 flushIndex: instrument.NewMethodMetrics(scope, "flushIndex", opts), 258 snapshot: instrument.NewMethodMetrics(scope, "snapshot", opts), 259 write: instrument.NewMethodMetrics(scope, "write", opts), 260 writeTagged: instrument.NewMethodMetrics(scope, "write-tagged", opts), 261 read: instrument.NewMethodMetrics(scope, "read", opts), 262 fetchBlocks: instrument.NewMethodMetrics(scope, "fetchBlocks", opts), 263 fetchBlocksMetadata: instrument.NewMethodMetrics(scope, "fetchBlocksMetadata", opts), 264 queryIDs: instrument.NewMethodMetrics(scope, "queryIDs", opts), 265 aggregateQuery: instrument.NewMethodMetrics(scope, "aggregateQuery", opts), 266 267 unfulfilled: bootstrapScope.Counter("unfulfilled"), 268 bootstrapStart: bootstrapScope.Counter("start"), 269 bootstrapEnd: bootstrapScope.Counter("end"), 270 snapshotSeriesPersist: snapshotScope.Counter("series-persist"), 271 writesWithoutAnnotation: scope.Counter("writes-without-annotation"), 272 273 shards: databaseNamespaceShardMetrics{ 274 add: shardsScope.Counter("add"), 275 close: shardsScope.Counter("close"), 276 closeErrors: shardsScope.Counter("close-errors"), 277 }, 278 tick: databaseNamespaceTickMetrics{ 279 activeSeries: tickScope.Gauge("active-series"), 280 expiredSeries: tickScope.Counter("expired-series"), 281 activeBlocks: tickScope.Gauge("active-blocks"), 282 wiredBlocks: tickScope.Gauge("wired-blocks"), 283 unwiredBlocks: tickScope.Gauge("unwired-blocks"), 284 pendingMergeBlocks: tickScope.Gauge("pending-merge-blocks"), 285 madeUnwiredBlocks: tickScope.Counter("made-unwired-blocks"), 286 madeExpiredBlocks: tickScope.Counter("made-expired-blocks"), 287 mergedOutOfOrderBlocks: tickScope.Counter("merged-out-of-order-blocks"), 288 errors: tickScope.Counter("errors"), 289 index: databaseNamespaceIndexTickMetrics{ 290 numDocs: indexTickScope.Gauge("num-docs"), 291 numBlocks: indexTickScope.Gauge("num-blocks"), 292 numSegments: indexTickScope.Gauge("num-segments"), 293 numBlocksSealed: 
indexTickScope.Counter("num-blocks-sealed"), 294 numBlocksEvicted: indexTickScope.Counter("num-blocks-evicted"), 295 }, 296 evictedBuckets: tickScope.Counter("evicted-buckets"), 297 }, 298 status: databaseNamespaceStatusMetrics{ 299 activeSeries: statusScope.Gauge("active-series"), 300 activeBlocks: statusScope.Gauge("active-blocks"), 301 index: databaseNamespaceIndexStatusMetrics{ 302 numDocs: indexStatusScope.Gauge("num-docs"), 303 numBlocks: indexStatusScope.Gauge("num-blocks"), 304 numSegments: indexStatusScope.Gauge("num-segments"), 305 }, 306 }, 307 repairDifferingPercent: repairScope.Gauge("differing-percent"), 308 repairComparedBlocks: repairScope.Counter("compared-blocks"), 309 repairDifferingBlocks: repairScope.Counter("differing-blocks"), 310 repairMismatchBlocks: repairScope.Counter("mismatch-blocks"), 311 repairMissingBlocks: repairScope.Counter("missing-blocks"), 312 repairExtraBlocks: repairScope.Counter("extra-blocks"), 313 } 314 } 315 316 func newDatabaseNamespace( 317 metadata namespace.Metadata, 318 namespaceRuntimeOptsMgr namespace.RuntimeOptionsManager, 319 shardSet sharding.ShardSet, 320 blockRetriever block.DatabaseBlockRetriever, 321 increasingIndex increasingIndex, 322 commitLogWriter commitLogWriter, 323 opts Options, 324 ) (databaseNamespace, error) { 325 var ( 326 nopts = metadata.Options() 327 id = metadata.ID() 328 ) 329 if !nopts.WritesToCommitLog() { 330 commitLogWriter = commitLogWriteNoOp 331 } 332 333 iops := opts.InstrumentOptions() 334 logger := iops.Logger().With(zap.String("namespace", id.String())) 335 iops = iops. 336 SetLogger(logger). 337 SetMetricsScope(iops.MetricsScope().Tagged(map[string]string{ 338 "namespace": id.String(), 339 })) 340 opts = opts.SetInstrumentOptions(iops) 341 342 scope := iops.MetricsScope().SubScope("database") 343 344 tickWorkersConcurrency := int(math.Max(1, float64(runtime.GOMAXPROCS(0))/8)) 345 tickWorkers := xsync.NewWorkerPool(tickWorkersConcurrency) 346 tickWorkers.Init() 347 348 seriesOpts := NewSeriesOptionsFromOptions(opts, nopts.RetentionOptions()). 349 SetStats(series.NewStats(scope)). 350 SetColdWritesEnabled(nopts.ColdWritesEnabled()) 351 if err := seriesOpts.Validate(); err != nil { 352 return nil, fmt.Errorf( 353 "unable to create namespace %v, invalid series options: %v", 354 metadata.ID().String(), err) 355 } 356 357 var ( 358 index NamespaceIndex 359 err error 360 ) 361 if metadata.Options().IndexOptions().Enabled() { 362 index, err = newNamespaceIndex(metadata, namespaceRuntimeOptsMgr, 363 shardSet, opts) 364 if err != nil { 365 return nil, err 366 } 367 } 368 369 n := &dbNamespace{ 370 id: id, 371 shutdownCh: make(chan struct{}), 372 shardSet: shardSet, 373 blockRetriever: blockRetriever, 374 namespaceReaderMgr: newNamespaceReaderManager(metadata, scope, opts), 375 opts: opts, 376 metadata: metadata, 377 nopts: nopts, 378 seriesOpts: seriesOpts, 379 nowFn: opts.ClockOptions().NowFn(), 380 snapshotFilesFn: fs.SnapshotFiles, 381 log: logger, 382 increasingIndex: increasingIndex, 383 commitLogWriter: commitLogWriter, 384 reverseIndex: index, 385 tickWorkers: tickWorkers, 386 tickWorkersConcurrency: tickWorkersConcurrency, 387 metrics: newDatabaseNamespaceMetrics(scope, iops.TimerOptions()), 388 } 389 390 n.createEmptyWarmIndexIfNotExistsFn = n.createEmptyWarmIndexIfNotExists 391 392 sl, err := opts.SchemaRegistry().RegisterListener(id, n) 393 // Fail to create namespace is schema listener can not be registered successfully. 394 // If proto is disabled, err will always be nil. 
395 if err != nil { 396 return nil, fmt.Errorf( 397 "unable to register schema listener for namespace %v, error: %v", 398 metadata.ID().String(), err) 399 } 400 n.schemaListener = sl 401 n.assignShardSet(shardSet, assignShardSetOptions{ 402 needsBootstrap: nopts.BootstrapEnabled(), 403 initialAssignment: true, 404 }) 405 go n.reportStatusLoop(opts.InstrumentOptions().ReportInterval()) 406 407 return n, nil 408 } 409 410 // SetSchemaHistory implements namespace.SchemaListener. 411 func (n *dbNamespace) SetSchemaHistory(value namespace.SchemaHistory) { 412 n.Lock() 413 defer n.Unlock() 414 415 schema, ok := value.GetLatest() 416 if !ok { 417 n.log.Error("can not update namespace schema to empty", zap.Stringer("namespace", n.ID())) 418 return 419 } 420 421 metadata, err := namespace.NewMetadata(n.ID(), n.nopts.SetSchemaHistory(value)) 422 if err != nil { 423 n.log.Error("can not update namespace metadata with empty schema history", zap.Stringer("namespace", n.ID()), zap.Error(err)) 424 return 425 } 426 427 n.schemaDescr = schema 428 n.metadata = metadata 429 } 430 431 func (n *dbNamespace) reportStatusLoop(reportInterval time.Duration) { 432 ticker := time.NewTicker(reportInterval) 433 defer ticker.Stop() 434 for { 435 select { 436 case <-n.shutdownCh: 437 return 438 case <-ticker.C: 439 n.statsLastTick.RLock() 440 n.metrics.status.activeSeries.Update(float64(n.statsLastTick.activeSeries)) 441 n.metrics.status.activeBlocks.Update(float64(n.statsLastTick.activeBlocks)) 442 n.metrics.status.index.numDocs.Update(float64(n.statsLastTick.index.numDocs)) 443 n.metrics.status.index.numBlocks.Update(float64(n.statsLastTick.index.numBlocks)) 444 n.metrics.status.index.numSegments.Update(float64(n.statsLastTick.index.numSegments)) 445 n.statsLastTick.RUnlock() 446 } 447 } 448 } 449 450 func (n *dbNamespace) Options() namespace.Options { 451 return n.nopts 452 } 453 454 func (n *dbNamespace) StorageOptions() Options { 455 return n.opts 456 } 457 458 func (n *dbNamespace) ID() ident.ID { 459 return n.id 460 } 461 462 func (n *dbNamespace) Metadata() namespace.Metadata { 463 // NB(r): metadata is updated in SetSchemaHistory so requires an RLock. 
464 n.RLock() 465 result := n.metadata 466 n.RUnlock() 467 return result 468 } 469 470 func (n *dbNamespace) Schema() namespace.SchemaDescr { 471 n.RLock() 472 schema := n.schemaDescr 473 n.RUnlock() 474 return schema 475 } 476 477 func (n *dbNamespace) NumSeries() int64 { 478 var count int64 479 for _, shard := range n.OwnedShards() { 480 count += shard.NumSeries() 481 } 482 return count 483 } 484 485 func (n *dbNamespace) Shards() []Shard { 486 n.RLock() 487 shards := n.shardSet.AllIDs() 488 databaseShards := make([]Shard, len(shards)) 489 for i, shard := range shards { 490 databaseShards[i] = n.shards[shard] 491 } 492 n.RUnlock() 493 return databaseShards 494 } 495 496 func (n *dbNamespace) AssignShardSet(shardSet sharding.ShardSet) { 497 n.assignShardSet(shardSet, assignShardSetOptions{ 498 needsBootstrap: n.nopts.BootstrapEnabled(), 499 initialAssignment: false, 500 }) 501 } 502 503 type assignShardSetOptions struct { 504 needsBootstrap bool 505 initialAssignment bool 506 } 507 508 func (n *dbNamespace) assignShardSet( 509 shardSet sharding.ShardSet, 510 opts assignShardSetOptions, 511 ) { 512 var ( 513 incoming = make(map[uint32]struct{}, len(shardSet.All())) 514 createdShardIds []uint32 515 existing []databaseShard 516 closing []databaseShard 517 ) 518 for _, shard := range shardSet.AllIDs() { 519 incoming[shard] = struct{}{} 520 } 521 522 n.Lock() 523 metadata := n.metadata 524 existing = n.shards 525 for _, shard := range existing { 526 if shard == nil { 527 continue 528 } 529 if _, ok := incoming[shard.ID()]; !ok { 530 closing = append(closing, shard) 531 } 532 } 533 n.shardSet = shardSet 534 n.shards = make([]databaseShard, n.shardSet.Max()+1) 535 for _, shard := range n.shardSet.AllIDs() { 536 // We create shards if its an initial assignment or if its not an initial assignment 537 // and the shard doesn't already exist. 538 if !opts.initialAssignment && int(shard) < len(existing) && existing[shard] != nil { 539 n.shards[shard] = existing[shard] 540 continue 541 } 542 543 // Otherwise it's the initial assignment or there isn't an existing 544 // shard created for this shard ID. 545 n.shards[shard] = newDatabaseShard(metadata, shard, n.blockRetriever, 546 n.namespaceReaderMgr, n.increasingIndex, n.reverseIndex, 547 opts.needsBootstrap, n.opts, n.seriesOpts) 548 createdShardIds = append(createdShardIds, shard) 549 // NB(bodu): We only record shard add metrics for shards created in non 550 // initial assignments. 551 if !opts.initialAssignment { 552 n.metrics.shards.add.Inc(1) 553 } 554 } 555 556 if len(createdShardIds) > 0 { 557 n.log.Info("created new shards", 558 zap.Stringer("namespace", n.ID()), 559 zap.Uint32s("shards", createdShardIds), 560 zap.Bool("initialAssignment", opts.initialAssignment), 561 zap.Bool("needsBootstrap", opts.needsBootstrap)) 562 } 563 564 if idx := n.reverseIndex; idx != nil { 565 idx.AssignShardSet(shardSet) 566 } 567 if br := n.blockRetriever; br != nil { 568 br.AssignShardSet(shardSet) 569 } 570 if mgr := n.namespaceReaderMgr; mgr != nil { 571 mgr.assignShardSet(shardSet) 572 } 573 574 n.Unlock() 575 n.closeShards(closing, false) 576 } 577 578 func (n *dbNamespace) closeShards(shards []databaseShard, blockUntilClosed bool) { 579 var wg sync.WaitGroup 580 // NB(r): There is a shard close deadline that controls how fast each 581 // shard closes set in the options. 
To make sure this is the single 582 // point of control for determining how impactful closing shards may 583 // be to performance, we let this be the single gate and simply spin 584 // up a goroutine per shard that we need to close and rely on the self 585 // throttling of each shard as determined by the close shard deadline to 586 // gate the impact. 587 closeFn := func(shard databaseShard) { 588 defer wg.Done() 589 if err := shard.Close(); err != nil { 590 n.log. 591 With(zap.Uint32("shard", shard.ID())). 592 Error("error occurred closing shard", zap.Error(err)) 593 n.metrics.shards.closeErrors.Inc(1) 594 } else { 595 n.metrics.shards.close.Inc(1) 596 } 597 } 598 599 wg.Add(len(shards)) 600 for _, shard := range shards { 601 dbShard := shard 602 if dbShard == nil { 603 continue 604 } 605 go closeFn(dbShard) 606 } 607 608 if blockUntilClosed { 609 wg.Wait() 610 } 611 } 612 613 func (n *dbNamespace) Tick(c context.Cancellable, startTime xtime.UnixNano) error { 614 // Allow the reader cache to tick. 615 n.namespaceReaderMgr.tick() 616 617 // Fetch the owned shards. 618 shards := n.OwnedShards() 619 if len(shards) == 0 { 620 return nil 621 } 622 623 n.RLock() 624 nsCtx := n.nsContextWithRLock() 625 n.RUnlock() 626 627 // Tick through the shards at a capped level of concurrency. 628 var ( 629 r tickResult 630 multiErr xerrors.MultiError 631 l sync.Mutex 632 wg sync.WaitGroup 633 ) 634 for _, shard := range shards { 635 shard := shard 636 wg.Add(1) 637 n.tickWorkers.Go(func() { 638 defer wg.Done() 639 640 if c.IsCancelled() { 641 return 642 } 643 644 shardResult, err := shard.Tick(c, startTime, nsCtx) 645 646 l.Lock() 647 r = r.merge(shardResult) 648 multiErr = multiErr.Add(err) 649 l.Unlock() 650 }) 651 } 652 653 wg.Wait() 654 655 // Tick namespaceIndex if it exists. 656 var ( 657 indexTickResults namespaceIndexTickResult 658 err error 659 ) 660 if idx := n.reverseIndex; idx != nil { 661 indexTickResults, err = idx.Tick(c, startTime) 662 if err != nil { 663 multiErr = multiErr.Add(err) 664 } 665 } 666 667 // NB: we early terminate here to ensure we are not reporting metrics 668 // based on in-accurate/partial tick results. 
669 if err := multiErr.FinalError(); err != nil || c.IsCancelled() { 670 return err 671 } 672 673 n.statsLastTick.Lock() 674 n.statsLastTick.activeSeries = int64(r.activeSeries) 675 n.statsLastTick.activeBlocks = int64(r.activeBlocks) 676 n.statsLastTick.index = databaseNamespaceIndexStatsLastTick{ 677 numDocs: indexTickResults.NumTotalDocs, 678 numBlocks: indexTickResults.NumBlocks, 679 numSegments: indexTickResults.NumSegments, 680 } 681 n.statsLastTick.Unlock() 682 683 n.metrics.tick.activeSeries.Update(float64(r.activeSeries)) 684 n.metrics.tick.expiredSeries.Inc(int64(r.expiredSeries)) 685 n.metrics.tick.activeBlocks.Update(float64(r.activeBlocks)) 686 n.metrics.tick.wiredBlocks.Update(float64(r.wiredBlocks)) 687 n.metrics.tick.unwiredBlocks.Update(float64(r.unwiredBlocks)) 688 n.metrics.tick.pendingMergeBlocks.Update(float64(r.pendingMergeBlocks)) 689 n.metrics.tick.madeExpiredBlocks.Inc(int64(r.madeExpiredBlocks)) 690 n.metrics.tick.madeUnwiredBlocks.Inc(int64(r.madeUnwiredBlocks)) 691 n.metrics.tick.mergedOutOfOrderBlocks.Inc(int64(r.mergedOutOfOrderBlocks)) 692 n.metrics.tick.evictedBuckets.Inc(int64(r.evictedBuckets)) 693 n.metrics.tick.index.numDocs.Update(float64(indexTickResults.NumTotalDocs)) 694 n.metrics.tick.index.numBlocks.Update(float64(indexTickResults.NumBlocks)) 695 n.metrics.tick.index.numSegments.Update(float64(indexTickResults.NumSegments)) 696 n.metrics.tick.index.numBlocksEvicted.Inc(indexTickResults.NumBlocksEvicted) 697 n.metrics.tick.index.numBlocksSealed.Inc(indexTickResults.NumBlocksSealed) 698 n.metrics.tick.errors.Inc(int64(r.errors)) 699 700 return nil 701 } 702 703 func (n *dbNamespace) Write( 704 ctx context.Context, 705 id ident.ID, 706 timestamp xtime.UnixNano, 707 value float64, 708 unit xtime.Unit, 709 annotation []byte, 710 ) (SeriesWrite, error) { 711 callStart := n.nowFn() 712 713 if n.ReadOnly() { 714 n.metrics.write.ReportError(n.nowFn().Sub(callStart)) 715 return SeriesWrite{}, errNamespaceReadOnly 716 } 717 718 shard, nsCtx, err := n.shardFor(id) 719 if err != nil { 720 n.metrics.write.ReportError(n.nowFn().Sub(callStart)) 721 return SeriesWrite{}, err 722 } 723 724 opts := series.WriteOptions{ 725 TruncateType: n.opts.TruncateType(), 726 SchemaDesc: nsCtx.Schema, 727 } 728 seriesWrite, err := shard.Write(ctx, id, timestamp, 729 value, unit, annotation, opts) 730 if err == nil && len(annotation) == 0 { 731 n.metrics.writesWithoutAnnotation.Inc(1) 732 } 733 n.metrics.write.ReportSuccessOrError(err, n.nowFn().Sub(callStart)) 734 return seriesWrite, err 735 } 736 737 func (n *dbNamespace) WriteTagged( 738 ctx context.Context, 739 id ident.ID, 740 tagResolver convert.TagMetadataResolver, 741 timestamp xtime.UnixNano, 742 value float64, 743 unit xtime.Unit, 744 annotation []byte, 745 ) (SeriesWrite, error) { 746 callStart := n.nowFn() 747 748 if n.ReadOnly() { 749 n.metrics.writeTagged.ReportError(n.nowFn().Sub(callStart)) 750 return SeriesWrite{}, errNamespaceReadOnly 751 } 752 753 if n.reverseIndex == nil { 754 n.metrics.writeTagged.ReportError(n.nowFn().Sub(callStart)) 755 return SeriesWrite{}, errNamespaceIndexingDisabled 756 } 757 758 shard, nsCtx, err := n.shardFor(id) 759 if err != nil { 760 n.metrics.writeTagged.ReportError(n.nowFn().Sub(callStart)) 761 return SeriesWrite{}, err 762 } 763 764 opts := series.WriteOptions{ 765 TruncateType: n.opts.TruncateType(), 766 SchemaDesc: nsCtx.Schema, 767 } 768 seriesWrite, err := shard.WriteTagged(ctx, id, tagResolver, timestamp, 769 value, unit, annotation, opts) 770 if err == nil && 
len(annotation) == 0 { 771 n.metrics.writesWithoutAnnotation.Inc(1) 772 } 773 n.metrics.writeTagged.ReportSuccessOrError(err, n.nowFn().Sub(callStart)) 774 return seriesWrite, err 775 } 776 777 func (n *dbNamespace) WritePendingIndexInserts( 778 pending []writes.PendingIndexInsert, 779 ) error { 780 if n.reverseIndex == nil { 781 return errNamespaceIndexingDisabled 782 } 783 return n.reverseIndex.WritePending(pending) 784 } 785 786 func (n *dbNamespace) SeriesRefResolver( 787 shardID uint32, 788 id ident.ID, 789 tags ident.TagIterator, 790 ) (bootstrap.SeriesRefResolver, bool, error) { 791 n.RLock() 792 shard, owned, err := n.shardAtWithRLock(shardID) 793 n.RUnlock() 794 if err != nil { 795 return nil, owned, err 796 } 797 resolver, err := shard.SeriesRefResolver(id, tags) 798 return resolver, true, err 799 } 800 801 func (n *dbNamespace) QueryIDs( 802 ctx context.Context, 803 query index.Query, 804 opts index.QueryOptions, 805 ) (index.QueryResult, error) { 806 ctx, sp, sampled := ctx.StartSampledTraceSpan(tracepoint.NSQueryIDs) 807 if sampled { 808 sp.LogFields( 809 opentracinglog.String("query", query.String()), 810 opentracinglog.String("namespace", n.ID().String()), 811 opentracinglog.Int("seriesLimit", opts.SeriesLimit), 812 opentracinglog.Int("docsLimit", opts.DocsLimit), 813 xopentracing.Time("start", opts.StartInclusive.ToTime()), 814 xopentracing.Time("end", opts.EndExclusive.ToTime()), 815 ) 816 } 817 defer sp.Finish() 818 819 callStart := n.nowFn() 820 if n.reverseIndex == nil { 821 n.metrics.queryIDs.ReportError(n.nowFn().Sub(callStart)) 822 err := errNamespaceIndexingDisabled 823 sp.LogFields(opentracinglog.Error(err)) 824 return index.QueryResult{}, err 825 } 826 827 if !n.reverseIndex.Bootstrapped() { 828 // Similar to reading shard data, return not bootstrapped 829 n.metrics.queryIDs.ReportError(n.nowFn().Sub(callStart)) 830 err := errIndexNotBootstrappedToRead 831 sp.LogFields(opentracinglog.Error(err)) 832 return index.QueryResult{}, 833 xerrors.NewRetryableError(err) 834 } 835 836 res, err := n.reverseIndex.Query(ctx, query, opts) 837 if err != nil { 838 sp.LogFields(opentracinglog.Error(err)) 839 } 840 n.metrics.queryIDs.ReportSuccessOrError(err, n.nowFn().Sub(callStart)) 841 return res, err 842 } 843 844 func (n *dbNamespace) AggregateQuery( 845 ctx context.Context, 846 query index.Query, 847 opts index.AggregationOptions, 848 ) (index.AggregateQueryResult, error) { 849 callStart := n.nowFn() 850 if n.reverseIndex == nil { 851 n.metrics.aggregateQuery.ReportError(n.nowFn().Sub(callStart)) 852 return index.AggregateQueryResult{}, errNamespaceIndexingDisabled 853 } 854 855 if !n.reverseIndex.Bootstrapped() { 856 // Similar to reading shard data, return not bootstrapped 857 n.metrics.aggregateQuery.ReportError(n.nowFn().Sub(callStart)) 858 return index.AggregateQueryResult{}, 859 xerrors.NewRetryableError(errIndexNotBootstrappedToRead) 860 } 861 862 res, err := n.reverseIndex.AggregateQuery(ctx, query, opts) 863 n.metrics.aggregateQuery.ReportSuccessOrError(err, n.nowFn().Sub(callStart)) 864 return res, err 865 } 866 867 func (n *dbNamespace) PrepareBootstrap(ctx context.Context) ([]databaseShard, error) { 868 ctx, span, sampled := ctx.StartSampledTraceSpan(tracepoint.NSPrepareBootstrap) 869 defer span.Finish() 870 871 if sampled { 872 span.LogFields(opentracinglog.String("namespace", n.id.String())) 873 } 874 875 var ( 876 wg sync.WaitGroup 877 multiErrLock sync.Mutex 878 multiErr xerrors.MultiError 879 shards = n.OwnedShards() 880 ) 881 for _, shard := range shards { 
882 shard := shard 883 wg.Add(1) 884 go func() { 885 defer wg.Done() 886 887 err := shard.PrepareBootstrap(ctx) 888 if err != nil { 889 multiErrLock.Lock() 890 multiErr = multiErr.Add(err) 891 multiErrLock.Unlock() 892 } 893 }() 894 } 895 896 wg.Wait() 897 898 if err := multiErr.FinalError(); err != nil { 899 return nil, err 900 } 901 902 return shards, nil 903 } 904 905 func (n *dbNamespace) ReadEncoded( 906 ctx context.Context, 907 id ident.ID, 908 start, end xtime.UnixNano, 909 ) (series.BlockReaderIter, error) { 910 callStart := n.nowFn() 911 shard, nsCtx, err := n.readableShardFor(id) 912 if err != nil { 913 n.metrics.read.ReportError(n.nowFn().Sub(callStart)) 914 return nil, err 915 } 916 res, err := shard.ReadEncoded(ctx, id, start, end, nsCtx) 917 n.metrics.read.ReportSuccessOrError(err, n.nowFn().Sub(callStart)) 918 return res, err 919 } 920 921 func (n *dbNamespace) FetchBlocks( 922 ctx context.Context, 923 shardID uint32, 924 id ident.ID, 925 starts []xtime.UnixNano, 926 ) ([]block.FetchBlockResult, error) { 927 callStart := n.nowFn() 928 shard, nsCtx, err := n.ReadableShardAt(shardID) 929 if err != nil { 930 n.metrics.fetchBlocks.ReportError(n.nowFn().Sub(callStart)) 931 return nil, err 932 } 933 934 res, err := shard.FetchBlocks(ctx, id, starts, nsCtx) 935 n.metrics.fetchBlocks.ReportSuccessOrError(err, n.nowFn().Sub(callStart)) 936 return res, err 937 } 938 939 func (n *dbNamespace) FetchBlocksMetadataV2( 940 ctx context.Context, 941 shardID uint32, 942 start, end xtime.UnixNano, 943 limit int64, 944 pageToken PageToken, 945 opts block.FetchBlocksMetadataOptions, 946 ) (block.FetchBlocksMetadataResults, PageToken, error) { 947 callStart := n.nowFn() 948 shard, _, err := n.ReadableShardAt(shardID) 949 if err != nil { 950 n.metrics.fetchBlocksMetadata.ReportError(n.nowFn().Sub(callStart)) 951 return nil, nil, err 952 } 953 954 res, nextPageToken, err := shard.FetchBlocksMetadataV2(ctx, start, end, limit, 955 pageToken, opts) 956 n.metrics.fetchBlocksMetadata.ReportSuccessOrError(err, n.nowFn().Sub(callStart)) 957 return res, nextPageToken, err 958 } 959 960 func (n *dbNamespace) Bootstrap( 961 ctx context.Context, 962 bootstrapResult bootstrap.NamespaceResult, 963 ) error { 964 ctx, span, sampled := ctx.StartSampledTraceSpan(tracepoint.NSBootstrap) 965 defer span.Finish() 966 967 if sampled { 968 span.LogFields(opentracinglog.String("namespace", n.id.String())) 969 } 970 971 callStart := n.nowFn() 972 973 n.Lock() 974 if n.bootstrapState == Bootstrapping { 975 n.Unlock() 976 n.metrics.bootstrap.ReportError(n.nowFn().Sub(callStart)) 977 return errNamespaceIsBootstrapping 978 } 979 n.bootstrapState = Bootstrapping 980 nsCtx := n.nsContextWithRLock() 981 n.Unlock() 982 983 n.metrics.bootstrapStart.Inc(1) 984 985 success := false 986 defer func() { 987 n.Lock() 988 if success { 989 n.bootstrapState = Bootstrapped 990 } else { 991 n.bootstrapState = BootstrapNotStarted 992 } 993 n.Unlock() 994 n.metrics.bootstrapEnd.Inc(1) 995 }() 996 997 if !n.nopts.BootstrapEnabled() { 998 success = true 999 n.metrics.bootstrap.ReportSuccess(n.nowFn().Sub(callStart)) 1000 return nil 1001 } 1002 1003 // Bootstrap shards using at least half the CPUs available 1004 workers := xsync.NewWorkerPool(int(math.Ceil(float64(runtime.GOMAXPROCS(0)) / 2))) 1005 workers.Init() 1006 1007 var ( 1008 bootstrappedShards = bootstrapResult.Shards 1009 multiErr = xerrors.NewMultiError() 1010 mutex sync.Mutex 1011 wg sync.WaitGroup 1012 ) 1013 n.log.Info("bootstrap marking all shards as bootstrapped", 1014 
zap.Stringer("namespace", n.id), 1015 zap.Int("numShards", len(bootstrappedShards))) 1016 for _, shard := range n.OwnedShards() { 1017 // Make sure it was bootstrapped during this bootstrap run. 1018 shardID := shard.ID() 1019 bootstrapped := false 1020 for _, elem := range bootstrappedShards { 1021 if elem == shardID { 1022 bootstrapped = true 1023 break 1024 } 1025 } 1026 if !bootstrapped { 1027 // NB(r): Not bootstrapped in this bootstrap run. 1028 n.log.Debug("skipping already bootstrapped shard", 1029 zap.Uint32("shard", shardID), 1030 zap.Stringer("namespace", n.id)) 1031 continue 1032 } 1033 1034 if shard.IsBootstrapped() { 1035 // No concurrent bootstraps, this is an invariant since 1036 // we only select bootstrapping the shard for a run if it's 1037 // not already bootstrapped. 1038 err := instrument.InvariantErrorf( 1039 "bootstrapper already bootstrapped shard: %d", shardID) 1040 mutex.Lock() 1041 multiErr = multiErr.Add(err) 1042 mutex.Unlock() 1043 continue 1044 } 1045 1046 // Check if there are unfulfilled ranges 1047 if ranges, ok := bootstrapResult.DataResult.Unfulfilled().Get(shardID); ok && !ranges.IsEmpty() { 1048 continue 1049 } else if n.reverseIndex != nil { 1050 if ranges, ok := bootstrapResult.IndexResult.Unfulfilled().Get(shardID); ok && !ranges.IsEmpty() { 1051 continue 1052 } 1053 } 1054 1055 wg.Add(1) 1056 shard := shard 1057 workers.Go(func() { 1058 err := shard.Bootstrap(ctx, nsCtx) 1059 1060 mutex.Lock() 1061 multiErr = multiErr.Add(err) 1062 mutex.Unlock() 1063 1064 wg.Done() 1065 }) 1066 } 1067 wg.Wait() 1068 1069 if n.reverseIndex != nil { 1070 indexResults := bootstrapResult.IndexResult.IndexResults() 1071 n.log.Info("bootstrap index with bootstrapped index segments", 1072 zap.Int("numIndexBlocks", len(indexResults))) 1073 err := n.reverseIndex.Bootstrap(indexResults) 1074 multiErr = multiErr.Add(err) 1075 } 1076 1077 markAnyUnfulfilled := func( 1078 bootstrapType string, 1079 unfulfilled result.ShardTimeRanges, 1080 ) error { 1081 shardsUnfulfilled := int64(unfulfilled.Len()) 1082 n.metrics.unfulfilled.Inc(shardsUnfulfilled) 1083 if shardsUnfulfilled == 0 { 1084 return nil 1085 } 1086 1087 errStr := unfulfilled.SummaryString() 1088 errFmt := "bootstrap completed with unfulfilled ranges" 1089 n.log.Error(errFmt, 1090 zap.Error(errors.New(errStr)), 1091 zap.String("namespace", n.id.String()), 1092 zap.String("bootstrapType", bootstrapType)) 1093 return fmt.Errorf("%s: %s", errFmt, errStr) 1094 } 1095 1096 r := bootstrapResult 1097 if err := markAnyUnfulfilled("data", r.DataResult.Unfulfilled()); err != nil { 1098 multiErr = multiErr.Add(err) 1099 } 1100 if n.reverseIndex != nil { 1101 if err := markAnyUnfulfilled("index", r.IndexResult.Unfulfilled()); err != nil { 1102 multiErr = multiErr.Add(err) 1103 } 1104 } 1105 1106 err := multiErr.FinalError() 1107 n.metrics.bootstrap.ReportSuccessOrError(err, n.nowFn().Sub(callStart)) 1108 1109 // NB(r): "success" is read by the defer above and depending on if true/false 1110 // will set the namespace status as bootstrapped or not. 1111 success = err == nil 1112 return err 1113 } 1114 1115 func (n *dbNamespace) WarmFlush( 1116 blockStart xtime.UnixNano, 1117 flushPersist persist.FlushPreparer, 1118 ) error { 1119 // NB(rartoul): This value can be used for emitting metrics, but should not be used 1120 // for business logic. 
1121 callStart := n.nowFn() 1122 1123 n.RLock() 1124 if n.bootstrapState != Bootstrapped { 1125 n.RUnlock() 1126 n.metrics.flushWarmData.ReportError(n.nowFn().Sub(callStart)) 1127 return errNamespaceNotBootstrapped 1128 } 1129 nsCtx := n.nsContextWithRLock() 1130 n.RUnlock() 1131 1132 if n.ReadOnly() || !n.nopts.FlushEnabled() { 1133 n.metrics.flushWarmData.ReportSuccess(n.nowFn().Sub(callStart)) 1134 return nil 1135 } 1136 1137 // check if blockStart is aligned with the namespace's retention options 1138 bs := n.nopts.RetentionOptions().BlockSize() 1139 if t := blockStart.Truncate(bs); !blockStart.Equal(t) { 1140 return fmt.Errorf("failed to flush at time %v, not aligned to blockSize", blockStart.String()) 1141 } 1142 1143 multiErr := xerrors.NewMultiError() 1144 shards := n.OwnedShards() 1145 for _, shard := range shards { 1146 if !shard.IsBootstrapped() { 1147 n.log. 1148 With(zap.Uint32("shard", shard.ID())). 1149 Debug("skipping warm flush due to shard not bootstrapped yet") 1150 continue 1151 } 1152 1153 flushState, err := shard.FlushState(blockStart) 1154 if err != nil { 1155 return err 1156 } 1157 // skip flushing if the shard has already flushed data for the `blockStart` 1158 if flushState.WarmStatus.DataFlushed == fileOpSuccess { 1159 continue 1160 } 1161 1162 // NB(xichen): we still want to proceed if a shard fails to flush its data. 1163 // Probably want to emit a counter here, but for now just log it. 1164 if err := shard.WarmFlush(blockStart, flushPersist, nsCtx); err != nil { 1165 detailedErr := fmt.Errorf("shard %d failed to flush data: %v", 1166 shard.ID(), err) 1167 multiErr = multiErr.Add(detailedErr) 1168 } 1169 } 1170 1171 res := multiErr.FinalError() 1172 n.metrics.flushWarmData.ReportSuccessOrError(res, n.nowFn().Sub(callStart)) 1173 return res 1174 } 1175 1176 // idAndBlockStart is the composite key for the genny map used to keep track of 1177 // dirty series that need to be ColdFlushed. 1178 type idAndBlockStart struct { 1179 id []byte 1180 blockStart xtime.UnixNano 1181 } 1182 1183 type coldFlushReusableResources struct { 1184 // dirtySeries is a map from a composite key of <series ID, block start> 1185 // to an element in a list in the dirtySeriesToWrite map. This map is used 1186 // to quickly test whether a series is dirty for a particular block start. 1187 // 1188 // The composite key is deliberately made so that this map stays one level 1189 // deep, making it easier to share between shard loops, minimizing the need 1190 // for allocations. 1191 // 1192 // Having a reference to the element in the dirtySeriesToWrite list enables 1193 // efficient removal of the series that have been read and subsequent 1194 // iterating through remaining series to be read. 1195 dirtySeries *dirtySeriesMap 1196 // dirtySeriesToWrite is a map from block start to a list of dirty series 1197 // that have yet to be written to disk. 1198 dirtySeriesToWrite map[xtime.UnixNano]*idList 1199 // idElementPool is a pool of list elements to be used when constructing 1200 // new lists for the dirtySeriesToWrite map. 
1201 idElementPool *idElementPool 1202 fsReader fs.DataFileSetReader 1203 } 1204 1205 func newColdFlushReusableResources(opts Options) coldFlushReusableResources { 1206 fsReader, err := fs.NewReader(opts.BytesPool(), opts.CommitLogOptions().FilesystemOptions()) 1207 if err != nil { 1208 return coldFlushReusableResources{} 1209 } 1210 1211 return coldFlushReusableResources{ 1212 dirtySeries: newDirtySeriesMap(), 1213 dirtySeriesToWrite: make(map[xtime.UnixNano]*idList), 1214 // TODO(juchan): set pool options. 1215 idElementPool: newIDElementPool(nil), 1216 fsReader: fsReader, 1217 } 1218 } 1219 1220 func (r *coldFlushReusableResources) reset() { 1221 for _, seriesList := range r.dirtySeriesToWrite { 1222 if seriesList != nil { 1223 seriesList.Reset() 1224 } 1225 // Don't delete the empty list from the map so that other shards don't 1226 // need to reinitialize the list for these blocks. 1227 } 1228 1229 r.dirtySeries.Reset() 1230 } 1231 1232 func (n *dbNamespace) ColdFlush(flushPersist persist.FlushPreparer) error { 1233 // NB(rartoul): This value can be used for emitting metrics, but should not be used 1234 // for business logic. 1235 callStart := n.nowFn() 1236 1237 n.RLock() 1238 if n.bootstrapState != Bootstrapped { 1239 n.RUnlock() 1240 n.metrics.flushColdData.ReportError(n.nowFn().Sub(callStart)) 1241 return errNamespaceNotBootstrapped 1242 } 1243 nsCtx := n.nsContextWithRLock() 1244 repairsAny := n.repairsAny 1245 n.RUnlock() 1246 1247 // If repair has run we still need cold flush regardless of whether cold writes is 1248 // enabled since repairs are dependent on the cold flushing logic. 1249 enabled := n.nopts.ColdWritesEnabled() || repairsAny 1250 if n.ReadOnly() || !enabled { 1251 n.metrics.flushColdData.ReportSuccess(n.nowFn().Sub(callStart)) 1252 return nil 1253 } 1254 1255 shards := n.OwnedShards() 1256 resources := newColdFlushReusableResources(n.opts) 1257 1258 // NB(bodu): The in-mem index will lag behind the TSDB in terms of new series writes. For a period of 1259 // time between when we rotate out the active cold mutable index segments (happens here) and when 1260 // we actually cold flush the data to disk we will be making writes to the newly active mutable seg. 1261 // This means that some series can live doubly in-mem and loaded from disk until the next cold flush 1262 // where they will be evicted from the in-mem index. 1263 var ( 1264 onColdFlushDone OnColdFlushDone 1265 err error 1266 ) 1267 if n.reverseIndex != nil { 1268 onColdFlushDone, err = n.reverseIndex.ColdFlush(shards) 1269 if err != nil { 1270 n.metrics.flushColdData.ReportError(n.nowFn().Sub(callStart)) 1271 return err 1272 } 1273 } 1274 1275 cfOpts := NewColdFlushNsOpts(true) 1276 onColdFlushNs, err := n.opts.OnColdFlush().ColdFlushNamespace(n, cfOpts) 1277 if err != nil { 1278 n.metrics.flushColdData.ReportError(n.nowFn().Sub(callStart)) 1279 return err 1280 } 1281 1282 // NB(bodu): Deferred shard cold flushes so that we can ensure that cold flush index data is 1283 // persisted before persisting TSDB data to ensure crash consistency. 1284 multiErr := xerrors.NewMultiError() 1285 shardColdFlushes := make([]ShardColdFlush, 0, len(shards)) 1286 for _, shard := range shards { 1287 if !shard.IsBootstrapped() { 1288 n.log. 1289 With(zap.Uint32("shard", shard.ID())). 
1290 Debug("skipping cold flush due to shard not bootstrapped yet") 1291 continue 1292 } 1293 shardColdFlush, err := shard.ColdFlush(flushPersist, resources, nsCtx, onColdFlushNs) 1294 if err != nil { 1295 detailedErr := fmt.Errorf("shard %d failed to compact: %v", shard.ID(), err) 1296 multiErr = multiErr.Add(detailedErr) 1297 continue 1298 } 1299 shardColdFlushes = append(shardColdFlushes, shardColdFlush) 1300 } 1301 1302 // We go through this error checking process to allow for partially successful flushes. 1303 indexColdFlushError := onColdFlushNs.Done() 1304 if indexColdFlushError == nil && onColdFlushDone != nil { 1305 // Only evict rotated cold mutable index segments if the index cold flush was successful 1306 // or we will lose queryability of data that's still in mem. 1307 indexColdFlushError = onColdFlushDone() 1308 } 1309 if indexColdFlushError == nil { 1310 // NB(bodu): We only want to complete data cold flushes if the index cold flush 1311 // is successful. If index cold flush is successful, we want to attempt writing 1312 // of checkpoint files to complete the cold data flush lifecycle for successful shards. 1313 for _, shardColdFlush := range shardColdFlushes { 1314 multiErr = multiErr.Add(shardColdFlush.Done()) 1315 } 1316 } 1317 multiErr = multiErr.Add(indexColdFlushError) 1318 1319 res := multiErr.FinalError() 1320 n.metrics.flushColdData.ReportSuccessOrError(res, n.nowFn().Sub(callStart)) 1321 return res 1322 } 1323 1324 func (n *dbNamespace) FlushIndex(flush persist.IndexFlush) error { 1325 callStart := n.nowFn() 1326 n.RLock() 1327 if n.bootstrapState != Bootstrapped { 1328 n.RUnlock() 1329 n.metrics.flushIndex.ReportError(n.nowFn().Sub(callStart)) 1330 return errNamespaceNotBootstrapped 1331 } 1332 n.RUnlock() 1333 1334 if n.ReadOnly() || !n.nopts.FlushEnabled() || !n.nopts.IndexOptions().Enabled() { 1335 n.metrics.flushIndex.ReportSuccess(n.nowFn().Sub(callStart)) 1336 return nil 1337 } 1338 1339 shards := n.OwnedShards() 1340 err := n.reverseIndex.WarmFlush(flush, shards) 1341 n.metrics.flushIndex.ReportSuccessOrError(err, n.nowFn().Sub(callStart)) 1342 return err 1343 } 1344 1345 func (n *dbNamespace) Snapshot( 1346 blockStarts []xtime.UnixNano, 1347 snapshotTime xtime.UnixNano, 1348 snapshotPersist persist.SnapshotPreparer, 1349 ) error { 1350 // NB(rartoul): This value can be used for emitting metrics, but should not be used 1351 // for business logic. 1352 callStart := n.nowFn() 1353 1354 var nsCtx namespace.Context 1355 n.RLock() 1356 if n.bootstrapState != Bootstrapped { 1357 n.RUnlock() 1358 n.metrics.snapshot.ReportError(n.nowFn().Sub(callStart)) 1359 return errNamespaceNotBootstrapped 1360 } 1361 nsCtx = n.nsContextWithRLock() 1362 n.RUnlock() 1363 1364 if !n.nopts.SnapshotEnabled() { 1365 // Note that we keep the ability to disable snapshots at the namespace level around for 1366 // debugging / performance / flexibility reasons, but disabling it can / will cause data 1367 // loss due to the commitlog cleanup logic assuming that a valid snapshot checkpoint file 1368 // means that all namespaces were successfully snapshotted. 
1369 n.metrics.snapshot.ReportSuccess(n.nowFn().Sub(callStart)) 1370 return nil 1371 } 1372 1373 var ( 1374 seriesPersist int 1375 multiErr xerrors.MultiError 1376 ) 1377 1378 for _, shard := range n.OwnedShards() { 1379 log := n.log.With(zap.Uint32("shard", shard.ID())) 1380 if !shard.IsBootstrapped() { 1381 log.Debug("skipping snapshot due to shard not bootstrapped yet") 1382 continue 1383 } 1384 snapshotBlockStarts := shard.FilterBlocksNeedSnapshot(blockStarts) 1385 if len(snapshotBlockStarts) == 0 { 1386 log.Debug("skipping shard snapshot since no blocks need it") 1387 continue 1388 } 1389 for _, blockStart := range snapshotBlockStarts { 1390 snapshotResult, err := shard.Snapshot(blockStart, snapshotTime, snapshotPersist, nsCtx) 1391 if err != nil { 1392 detailedErr := fmt.Errorf("shard %d failed to snapshot %v block: %w", shard.ID(), blockStart, err) 1393 multiErr = multiErr.Add(detailedErr) 1394 continue 1395 } 1396 seriesPersist += snapshotResult.SeriesPersist 1397 } 1398 } 1399 1400 n.metrics.snapshotSeriesPersist.Inc(int64(seriesPersist)) 1401 1402 res := multiErr.FinalError() 1403 n.metrics.snapshot.ReportSuccessOrError(res, n.nowFn().Sub(callStart)) 1404 return res 1405 } 1406 1407 func (n *dbNamespace) NeedsFlush( 1408 alignedInclusiveStart xtime.UnixNano, 1409 alignedInclusiveEnd xtime.UnixNano, 1410 ) (bool, error) { 1411 // NB(r): Essentially if all are success, we don't need to flush, if any 1412 // are failed with the minimum num failures less than max retries then 1413 // we need to flush - otherwise if any in progress we can't flush and if 1414 // any not started then we need to flush. 1415 n.RLock() 1416 defer n.RUnlock() 1417 return n.needsFlushWithLock(alignedInclusiveStart, alignedInclusiveEnd) 1418 } 1419 1420 func (n *dbNamespace) needsFlushWithLock( 1421 alignedInclusiveStart xtime.UnixNano, 1422 alignedInclusiveEnd xtime.UnixNano, 1423 ) (bool, error) { 1424 var ( 1425 blockSize = n.nopts.RetentionOptions().BlockSize() 1426 blockStarts = timesInRange(alignedInclusiveStart, alignedInclusiveEnd, blockSize) 1427 ) 1428 1429 // NB(prateek): we do not check if any other flush is in progress in this method, 1430 // instead relying on the databaseFlushManager to ensure atomicity of flushes. 1431 1432 // Check for not started or failed that might need a flush 1433 for _, shard := range n.shards { 1434 if shard == nil { 1435 continue 1436 } 1437 for _, blockStart := range blockStarts { 1438 flushState, err := shard.FlushState(blockStart) 1439 if err != nil { 1440 return false, err 1441 } 1442 if flushState.WarmStatus.DataFlushed != fileOpSuccess { 1443 return true, nil 1444 } 1445 } 1446 } 1447 1448 // All success or failed and reached max retries 1449 return false, nil 1450 } 1451 1452 func (n *dbNamespace) Truncate() (int64, error) { 1453 var totalNumSeries int64 1454 1455 n.RLock() 1456 shards := n.shardSet.AllIDs() 1457 for _, shard := range shards { 1458 totalNumSeries += n.shards[shard].NumSeries() 1459 } 1460 n.RUnlock() 1461 1462 // For now we are simply dropping all the objects (e.g., shards, series, blocks etc) owned by the 1463 // namespace, which means the memory will be reclaimed the next time GC kicks in and returns the 1464 // reclaimed memory to the OS. In the future, we might investigate whether it's worth returning 1465 // the pooled objects to the pools if the pool is low and needs replenishing. 
1466 n.assignShardSet(n.shardSet, assignShardSetOptions{ 1467 needsBootstrap: false, 1468 initialAssignment: true, 1469 }) 1470 1471 // NB(xichen): possibly also clean up disk files and force a GC here to reclaim memory immediately 1472 return totalNumSeries, nil 1473 } 1474 1475 func (n *dbNamespace) Repair( 1476 repairer databaseShardRepairer, 1477 tr xtime.Range, 1478 opts NamespaceRepairOptions, 1479 ) error { 1480 shouldRun := opts.Force || n.nopts.RepairEnabled() 1481 if !shouldRun { 1482 return nil 1483 } 1484 1485 n.RLock() 1486 repairsAny := n.repairsAny 1487 n.RUnlock() 1488 if !repairsAny { 1489 // Only acquire write lock if required. 1490 n.Lock() 1491 n.repairsAny = true 1492 n.Unlock() 1493 } 1494 1495 var ( 1496 wg sync.WaitGroup 1497 mutex sync.Mutex 1498 numShardsRepaired int 1499 numTotalSeries int64 1500 numTotalBlocks int64 1501 numSizeDiffSeries int64 1502 numSizeDiffBlocks int64 1503 numChecksumDiffSeries int64 1504 numChecksumDiffBlocks int64 1505 peerMetadataComparisons repair.PeerMetadataComparisonResults 1506 throttlePerShard time.Duration 1507 ) 1508 1509 multiErr := xerrors.NewMultiError() 1510 shards := n.OwnedShards() 1511 numShards := len(shards) 1512 if numShards > 0 { 1513 throttlePerShard = time.Duration( 1514 int64(repairer.Options().RepairThrottle()) / int64(numShards)) 1515 } 1516 1517 workers := xsync.NewWorkerPool(repairer.Options().RepairShardConcurrency()) 1518 workers.Init() 1519 1520 n.RLock() 1521 nsCtx := n.nsContextWithRLock() 1522 nsMeta := n.metadata 1523 n.RUnlock() 1524 1525 for _, shard := range shards { 1526 shard := shard 1527 1528 wg.Add(1) 1529 workers.Go(func() { 1530 defer wg.Done() 1531 1532 ctx := n.opts.ContextPool().Get() 1533 defer ctx.Close() 1534 1535 metadataRes, err := shard.Repair(ctx, nsCtx, nsMeta, tr, repairer) 1536 1537 mutex.Lock() 1538 if err != nil { 1539 multiErr = multiErr.Add(err) 1540 } else { 1541 numShardsRepaired++ 1542 numTotalSeries += metadataRes.NumSeries 1543 numTotalBlocks += metadataRes.NumBlocks 1544 numSizeDiffSeries += metadataRes.SizeDifferences.NumSeries() 1545 numSizeDiffBlocks += metadataRes.SizeDifferences.NumBlocks() 1546 numChecksumDiffSeries += metadataRes.ChecksumDifferences.NumSeries() 1547 numChecksumDiffBlocks += metadataRes.ChecksumDifferences.NumBlocks() 1548 peerMetadataComparisons = append(peerMetadataComparisons, metadataRes.PeerMetadataComparisonResults...) 
1549 } 1550 mutex.Unlock() 1551 1552 if throttlePerShard > 0 { 1553 time.Sleep(throttlePerShard) 1554 } 1555 }) 1556 } 1557 1558 wg.Wait() 1559 1560 aggregatePeerComparison := peerMetadataComparisons.Aggregate() 1561 n.metrics.repairDifferingPercent.Update(aggregatePeerComparison.ComparedDifferingPercent) 1562 n.metrics.repairComparedBlocks.Inc(aggregatePeerComparison.ComparedBlocks) 1563 n.metrics.repairDifferingBlocks.Inc(aggregatePeerComparison.ComparedDifferingBlocks) 1564 n.metrics.repairMismatchBlocks.Inc(aggregatePeerComparison.ComparedMismatchBlocks) 1565 n.metrics.repairMissingBlocks.Inc(aggregatePeerComparison.ComparedMissingBlocks) 1566 n.metrics.repairExtraBlocks.Inc(aggregatePeerComparison.ComparedExtraBlocks) 1567 1568 n.log.Info("repair result", 1569 zap.String("repairTimeRange", tr.String()), 1570 zap.Int("numTotalShards", len(shards)), 1571 zap.Int("numShardsRepaired", numShardsRepaired), 1572 zap.Int64("numTotalSeries", numTotalSeries), 1573 zap.Int64("numTotalBlocks", numTotalBlocks), 1574 zap.Int64("numSizeDiffSeries", numSizeDiffSeries), 1575 zap.Int64("numSizeDiffBlocks", numSizeDiffBlocks), 1576 zap.Int64("numChecksumDiffSeries", numChecksumDiffSeries), 1577 zap.Int64("numChecksumDiffBlocks", numChecksumDiffBlocks), 1578 zap.Float64("peerComparisonComparedDifferingPercent", aggregatePeerComparison.ComparedDifferingPercent), 1579 zap.Int64("peerComparisonComparedBlocks", aggregatePeerComparison.ComparedBlocks), 1580 zap.Int64("peerComparisonComparedDifferingBlocks", aggregatePeerComparison.ComparedDifferingBlocks), 1581 zap.Int64("peerComparisonComparedMismatchBlocks", aggregatePeerComparison.ComparedMismatchBlocks), 1582 zap.Int64("peerComparisonComparedMissingBlocks", aggregatePeerComparison.ComparedMissingBlocks), 1583 zap.Int64("peerComparisonComparedExtraBlocks", aggregatePeerComparison.ComparedExtraBlocks), 1584 ) 1585 1586 return multiErr.FinalError() 1587 } 1588 1589 func (n *dbNamespace) OwnedShards() []databaseShard { 1590 n.RLock() 1591 shards := n.shardSet.AllIDs() 1592 databaseShards := make([]databaseShard, len(shards)) 1593 for i, shard := range shards { 1594 databaseShards[i] = n.shards[shard] 1595 } 1596 n.RUnlock() 1597 return databaseShards 1598 } 1599 1600 func (n *dbNamespace) SetIndex(reverseIndex NamespaceIndex) error { 1601 n.Lock() 1602 defer n.Unlock() 1603 1604 if !n.metadata.Options().IndexOptions().Enabled() { 1605 return errNamespaceIndexingDisabled 1606 } 1607 n.reverseIndex = reverseIndex 1608 1609 return nil 1610 } 1611 1612 func (n *dbNamespace) Index() (NamespaceIndex, error) { 1613 n.RLock() 1614 defer n.RUnlock() 1615 if !n.metadata.Options().IndexOptions().Enabled() { 1616 return nil, errNamespaceIndexingDisabled 1617 } 1618 return n.reverseIndex, nil 1619 } 1620 1621 func (n *dbNamespace) ReadOnly() bool { 1622 return n.readOnly 1623 } 1624 1625 func (n *dbNamespace) SetReadOnly(value bool) { 1626 n.readOnly = value 1627 } 1628 1629 func (n *dbNamespace) shardFor(id ident.ID) (databaseShard, namespace.Context, error) { 1630 n.RLock() 1631 nsCtx := n.nsContextWithRLock() 1632 shardID := n.shardSet.Lookup(id) 1633 shard, _, err := n.shardAtWithRLock(shardID) 1634 n.RUnlock() 1635 return shard, nsCtx, err 1636 } 1637 1638 func (n *dbNamespace) readableShardFor(id ident.ID) (databaseShard, namespace.Context, error) { 1639 n.RLock() 1640 nsCtx := n.nsContextWithRLock() 1641 shardID := n.shardSet.Lookup(id) 1642 shard, err := n.readableShardAtWithRLock(shardID) 1643 n.RUnlock() 1644 return shard, nsCtx, err 1645 } 1646 1647 func (n 
*dbNamespace) ReadableShardAt(shardID uint32) (databaseShard, namespace.Context, error) { 1648 n.RLock() 1649 nsCtx := n.nsContextWithRLock() 1650 shard, err := n.readableShardAtWithRLock(shardID) 1651 n.RUnlock() 1652 return shard, nsCtx, err 1653 } 1654 1655 func (n *dbNamespace) shardAtWithRLock(shardID uint32) (databaseShard, bool, error) { 1656 // NB(r): These errors are retryable as they will occur 1657 // during a topology change and must be retried by the client. 1658 if int(shardID) >= len(n.shards) { 1659 return nil, false, xerrors.NewRetryableError( 1660 fmt.Errorf("not responsible for shard %d", shardID)) 1661 } 1662 shard := n.shards[shardID] 1663 if shard == nil { 1664 return nil, false, xerrors.NewRetryableError( 1665 fmt.Errorf("not responsible for shard %d", shardID)) 1666 } 1667 return shard, true, nil 1668 } 1669 1670 func (n *dbNamespace) readableShardAtWithRLock(shardID uint32) (databaseShard, error) { 1671 shard, _, err := n.shardAtWithRLock(shardID) 1672 if err != nil { 1673 return nil, err 1674 } 1675 if !shard.IsBootstrapped() { 1676 return nil, xerrors.NewRetryableError(errShardNotBootstrappedToRead) 1677 } 1678 return shard, nil 1679 } 1680 1681 func (n *dbNamespace) Close() error { 1682 n.Lock() 1683 if n.closed { 1684 n.Unlock() 1685 return errNamespaceAlreadyClosed 1686 } 1687 n.closed = true 1688 shards := n.shards 1689 n.shards = shards[:0] 1690 n.shardSet = sharding.NewEmptyShardSet(sharding.DefaultHashFn(1)) 1691 n.Unlock() 1692 n.namespaceReaderMgr.close() 1693 n.closeShards(shards, true) 1694 if retriever := n.blockRetriever; retriever != nil { 1695 if err := retriever.Close(); err != nil { 1696 n.log.Error("error when closing blockRetriever", 1697 zap.Error(err), zap.Stringer("namespace", n.id)) 1698 } 1699 } 1700 close(n.shutdownCh) 1701 if n.schemaListener != nil { 1702 n.schemaListener.Close() 1703 } 1704 if n.reverseIndex != nil { 1705 return n.reverseIndex.Close() 1706 } 1707 return nil 1708 } 1709 1710 func (n *dbNamespace) BootstrapState() BootstrapState { 1711 n.RLock() 1712 defer n.RUnlock() 1713 return n.bootstrapState 1714 } 1715 1716 func (n *dbNamespace) ShardBootstrapState() ShardBootstrapStates { 1717 n.RLock() 1718 shardStates := make(ShardBootstrapStates, len(n.shards)) 1719 for _, shard := range n.shards { 1720 if shard == nil { 1721 continue 1722 } 1723 shardStates[shard.ID()] = shard.BootstrapState() 1724 } 1725 n.RUnlock() 1726 return shardStates 1727 } 1728 1729 func (n *dbNamespace) FlushState(shardID uint32, blockStart xtime.UnixNano) (fileOpState, error) { 1730 n.RLock() 1731 defer n.RUnlock() 1732 shard, _, err := n.shardAtWithRLock(shardID) 1733 if err != nil { 1734 return fileOpState{}, err 1735 } 1736 flushState, err := shard.FlushState(blockStart) 1737 if err != nil { 1738 return fileOpState{}, err 1739 } 1740 return flushState, nil 1741 } 1742 1743 func (n *dbNamespace) nsContextWithRLock() namespace.Context { 1744 return namespace.Context{ID: n.id, Schema: n.schemaDescr} 1745 } 1746 1747 func (n *dbNamespace) AggregateTiles( 1748 ctx context.Context, 1749 sourceNs databaseNamespace, 1750 opts AggregateTilesOptions, 1751 ) (int64, error) { 1752 var ( 1753 callStart = n.nowFn() 1754 1755 aggregateTilesScope = opts.InsOptions.MetricsScope() 1756 timerOpts = n.opts.InstrumentOptions().TimerOptions() 1757 methodMetrics = instrument.NewMethodMetrics( 1758 aggregateTilesScope, "aggregate-tiles", timerOpts) 1759 ) 1760 1761 processedTileCount, err := n.aggregateTiles(ctx, sourceNs, opts) 1762 1763
methodMetrics.ReportSuccessOrError(err, n.nowFn().Sub(callStart)) 1764 1765 return processedTileCount, err 1766 } 1767 1768 func (n *dbNamespace) aggregateTiles( 1769 ctx context.Context, 1770 sourceNs databaseNamespace, 1771 opts AggregateTilesOptions, 1772 ) (int64, error) { 1773 var ( 1774 startedAt = time.Now() 1775 targetBlockSize = n.Metadata().Options().RetentionOptions().BlockSize() 1776 targetBlockStart = opts.Start.Truncate(targetBlockSize) 1777 sourceBlockSize = sourceNs.Options().RetentionOptions().BlockSize() 1778 lastSourceBlockEnd = opts.End.Truncate(sourceBlockSize) 1779 1780 scope = opts.InsOptions.MetricsScope() 1781 processedShards = scope.Counter("processed-shards") 1782 ) 1783 1784 if targetBlockStart.Add(targetBlockSize).Before(lastSourceBlockEnd) { 1785 return 0, fmt.Errorf("tile aggregation must be done within a single target block (start=%s, end=%s, blockSize=%s)", 1786 opts.Start, opts.End, targetBlockSize.String()) 1787 } 1788 1789 if n.BootstrapState() != Bootstrapped || sourceNs.BootstrapState() != Bootstrapped { 1790 return 0, errNamespaceNotBootstrapped 1791 } 1792 1793 if len(n.OwnedShards()) == 0 { 1794 // This is possible during a new node join. 1795 n.log.Info("no shards owned, skip AggregateTiles") 1796 return 0, nil 1797 } 1798 1799 // Create an empty warm index fileset because the block 1800 // will not be queryable if it has no warm index. 1801 if err := n.createEmptyWarmIndexIfNotExistsFn(targetBlockStart); err != nil { 1802 return 0, err 1803 } 1804 1805 // Cold flusher builds the reverse index for target (current) ns. 1806 cfOpts := NewColdFlushNsOpts(false) 1807 onColdFlushNs, err := n.opts.OnColdFlush().ColdFlushNamespace(n, cfOpts) 1808 if err != nil { 1809 return 0, err 1810 } 1811 1812 var ( 1813 processedTileCount int64 1814 aggregationSuccess bool 1815 ) 1816 1817 defer func() { 1818 if aggregationSuccess { 1819 return 1820 } 1821 // Abort building reverse index if aggregation fails. 1822 if err := onColdFlushNs.Abort(); err != nil { 1823 n.log.Error("error aborting cold flush", 1824 zap.Stringer("sourceNs", sourceNs.ID()), zap.Error(err)) 1825 } 1826 }() 1827 1828 for _, targetShard := range n.OwnedShards() { 1829 if !targetShard.IsBootstrapped() { 1830 n.log.Debug("skipping aggregateTiles due to shard not bootstrapped", 1831 zap.Uint32("shard", targetShard.ID())) 1832 continue 1833 } 1834 1835 shardProcessedTileCount, err := targetShard.AggregateTiles( 1836 ctx, sourceNs, n, targetShard.ID(), onColdFlushNs, opts) 1837 1838 processedTileCount += shardProcessedTileCount 1839 processedShards.Inc(1) 1840 if err != nil { 1841 return 0, fmt.Errorf("shard %d aggregation failed: %v", targetShard.ID(), err) 1842 } 1843 } 1844 1845 // Aggregation success, mark so we don't abort reverse index building (cold flusher). 
1846 aggregationSuccess = true 1847 if err := onColdFlushNs.Done(); err != nil { 1848 return 0, err 1849 } 1850 1851 n.log.Info("finished large tiles aggregation for namespace", 1852 zap.Stringer("sourceNs", sourceNs.ID()), 1853 zap.Stringer("process", opts.Process), 1854 zap.Bool("memorizeMetricTypes", opts.MemorizeMetricTypes), 1855 zap.Bool("backfillMetricTypes", opts.BackfillMetricTypes), 1856 zap.Time("targetBlockStart", targetBlockStart.ToTime()), 1857 zap.Time("lastSourceBlockEnd", lastSourceBlockEnd.ToTime()), 1858 zap.Duration("step", opts.Step), 1859 zap.Int64("processedTiles", processedTileCount), 1860 zap.Duration("took", time.Now().Sub(startedAt))) 1861 1862 return processedTileCount, nil 1863 } 1864 1865 func (n *dbNamespace) DocRef(id ident.ID) (doc.Metadata, bool, error) { 1866 shard, _, err := n.readableShardFor(id) 1867 if err != nil { 1868 return doc.Metadata{}, false, err 1869 } 1870 return shard.DocRef(id) 1871 } 1872 1873 func (n *dbNamespace) createEmptyWarmIndexIfNotExists(blockStart xtime.UnixNano) error { 1874 fsOpts := n.opts.CommitLogOptions().FilesystemOptions() 1875 1876 shardIds := make(map[uint32]struct{}, len(n.OwnedShards())) 1877 for _, shard := range n.OwnedShards() { 1878 if shard.IsBootstrapped() { 1879 shardIds[shard.ID()] = struct{}{} 1880 } 1881 } 1882 1883 fileSetID := fs.FileSetFileIdentifier{ 1884 FileSetContentType: persist.FileSetIndexContentType, 1885 Namespace: n.ID(), 1886 BlockStart: blockStart, 1887 VolumeIndex: 0, 1888 } 1889 1890 warmIndexOpts := fs.IndexWriterOpenOptions{ 1891 BlockSize: n.Metadata().Options().IndexOptions().BlockSize(), 1892 FileSetType: persist.FileSetFlushType, 1893 Identifier: fileSetID, 1894 Shards: shardIds, 1895 IndexVolumeType: idxpersist.DefaultIndexVolumeType, 1896 } 1897 1898 warmIndexWriter, err := fs.NewIndexWriter(fsOpts) 1899 if err != nil { 1900 return err 1901 } 1902 1903 if err = warmIndexWriter.Open(warmIndexOpts); err != nil { 1904 if xerrors.Is(err, iofs.ErrExist) { 1905 n.log.Debug("warm index already exists", 1906 zap.Stringer("namespace", n.id), 1907 zap.Stringer("blockStart", blockStart), 1908 zap.Reflect("shardIds", shardIds)) 1909 return nil 1910 } 1911 return err 1912 } 1913 1914 return warmIndexWriter.Close() 1915 }
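
// Illustrative sketch: commitLogWriterFn (defined near the top of this file) adapts any
// function with the matching signature into a commitLogWriter, exactly as commitLogWriteNoOp
// is built above. The hypothetical newCountingCommitLogWriter below follows the same pattern;
// the helper name and the use of "sync/atomic" are assumptions for illustration only and are
// not part of this package:
//
//	func newCountingCommitLogWriter(counter *int64) commitLogWriter {
//		return commitLogWriterFn(func(
//			ctx context.Context,
//			series ts.Series,
//			datapoint ts.Datapoint,
//			unit xtime.Unit,
//			annotation ts.Annotation,
//		) error {
//			atomic.AddInt64(counter, 1) // count every commit log write observed
//			return nil
//		})
//	}
//
// A writer built this way could, for example, stand in for a real commit log writer when
// constructing a namespace via newDatabaseNamespace in tests where only the number of
// commit log writes matters.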