github.com/m3db/m3@v1.5.1-0.20231129193456-75a402aa583b/src/dbnode/storage/bootstrap/bootstrapper/persist.go (about)

     1  // Copyright (c) 2020 Uber Technologies, Inc.
     2  //
     3  // Permission is hereby granted, free of charge, to any person obtaining a copy
     4  // of this software and associated documentation files (the "Software"), to deal
     5  // in the Software without restriction, including without limitation the rights
     6  // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
     7  // copies of the Software, and to permit persons to whom the Software is
     8  // furnished to do so, subject to the following conditions:
     9  //
    10  // The above copyright notice and this permission notice shall be included in
    11  // all copies or substantial portions of the Software.
    12  //
    13  // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    14  // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    15  // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
    16  // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    17  // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
    18  // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
    19  // THE SOFTWARE.
    20  
    21  package bootstrapper
    22  
    23  import (
    24  	"fmt"
    25  	"sync"
    26  
    27  	"github.com/m3db/m3/src/dbnode/namespace"
    28  	"github.com/m3db/m3/src/dbnode/persist"
    29  	"github.com/m3db/m3/src/dbnode/persist/fs"
    30  	"github.com/m3db/m3/src/dbnode/retention"
    31  	"github.com/m3db/m3/src/dbnode/storage/bootstrap/result"
    32  	"github.com/m3db/m3/src/dbnode/storage/index/compaction"
    33  	"github.com/m3db/m3/src/m3ninx/index/segment"
    34  	idxpersist "github.com/m3db/m3/src/m3ninx/persist"
    35  	"github.com/m3db/m3/src/x/mmap"
    36  	xtime "github.com/m3db/m3/src/x/time"
    37  )
    38  
const (
	// mmapBootstrapIndexName is the mmap.Context name handed to the compactor
	// (together with the mmap reporter) when BuildBootstrapIndexSegment builds
	// a bootstrapped index segment in memory.
	mmapBootstrapIndexName = "mmap.bootstrap.index"
)
    42  
// SharedPersistManager is a lockable persist manager that's safe to be shared
// across goroutines, provided that callers hold the embedded mutex for as long
// as they are using Mgr.
//
// Because it embeds a sync.Mutex it must not be copied after first use; pass
// it around by pointer.
type SharedPersistManager struct {
	// Mutex is embedded so that callers can Lock/Unlock the wrapper directly
	// around any use of Mgr.
	sync.Mutex
	// Mgr is the wrapped persist manager. persistBootstrapIndexSegment takes
	// the lock before touching it because the manager uses resources that are
	// not safe for concurrent use.
	Mgr persist.Manager
}
    48  
// SharedCompactor is a lockable compactor that's safe to be shared across
// goroutines, provided that callers hold the embedded mutex for as long as
// they are using Compactor.
//
// Because it embeds a sync.Mutex it must not be copied after first use; pass
// it around by pointer.
type SharedCompactor struct {
	// Mutex is embedded so that callers can Lock/Unlock the wrapper directly
	// around any use of Compactor.
	sync.Mutex
	// Compactor is the wrapped index compactor. BuildBootstrapIndexSegment
	// holds the lock for the full duration of a compaction.
	Compactor *compaction.Compactor
}
    54  
    55  // PersistBootstrapIndexSegment is a helper function that persists bootstrapped index segments for a ns -> block of time.
    56  func PersistBootstrapIndexSegment(
    57  	ns namespace.Metadata,
    58  	requestedRanges result.ShardTimeRanges,
    59  	builder segment.DocumentsBuilder,
    60  	persistManager *SharedPersistManager,
    61  	indexClaimsManager fs.IndexClaimsManager,
    62  	resultOpts result.Options,
    63  	fulfilled result.ShardTimeRanges,
    64  	blockStart xtime.UnixNano,
    65  	blockEnd xtime.UnixNano,
    66  ) (result.IndexBlock, error) {
    67  	// No-op if there are no documents that need to be written for this time block (nothing to persist).
    68  	if len(builder.Docs()) == 0 {
    69  		return result.IndexBlock{}, nil
    70  	}
    71  
    72  	// If we're performing an index run with persistence enabled
    73  	// determine if we covered a full block exactly (which should
    74  	// occur since we always group readers by block size).
    75  	_, max := requestedRanges.MinMax()
    76  	expectedRangeStart, expectedRangeEnd := blockStart, blockEnd
    77  
    78  	// Index blocks can be arbitrarily larger than data blocks, but the
    79  	// retention of the namespace is based on the size of the data blocks,
    80  	// not the index blocks. As a result, it's possible that the block start
    81  	// for the earliest index block is before the earliest possible retention
    82  	// time.
    83  	// If that is the case, then we snap the expected range start to the
    84  	// earliest retention block start because that is the point in time for
    85  	// which we'll actually have data available to construct a segment from.
    86  	//
    87  	// Example:
    88  	//  Index block size: 4 hours
    89  	//  Data block size: 2 hours
    90  	//  Retention: 6 hours
    91  	//           [12PM->2PM][2PM->4PM][4PM->6PM] (Data Blocks)
    92  	// [10AM     ->     2PM][2PM     ->     6PM] (Index Blocks)
    93  	retentionOpts := ns.Options().RetentionOptions()
    94  	nowFn := resultOpts.ClockOptions().NowFn()
    95  	now := xtime.ToUnixNano(nowFn())
    96  	earliestRetentionTime := retention.FlushTimeStart(retentionOpts, now)
    97  
    98  	// If bootstrapping is taking more time than our retention period, we might end up in a situation
    99  	// when earliestRetentionTime is larger than out block end time. This means that the blocks
   100  	// got outdated during bootstrap so we just skip building index segments for them.
   101  	if !blockEnd.After(earliestRetentionTime) {
   102  		return result.IndexBlock{}, fs.ErrIndexOutOfRetention
   103  	}
   104  
   105  	if blockStart.Before(earliestRetentionTime) {
   106  		expectedRangeStart = earliestRetentionTime
   107  	}
   108  
   109  	shards := make(map[uint32]struct{})
   110  	expectedRanges := result.NewShardTimeRangesFromSize(requestedRanges.Len())
   111  	for shard := range requestedRanges.Iter() {
   112  		shards[shard] = struct{}{}
   113  		expectedRanges.Set(shard, xtime.NewRanges(xtime.Range{
   114  			Start: expectedRangeStart,
   115  			End:   expectedRangeEnd,
   116  		}))
   117  	}
   118  
   119  	return persistBootstrapIndexSegment(
   120  		ns,
   121  		shards,
   122  		builder,
   123  		persistManager,
   124  		indexClaimsManager,
   125  		requestedRanges,
   126  		expectedRanges,
   127  		fulfilled,
   128  		blockStart,
   129  		max,
   130  	)
   131  }
   132  
   133  func persistBootstrapIndexSegment(
   134  	ns namespace.Metadata,
   135  	shards map[uint32]struct{},
   136  	builder segment.DocumentsBuilder,
   137  	persistManager *SharedPersistManager,
   138  	indexClaimsManager fs.IndexClaimsManager,
   139  	requestedRanges result.ShardTimeRanges,
   140  	expectedRanges result.ShardTimeRanges,
   141  	fulfilled result.ShardTimeRanges,
   142  	blockStart xtime.UnixNano,
   143  	max xtime.UnixNano,
   144  ) (result.IndexBlock, error) {
   145  	// Check that we completely fulfilled all shards for the block
   146  	// and we didn't bootstrap any more/less than expected.
   147  	requireFulfilled := expectedRanges.Copy()
   148  	requireFulfilled.Subtract(fulfilled)
   149  	exactStartEnd := max.Equal(blockStart.Add(ns.Options().IndexOptions().BlockSize()))
   150  	if !exactStartEnd || !requireFulfilled.IsEmpty() {
   151  		return result.IndexBlock{}, fmt.Errorf("persistent fs index bootstrap invalid ranges to persist: "+
   152  			"expected=%v, actual=%v, fulfilled=%v, exactStartEnd=%v, requireFulfilledEmpty=%v",
   153  			expectedRanges.String(), requestedRanges.String(), fulfilled.String(),
   154  			exactStartEnd, requireFulfilled.IsEmpty())
   155  	}
   156  
   157  	// NB(r): Need to get an exclusive lock to actually write the segment out
   158  	// due to needing to incrementing the index file set volume index and also
   159  	// using non-thread safe resources on the persist manager.
   160  	persistManager.Lock()
   161  	defer persistManager.Unlock()
   162  
   163  	flush, err := persistManager.Mgr.StartIndexPersist()
   164  	if err != nil {
   165  		return result.IndexBlock{}, err
   166  	}
   167  
   168  	var calledDone bool
   169  	defer func() {
   170  		if !calledDone {
   171  			flush.DoneIndex()
   172  		}
   173  	}()
   174  
   175  	volumeIndex, err := indexClaimsManager.ClaimNextIndexFileSetVolumeIndex(
   176  		ns,
   177  		blockStart,
   178  	)
   179  	if err != nil {
   180  		return result.IndexBlock{}, fmt.Errorf("failed to claim next index volume index: %w", err)
   181  	}
   182  
   183  	preparedPersist, err := flush.PrepareIndex(persist.IndexPrepareOptions{
   184  		NamespaceMetadata: ns,
   185  		BlockStart:        blockStart,
   186  		FileSetType:       persist.FileSetFlushType,
   187  		Shards:            shards,
   188  		// NB(bodu): Assume default volume type when persisted bootstrapped index data.
   189  		IndexVolumeType: idxpersist.DefaultIndexVolumeType,
   190  		VolumeIndex:     volumeIndex,
   191  	})
   192  	if err != nil {
   193  		return result.IndexBlock{}, err
   194  	}
   195  
   196  	var calledClose bool
   197  	defer func() {
   198  		if !calledClose {
   199  			preparedPersist.Close()
   200  		}
   201  	}()
   202  
   203  	if err := preparedPersist.Persist(builder); err != nil {
   204  		return result.IndexBlock{}, err
   205  	}
   206  
   207  	calledClose = true
   208  	persistedSegments, err := preparedPersist.Close()
   209  	if err != nil {
   210  		return result.IndexBlock{}, err
   211  	}
   212  
   213  	calledDone = true
   214  	if err := flush.DoneIndex(); err != nil {
   215  		return result.IndexBlock{}, err
   216  	}
   217  	segments := make([]result.Segment, 0, len(persistedSegments))
   218  	for _, pSeg := range persistedSegments {
   219  		segments = append(segments, result.NewSegment(pSeg, true))
   220  	}
   221  
   222  	return result.NewIndexBlock(segments, expectedRanges), nil
   223  }
   224  
   225  // BuildBootstrapIndexSegment is a helper function that builds (in memory) bootstrapped index segments for a ns -> block of time.
   226  func BuildBootstrapIndexSegment(
   227  	ns namespace.Metadata,
   228  	requestedRanges result.ShardTimeRanges,
   229  	builder segment.DocumentsBuilder,
   230  	compactor *SharedCompactor,
   231  	resultOpts result.Options,
   232  	mmapReporter mmap.Reporter,
   233  	blockStart xtime.UnixNano,
   234  	blockEnd xtime.UnixNano,
   235  ) (result.IndexBlock, error) {
   236  	// No-op if there are no documents that need to be written for this time block (nothing to persist).
   237  	if len(builder.Docs()) == 0 {
   238  		return result.IndexBlock{}, nil
   239  	}
   240  
   241  	// If we're performing an index run with persistence enabled
   242  	// determine if we covered a full block exactly (which should
   243  	// occur since we always group readers by block size).
   244  	expectedRangeStart, expectedRangeEnd := blockStart, blockEnd
   245  
   246  	// Index blocks can be arbitrarily larger than data blocks, but the
   247  	// retention of the namespace is based on the size of the data blocks,
   248  	// not the index blocks. As a result, it's possible that the block start
   249  	// for the earliest index block is before the earliest possible retention
   250  	// time.
   251  	// If that is the case, then we snap the expected range start to the
   252  	// earliest retention block start because that is the point in time for
   253  	// which we'll actually have data available to construct a segment from.
   254  	//
   255  	// Example:
   256  	//  Index block size: 4 hours
   257  	//  Data block size: 2 hours
   258  	//  Retention: 6 hours
   259  	//           [12PM->2PM)[2PM->4PM)[4PM->6PM) (Data Blocks)
   260  	// [10AM     ->     2PM)[2PM     ->     6PM) (Index Blocks)
   261  	retentionOpts := ns.Options().RetentionOptions()
   262  	nowFn := resultOpts.ClockOptions().NowFn()
   263  	now := xtime.ToUnixNano(nowFn())
   264  	earliestRetentionTime := retention.FlushTimeStart(retentionOpts, now)
   265  
   266  	// If bootstrapping is taking more time than our retention period, we might end up in a situation
   267  	// when earliestRetentionTime is larger than out block end time. This means that the blocks
   268  	// got outdated during bootstrap so we just skip building index segments for them.
   269  	if !blockEnd.After(earliestRetentionTime) {
   270  		return result.IndexBlock{}, fs.ErrIndexOutOfRetention
   271  	}
   272  
   273  	if blockStart.Before(earliestRetentionTime) {
   274  		expectedRangeStart = earliestRetentionTime
   275  	}
   276  
   277  	expectedRanges := result.NewShardTimeRangesFromSize(requestedRanges.Len())
   278  	for shard := range requestedRanges.Iter() {
   279  		expectedRanges.Set(shard, xtime.NewRanges(xtime.Range{
   280  			Start: expectedRangeStart,
   281  			End:   expectedRangeEnd,
   282  		}))
   283  	}
   284  
   285  	compactor.Lock()
   286  	defer compactor.Unlock()
   287  	seg, err := compactor.Compactor.CompactUsingBuilder(builder, nil, mmap.ReporterOptions{
   288  		Context: mmap.Context{
   289  			Name: mmapBootstrapIndexName,
   290  		},
   291  		Reporter: mmapReporter,
   292  	})
   293  	if err != nil {
   294  		return result.IndexBlock{}, err
   295  	}
   296  
   297  	segs := []result.Segment{result.NewSegment(seg, false)}
   298  	indexResult := result.NewIndexBlock(segs, expectedRanges)
   299  	return indexResult, nil
   300  }
   301  
   302  // GetDefaultIndexBlockForBlockStart gets the index block for the default volume type from the index results.
   303  func GetDefaultIndexBlockForBlockStart(
   304  	results result.IndexResults,
   305  	blockStart xtime.UnixNano,
   306  ) (result.IndexBlock, bool) {
   307  	indexBlockByVolumeType, ok := results[blockStart]
   308  	if !ok {
   309  		// NB(bodu): We currently write empty data files to disk, which means that we can attempt to bootstrap
   310  		// time ranges that have no data and no index block.
   311  		// For example:
   312  		// - peers data bootstrap from peer nodes receives peer blocks w/ no data (empty)
   313  		// - peers data bootstrap writes empty ts data files to disk
   314  		// - peers index bootstrap reads empty ts data files md from disk
   315  		// - attempt to bootstrap time ranges that have no index results block
   316  		return result.IndexBlock{}, false
   317  	}
   318  	indexBlock, ok := indexBlockByVolumeType.GetBlock(idxpersist.DefaultIndexVolumeType)
   319  	if !ok {
   320  		return result.IndexBlock{}, false
   321  	}
   322  	return indexBlock, true
   323  }