github.com/m3db/m3@v1.5.1-0.20231129193456-75a402aa583b/src/dbnode/storage/bootstrap/bootstrapper/persist.go (about) 1 // Copyright (c) 2020 Uber Technologies, Inc. 2 // 3 // Permission is hereby granted, free of charge, to any person obtaining a copy 4 // of this software and associated documentation files (the "Software"), to deal 5 // in the Software without restriction, including without limitation the rights 6 // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 // copies of the Software, and to permit persons to whom the Software is 8 // furnished to do so, subject to the following conditions: 9 // 10 // The above copyright notice and this permission notice shall be included in 11 // all copies or substantial portions of the Software. 12 // 13 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 // THE SOFTWARE. 20 21 package bootstrapper 22 23 import ( 24 "fmt" 25 "sync" 26 27 "github.com/m3db/m3/src/dbnode/namespace" 28 "github.com/m3db/m3/src/dbnode/persist" 29 "github.com/m3db/m3/src/dbnode/persist/fs" 30 "github.com/m3db/m3/src/dbnode/retention" 31 "github.com/m3db/m3/src/dbnode/storage/bootstrap/result" 32 "github.com/m3db/m3/src/dbnode/storage/index/compaction" 33 "github.com/m3db/m3/src/m3ninx/index/segment" 34 idxpersist "github.com/m3db/m3/src/m3ninx/persist" 35 "github.com/m3db/m3/src/x/mmap" 36 xtime "github.com/m3db/m3/src/x/time" 37 ) 38 39 const ( 40 mmapBootstrapIndexName = "mmap.bootstrap.index" 41 ) 42 43 // SharedPersistManager is a lockable persist manager that's safe to be shared across threads. 44 type SharedPersistManager struct { 45 sync.Mutex 46 Mgr persist.Manager 47 } 48 49 // SharedCompactor is a lockable compactor that's safe to be shared across threads. 50 type SharedCompactor struct { 51 sync.Mutex 52 Compactor *compaction.Compactor 53 } 54 55 // PersistBootstrapIndexSegment is a helper function that persists bootstrapped index segments for a ns -> block of time. 56 func PersistBootstrapIndexSegment( 57 ns namespace.Metadata, 58 requestedRanges result.ShardTimeRanges, 59 builder segment.DocumentsBuilder, 60 persistManager *SharedPersistManager, 61 indexClaimsManager fs.IndexClaimsManager, 62 resultOpts result.Options, 63 fulfilled result.ShardTimeRanges, 64 blockStart xtime.UnixNano, 65 blockEnd xtime.UnixNano, 66 ) (result.IndexBlock, error) { 67 // No-op if there are no documents that need to be written for this time block (nothing to persist). 68 if len(builder.Docs()) == 0 { 69 return result.IndexBlock{}, nil 70 } 71 72 // If we're performing an index run with persistence enabled 73 // determine if we covered a full block exactly (which should 74 // occur since we always group readers by block size). 75 _, max := requestedRanges.MinMax() 76 expectedRangeStart, expectedRangeEnd := blockStart, blockEnd 77 78 // Index blocks can be arbitrarily larger than data blocks, but the 79 // retention of the namespace is based on the size of the data blocks, 80 // not the index blocks. As a result, it's possible that the block start 81 // for the earliest index block is before the earliest possible retention 82 // time. 83 // If that is the case, then we snap the expected range start to the 84 // earliest retention block start because that is the point in time for 85 // which we'll actually have data available to construct a segment from. 86 // 87 // Example: 88 // Index block size: 4 hours 89 // Data block size: 2 hours 90 // Retention: 6 hours 91 // [12PM->2PM][2PM->4PM][4PM->6PM] (Data Blocks) 92 // [10AM -> 2PM][2PM -> 6PM] (Index Blocks) 93 retentionOpts := ns.Options().RetentionOptions() 94 nowFn := resultOpts.ClockOptions().NowFn() 95 now := xtime.ToUnixNano(nowFn()) 96 earliestRetentionTime := retention.FlushTimeStart(retentionOpts, now) 97 98 // If bootstrapping is taking more time than our retention period, we might end up in a situation 99 // when earliestRetentionTime is larger than out block end time. This means that the blocks 100 // got outdated during bootstrap so we just skip building index segments for them. 101 if !blockEnd.After(earliestRetentionTime) { 102 return result.IndexBlock{}, fs.ErrIndexOutOfRetention 103 } 104 105 if blockStart.Before(earliestRetentionTime) { 106 expectedRangeStart = earliestRetentionTime 107 } 108 109 shards := make(map[uint32]struct{}) 110 expectedRanges := result.NewShardTimeRangesFromSize(requestedRanges.Len()) 111 for shard := range requestedRanges.Iter() { 112 shards[shard] = struct{}{} 113 expectedRanges.Set(shard, xtime.NewRanges(xtime.Range{ 114 Start: expectedRangeStart, 115 End: expectedRangeEnd, 116 })) 117 } 118 119 return persistBootstrapIndexSegment( 120 ns, 121 shards, 122 builder, 123 persistManager, 124 indexClaimsManager, 125 requestedRanges, 126 expectedRanges, 127 fulfilled, 128 blockStart, 129 max, 130 ) 131 } 132 133 func persistBootstrapIndexSegment( 134 ns namespace.Metadata, 135 shards map[uint32]struct{}, 136 builder segment.DocumentsBuilder, 137 persistManager *SharedPersistManager, 138 indexClaimsManager fs.IndexClaimsManager, 139 requestedRanges result.ShardTimeRanges, 140 expectedRanges result.ShardTimeRanges, 141 fulfilled result.ShardTimeRanges, 142 blockStart xtime.UnixNano, 143 max xtime.UnixNano, 144 ) (result.IndexBlock, error) { 145 // Check that we completely fulfilled all shards for the block 146 // and we didn't bootstrap any more/less than expected. 147 requireFulfilled := expectedRanges.Copy() 148 requireFulfilled.Subtract(fulfilled) 149 exactStartEnd := max.Equal(blockStart.Add(ns.Options().IndexOptions().BlockSize())) 150 if !exactStartEnd || !requireFulfilled.IsEmpty() { 151 return result.IndexBlock{}, fmt.Errorf("persistent fs index bootstrap invalid ranges to persist: "+ 152 "expected=%v, actual=%v, fulfilled=%v, exactStartEnd=%v, requireFulfilledEmpty=%v", 153 expectedRanges.String(), requestedRanges.String(), fulfilled.String(), 154 exactStartEnd, requireFulfilled.IsEmpty()) 155 } 156 157 // NB(r): Need to get an exclusive lock to actually write the segment out 158 // due to needing to incrementing the index file set volume index and also 159 // using non-thread safe resources on the persist manager. 160 persistManager.Lock() 161 defer persistManager.Unlock() 162 163 flush, err := persistManager.Mgr.StartIndexPersist() 164 if err != nil { 165 return result.IndexBlock{}, err 166 } 167 168 var calledDone bool 169 defer func() { 170 if !calledDone { 171 flush.DoneIndex() 172 } 173 }() 174 175 volumeIndex, err := indexClaimsManager.ClaimNextIndexFileSetVolumeIndex( 176 ns, 177 blockStart, 178 ) 179 if err != nil { 180 return result.IndexBlock{}, fmt.Errorf("failed to claim next index volume index: %w", err) 181 } 182 183 preparedPersist, err := flush.PrepareIndex(persist.IndexPrepareOptions{ 184 NamespaceMetadata: ns, 185 BlockStart: blockStart, 186 FileSetType: persist.FileSetFlushType, 187 Shards: shards, 188 // NB(bodu): Assume default volume type when persisted bootstrapped index data. 189 IndexVolumeType: idxpersist.DefaultIndexVolumeType, 190 VolumeIndex: volumeIndex, 191 }) 192 if err != nil { 193 return result.IndexBlock{}, err 194 } 195 196 var calledClose bool 197 defer func() { 198 if !calledClose { 199 preparedPersist.Close() 200 } 201 }() 202 203 if err := preparedPersist.Persist(builder); err != nil { 204 return result.IndexBlock{}, err 205 } 206 207 calledClose = true 208 persistedSegments, err := preparedPersist.Close() 209 if err != nil { 210 return result.IndexBlock{}, err 211 } 212 213 calledDone = true 214 if err := flush.DoneIndex(); err != nil { 215 return result.IndexBlock{}, err 216 } 217 segments := make([]result.Segment, 0, len(persistedSegments)) 218 for _, pSeg := range persistedSegments { 219 segments = append(segments, result.NewSegment(pSeg, true)) 220 } 221 222 return result.NewIndexBlock(segments, expectedRanges), nil 223 } 224 225 // BuildBootstrapIndexSegment is a helper function that builds (in memory) bootstrapped index segments for a ns -> block of time. 226 func BuildBootstrapIndexSegment( 227 ns namespace.Metadata, 228 requestedRanges result.ShardTimeRanges, 229 builder segment.DocumentsBuilder, 230 compactor *SharedCompactor, 231 resultOpts result.Options, 232 mmapReporter mmap.Reporter, 233 blockStart xtime.UnixNano, 234 blockEnd xtime.UnixNano, 235 ) (result.IndexBlock, error) { 236 // No-op if there are no documents that need to be written for this time block (nothing to persist). 237 if len(builder.Docs()) == 0 { 238 return result.IndexBlock{}, nil 239 } 240 241 // If we're performing an index run with persistence enabled 242 // determine if we covered a full block exactly (which should 243 // occur since we always group readers by block size). 244 expectedRangeStart, expectedRangeEnd := blockStart, blockEnd 245 246 // Index blocks can be arbitrarily larger than data blocks, but the 247 // retention of the namespace is based on the size of the data blocks, 248 // not the index blocks. As a result, it's possible that the block start 249 // for the earliest index block is before the earliest possible retention 250 // time. 251 // If that is the case, then we snap the expected range start to the 252 // earliest retention block start because that is the point in time for 253 // which we'll actually have data available to construct a segment from. 254 // 255 // Example: 256 // Index block size: 4 hours 257 // Data block size: 2 hours 258 // Retention: 6 hours 259 // [12PM->2PM)[2PM->4PM)[4PM->6PM) (Data Blocks) 260 // [10AM -> 2PM)[2PM -> 6PM) (Index Blocks) 261 retentionOpts := ns.Options().RetentionOptions() 262 nowFn := resultOpts.ClockOptions().NowFn() 263 now := xtime.ToUnixNano(nowFn()) 264 earliestRetentionTime := retention.FlushTimeStart(retentionOpts, now) 265 266 // If bootstrapping is taking more time than our retention period, we might end up in a situation 267 // when earliestRetentionTime is larger than out block end time. This means that the blocks 268 // got outdated during bootstrap so we just skip building index segments for them. 269 if !blockEnd.After(earliestRetentionTime) { 270 return result.IndexBlock{}, fs.ErrIndexOutOfRetention 271 } 272 273 if blockStart.Before(earliestRetentionTime) { 274 expectedRangeStart = earliestRetentionTime 275 } 276 277 expectedRanges := result.NewShardTimeRangesFromSize(requestedRanges.Len()) 278 for shard := range requestedRanges.Iter() { 279 expectedRanges.Set(shard, xtime.NewRanges(xtime.Range{ 280 Start: expectedRangeStart, 281 End: expectedRangeEnd, 282 })) 283 } 284 285 compactor.Lock() 286 defer compactor.Unlock() 287 seg, err := compactor.Compactor.CompactUsingBuilder(builder, nil, mmap.ReporterOptions{ 288 Context: mmap.Context{ 289 Name: mmapBootstrapIndexName, 290 }, 291 Reporter: mmapReporter, 292 }) 293 if err != nil { 294 return result.IndexBlock{}, err 295 } 296 297 segs := []result.Segment{result.NewSegment(seg, false)} 298 indexResult := result.NewIndexBlock(segs, expectedRanges) 299 return indexResult, nil 300 } 301 302 // GetDefaultIndexBlockForBlockStart gets the index block for the default volume type from the index results. 303 func GetDefaultIndexBlockForBlockStart( 304 results result.IndexResults, 305 blockStart xtime.UnixNano, 306 ) (result.IndexBlock, bool) { 307 indexBlockByVolumeType, ok := results[blockStart] 308 if !ok { 309 // NB(bodu): We currently write empty data files to disk, which means that we can attempt to bootstrap 310 // time ranges that have no data and no index block. 311 // For example: 312 // - peers data bootstrap from peer nodes receives peer blocks w/ no data (empty) 313 // - peers data bootstrap writes empty ts data files to disk 314 // - peers index bootstrap reads empty ts data files md from disk 315 // - attempt to bootstrap time ranges that have no index results block 316 return result.IndexBlock{}, false 317 } 318 indexBlock, ok := indexBlockByVolumeType.GetBlock(idxpersist.DefaultIndexVolumeType) 319 if !ok { 320 return result.IndexBlock{}, false 321 } 322 return indexBlock, true 323 }