github.com/m3db/m3@v1.5.1-0.20231129193456-75a402aa583b/src/dbnode/persist/fs/write.go

// Copyright (c) 2016 Uber Technologies, Inc.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.

package fs

import (
	"bytes"
	"errors"
	"fmt"
	"math"
	"os"
	"sort"
	"time"

	"github.com/m3db/m3/src/dbnode/digest"
	"github.com/m3db/m3/src/dbnode/persist"
	"github.com/m3db/m3/src/dbnode/persist/fs/msgpack"
	"github.com/m3db/m3/src/dbnode/persist/schema"
	"github.com/m3db/m3/src/dbnode/ts"
	"github.com/m3db/m3/src/x/checked"
	"github.com/m3db/m3/src/x/ident"
	xresource "github.com/m3db/m3/src/x/resource"
	"github.com/m3db/m3/src/x/serialize"
	xtime "github.com/m3db/m3/src/x/time"

	"github.com/m3db/bloom/v4"
	"github.com/pborman/uuid"
)

const (
	// CheckpointFileSizeBytes is the expected size of a valid checkpoint file.
	CheckpointFileSizeBytes = 4
)

var errWriterEncodeTagsDataNotAccessible = errors.New(
	"failed to encode tags: cannot get data")

type writer struct {
	blockSize        time.Duration
	filePathPrefix   string
	newFileMode      os.FileMode
	newDirectoryMode os.FileMode

	summariesPercent                float64
	bloomFilterFalsePositivePercent float64
	bufferSize                      int

	infoFdWithDigest           digest.FdWithDigestWriter
	indexFdWithDigest          digest.FdWithDigestWriter
	summariesFdWithDigest      digest.FdWithDigestWriter
	bloomFilterFdWithDigest    digest.FdWithDigestWriter
	dataFdWithDigest           digest.FdWithDigestWriter
	digestFdWithDigestContents digest.FdWithDigestContentsWriter
	checkpointFilePath         string
	indexEntries               indexEntries

	start        xtime.UnixNano
	volumeIndex  int
	snapshotTime xtime.UnixNano
	snapshotID   uuid.UUID

	currIdx            int64
	currOffset         int64
	encoder            *msgpack.Encoder
	digestBuf          digest.Buffer
	singleCheckedBytes []checked.Bytes
	tagsIterator       ident.TagsIterator
	tagEncoderPool     serialize.TagEncoderPool
	err                error
}

type indexEntry struct {
	index           int64
	dataFileOffset  int64
	indexFileOffset int64
	size            uint32
	dataChecksum    uint32
}

type indexEntryWithMetadata struct {
	entry    indexEntry
	metadata persist.Metadata
}

type indexEntries []indexEntryWithMetadata

func (e indexEntries) releaseRefs() {
	// Close any metadata.
	for _, elem := range e {
		elem.metadata.Finalize()
	}
	// Apply memset zero loop optimization.
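	// (The Go compiler recognizes this assign-a-zero-value-in-a-range-loop
	// pattern and lowers it to an efficient memclr of the backing array,
	// which also drops any references still held by the entries.)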
	var zeroed indexEntryWithMetadata
	for i := range e {
		e[i] = zeroed
	}
}

func (e indexEntries) Len() int {
	return len(e)
}

func (e indexEntries) Less(i, j int) bool {
	return bytes.Compare(e[i].metadata.BytesID(), e[j].metadata.BytesID()) < 0
}

func (e indexEntries) Swap(i, j int) {
	e[i], e[j] = e[j], e[i]
}

// NewWriter returns a new writer with options.
func NewWriter(opts Options) (DataFileSetWriter, error) {
	if err := opts.Validate(); err != nil {
		return nil, err
	}
	bufferSize := opts.WriterBufferSize()
	return &writer{
		filePathPrefix:                  opts.FilePathPrefix(),
		newFileMode:                     opts.NewFileMode(),
		newDirectoryMode:                opts.NewDirectoryMode(),
		summariesPercent:                opts.IndexSummariesPercent(),
		bloomFilterFalsePositivePercent: opts.IndexBloomFilterFalsePositivePercent(),
		bufferSize:                      bufferSize,
		infoFdWithDigest:                digest.NewFdWithDigestWriter(bufferSize),
		indexFdWithDigest:               digest.NewFdWithDigestWriter(bufferSize),
		summariesFdWithDigest:           digest.NewFdWithDigestWriter(bufferSize),
		bloomFilterFdWithDigest:         digest.NewFdWithDigestWriter(bufferSize),
		dataFdWithDigest:                digest.NewFdWithDigestWriter(bufferSize),
		digestFdWithDigestContents:      digest.NewFdWithDigestContentsWriter(bufferSize),
		encoder:                         msgpack.NewEncoderWithOptions(opts.EncodingOptions()),
		digestBuf:                       digest.NewBuffer(),
		singleCheckedBytes:              make([]checked.Bytes, 1),
		tagsIterator:                    ident.NewTagsIterator(ident.Tags{}),
		tagEncoderPool:                  opts.TagEncoderPool(),
	}, nil
}
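
// Illustrative lifecycle of a writer (a sketch, not taken from this repo's
// tests; construction of the Options value and series data is elided, and
// the exact option/identifier field names are assumed from this file):
//
//	w, err := NewWriter(opts)
//	if err != nil {
//		return err
//	}
//	err = w.Open(DataWriterOpenOptions{
//		BlockSize: 2 * time.Hour,
//		Identifier: FileSetFileIdentifier{
//			Namespace:   nsID,
//			Shard:       shard,
//			BlockStart:  blockStart,
//			VolumeIndex: 0,
//		},
//		FileSetType: persist.FileSetFlushType,
//	})
//	// ... one Write/WriteAll call per series ...
//	err = w.Close()
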
// Open initializes the internal state for writing to the given shard,
// specifically creating the shard directory if it doesn't exist, and
// opening / truncating files associated with that shard for writing.
func (w *writer) Open(opts DataWriterOpenOptions) error {
	var (
		err         error
		namespace   = opts.Identifier.Namespace
		shard       = opts.Identifier.Shard
		blockStart  = opts.Identifier.BlockStart
		volumeIndex = opts.Identifier.VolumeIndex
	)
	w.reset(opts)

	var (
		shardDir            string
		infoFilepath        string
		indexFilepath       string
		summariesFilepath   string
		bloomFilterFilepath string
		dataFilepath        string
		digestFilepath      string
	)
	switch opts.FileSetType {
	case persist.FileSetSnapshotType:
		shardDir = ShardSnapshotsDirPath(w.filePathPrefix, namespace, shard)
		// Can't do this outside of the switch statement because the directory
		// must exist before we construct and open the snapshot file paths below.
		if err := os.MkdirAll(shardDir, w.newDirectoryMode); err != nil {
			return err
		}

		w.checkpointFilePath = FilesetPathFromTimeAndIndex(shardDir, blockStart, volumeIndex, CheckpointFileSuffix)
		infoFilepath = FilesetPathFromTimeAndIndex(shardDir, blockStart, volumeIndex, InfoFileSuffix)
		indexFilepath = FilesetPathFromTimeAndIndex(shardDir, blockStart, volumeIndex, indexFileSuffix)
		summariesFilepath = FilesetPathFromTimeAndIndex(shardDir, blockStart, volumeIndex, summariesFileSuffix)
		bloomFilterFilepath = FilesetPathFromTimeAndIndex(shardDir, blockStart, volumeIndex, bloomFilterFileSuffix)
		dataFilepath = FilesetPathFromTimeAndIndex(shardDir, blockStart, volumeIndex, dataFileSuffix)
		digestFilepath = FilesetPathFromTimeAndIndex(shardDir, blockStart, volumeIndex, DigestFileSuffix)
	case persist.FileSetFlushType:
		shardDir = ShardDataDirPath(w.filePathPrefix, namespace, shard)
		if err := os.MkdirAll(shardDir, w.newDirectoryMode); err != nil {
			return err
		}

		w.checkpointFilePath = dataFilesetPathFromTimeAndIndex(shardDir, blockStart, volumeIndex, CheckpointFileSuffix, false)
		infoFilepath = dataFilesetPathFromTimeAndIndex(shardDir, blockStart, volumeIndex, InfoFileSuffix, false)
		indexFilepath = dataFilesetPathFromTimeAndIndex(shardDir, blockStart, volumeIndex, indexFileSuffix, false)
		summariesFilepath = dataFilesetPathFromTimeAndIndex(shardDir, blockStart, volumeIndex, summariesFileSuffix, false)
		bloomFilterFilepath = dataFilesetPathFromTimeAndIndex(shardDir, blockStart, volumeIndex, bloomFilterFileSuffix, false)
		dataFilepath = dataFilesetPathFromTimeAndIndex(shardDir, blockStart, volumeIndex, dataFileSuffix, false)
		digestFilepath = dataFilesetPathFromTimeAndIndex(shardDir, blockStart, volumeIndex, DigestFileSuffix, false)
	default:
		return fmt.Errorf("unable to open writer with fileset type: %s", opts.FileSetType)
	}

	var infoFd, indexFd, summariesFd, bloomFilterFd, dataFd, digestFd *os.File
	err = openFiles(w.openWritable,
		map[string]**os.File{
			infoFilepath:        &infoFd,
			indexFilepath:       &indexFd,
			summariesFilepath:   &summariesFd,
			bloomFilterFilepath: &bloomFilterFd,
			dataFilepath:        &dataFd,
			digestFilepath:      &digestFd,
		},
	)
	if err != nil {
		return err
	}

	w.infoFdWithDigest.Reset(infoFd)
	w.indexFdWithDigest.Reset(indexFd)
	w.summariesFdWithDigest.Reset(summariesFd)
	w.bloomFilterFdWithDigest.Reset(bloomFilterFd)
	w.dataFdWithDigest.Reset(dataFd)
	w.digestFdWithDigestContents.Reset(digestFd)

	return nil
}

func (w *writer) reset(opts DataWriterOpenOptions) {
	w.blockSize = opts.BlockSize
	w.start = opts.Identifier.BlockStart
	w.volumeIndex = opts.Identifier.VolumeIndex
	w.snapshotTime = opts.Snapshot.SnapshotTime
	w.snapshotID = opts.Snapshot.SnapshotID
	w.currIdx = 0
	w.currOffset = 0
	w.err = nil
	// This also happens after the previous set of index files is written;
	// however, do it again here to ensure the entries are cleared even if a
	// premature error while writing the previous set of files prevented it.
	w.indexEntries.releaseRefs()
	w.indexEntries = w.indexEntries[:0]
}
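
// A fileset volume written by this writer therefore comprises seven files:
// info, index, summaries, bloom filter, data and digests, plus a checkpoint
// file that is written last (see Close) to mark the volume as complete.
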
func (w *writer) writeData(data []byte) error {
	if len(data) == 0 {
		return nil
	}
	written, err := w.dataFdWithDigest.Write(data)
	if err != nil {
		return err
	}
	w.currOffset += int64(written)
	return nil
}

func (w *writer) Write(
	metadata persist.Metadata,
	data checked.Bytes,
	dataChecksum uint32,
) error {
	w.singleCheckedBytes[0] = data
	return w.WriteAll(metadata, w.singleCheckedBytes, dataChecksum)
}

func (w *writer) WriteAll(
	metadata persist.Metadata,
	data []checked.Bytes,
	dataChecksum uint32,
) error {
	if w.err != nil {
		return w.err
	}

	if err := w.writeAll(metadata, data, dataChecksum); err != nil {
		w.err = err
		return err
	}
	return nil
}

func (w *writer) writeAll(
	metadata persist.Metadata,
	data []checked.Bytes,
	dataChecksum uint32,
) error {
	var size int64
	for _, d := range data {
		if d == nil {
			continue
		}
		size += int64(d.Len())
	}
	if size == 0 {
		return nil
	}

	entry := indexEntryWithMetadata{
		entry: indexEntry{
			index:          w.currIdx,
			dataFileOffset: w.currOffset,
			size:           uint32(size),
			dataChecksum:   dataChecksum,
		},
		metadata: metadata,
	}
	for _, d := range data {
		if d == nil {
			continue
		}
		if err := w.writeData(d.Bytes()); err != nil {
			return err
		}
	}

	w.indexEntries = append(w.indexEntries, entry)
	w.currIdx++

	return nil
}
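
// A series may arrive as several checked.Bytes segments; they are written
// back to back into the data file and described by a single index entry
// recording the starting offset and total size. A minimal call sketch
// (persist.NewMetadataFromIDAndTags and digest.Checksum are assumed from
// their respective packages):
//
//	meta := persist.NewMetadataFromIDAndTags(seriesID, seriesTags,
//		persist.MetadataOptions{})
//	err := w.Write(meta, seriesBytes, digest.Checksum(seriesBytes.Bytes()))
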
func (w *writer) Close() error {
	err := w.close()
	if w.err != nil {
		return w.err
	}
	if err != nil {
		w.err = err
		return err
	}
	// NB(xichen): only write out the checkpoint file if there are no errors
	// encountered between calling writer.Open() and writer.Close().
	if err := writeCheckpointFile(
		w.checkpointFilePath,
		w.digestFdWithDigestContents.Digest().Sum32(),
		w.digestBuf,
		w.newFileMode,
	); err != nil {
		w.err = err
		return err
	}
	return nil
}

func (w *writer) DeferClose() (persist.DataCloser, error) {
	err := w.close()
	if w.err != nil {
		return nil, w.err
	}
	if err != nil {
		w.err = err
		return nil, err
	}
	checkpointFilePath := w.checkpointFilePath
	digestChecksum := w.digestFdWithDigestContents.Digest().Sum32()
	newFileMode := w.newFileMode
	return func() error {
		return writeCheckpointFile(
			checkpointFilePath,
			digestChecksum,
			digest.NewBuffer(),
			newFileMode,
		)
	}, nil
}

func (w *writer) close() error {
	if err := w.writeIndexRelatedFiles(); err != nil {
		return err
	}

	return w.closeWOIndex()
}

func (w *writer) closeWOIndex() error {
	if err := w.digestFdWithDigestContents.WriteDigests(
		w.infoFdWithDigest.Digest().Sum32(),
		w.indexFdWithDigest.Digest().Sum32(),
		w.summariesFdWithDigest.Digest().Sum32(),
		w.bloomFilterFdWithDigest.Digest().Sum32(),
		w.dataFdWithDigest.Digest().Sum32(),
	); err != nil {
		return err
	}

	return xresource.CloseAll(
		w.infoFdWithDigest,
		w.indexFdWithDigest,
		w.summariesFdWithDigest,
		w.bloomFilterFdWithDigest,
		w.dataFdWithDigest,
		w.digestFdWithDigestContents,
	)
}

func (w *writer) openWritable(filePath string) (*os.File, error) {
	return OpenWritable(filePath, w.newFileMode)
}

func (w *writer) writeIndexRelatedFiles() error {
	summariesApprox := float64(len(w.indexEntries)) * w.summariesPercent
	summaryEvery := 0
	if summariesApprox > 0 {
		summaryEvery = int(math.Floor(float64(len(w.indexEntries)) / summariesApprox))
	}

	// Write the index entries and calculate the bloom filter.
	n, p := uint(w.currIdx), w.bloomFilterFalsePositivePercent
	if n == 0 {
		n = 1
	}
	m, k := bloom.EstimateFalsePositiveRate(n, p)
	bloomFilter := bloom.NewBloomFilter(m, k)

	err := w.writeIndexFileContents(bloomFilter, summaryEvery)
	if err != nil {
		return err
	}

	// Write summaries and start zeroing out memory to avoid holding onto refs.
	summaries, err := w.writeSummariesFileContents(summaryEvery)
	if err != nil {
		return err
	}

	// Reset the index entries slice to avoid allocs on the next shard flush;
	// otherwise this would leak memory. Be sure to release all refs before
	// resizing to avoid the GC holding roots.
	w.indexEntries.releaseRefs()
	w.indexEntries = w.indexEntries[:0]

	// Write the bloom filter bitset out.
	if err := w.writeBloomFilterFileContents(bloomFilter); err != nil {
		return err
	}

	return w.writeInfoFileContents(bloomFilter, summaries, w.currIdx)
}
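
// Worked example for the sizing above (illustrative numbers only): with
// 1,000,000 index entries and summariesPercent = 0.01, summariesApprox is
// 10,000 and summaryEvery is 100, i.e. every 100th entry gets a summary.
// The bloom filter is sized from n = currIdx and the configured false
// positive rate p; assuming the library uses the standard estimate, that is
// roughly m ≈ -n·ln(p)/(ln 2)² bits and k ≈ (m/n)·ln 2 hash functions.
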
func (w *writer) writeIndexFileContents(
	bloomFilter *bloom.BloomFilter,
	summaryEvery int,
) error {
	// NB(r): Write the index file in sorted ID order. In the future we could
	// keep entries sorted as they arrive to avoid this sort at the end, however
	// that would require significant changes in storage/databaseShard to store
	// series in order, which would sacrifice the O(1) insertion of new series
	// we currently have.
	//
	// Even then, we would probably still want to do this work at the end so
	// that we don't stripe writes to two different files during the write loop.
	sort.Sort(w.indexEntries)

	var (
		offset       int64
		prevID       []byte
		tagsReusable = w.tagsIterator
		tagsEncoder  = w.tagEncoderPool.Get()
	)
	defer tagsEncoder.Finalize()
	for i, entry := range w.indexEntries {
		metadata := entry.metadata
		id := metadata.BytesID()
		// Need to check if i > 0 or we could never write an empty string ID.
		if i > 0 && bytes.Equal(id, prevID) {
			// Should never happen, Write() should only be called once per ID.
			return fmt.Errorf("encountered duplicate ID: %s", id)
		}

		tagsIter, err := metadata.ResetOrReturnProvidedTagIterator(tagsReusable)
		if err != nil {
			return err
		}

		// Add to the bloom filter. Note this must be zero alloc or else it
		// will cause heavy GC churn as we flush millions of series at the end
		// of each time window.
		bloomFilter.Add(id)

		if i%summaryEvery == 0 {
			// Capture the offset for when we write this summary back, only
			// capture for every summary we'll actually write to avoid a few
			// memcopies.
			w.indexEntries[i].entry.indexFileOffset = offset
		}

		length, err := w.writeIndex(id, tagsIter, tagsEncoder, entry.entry)
		if err != nil {
			return err
		}
		offset += length

		prevID = id
	}

	return nil
}

func (w *writer) writeIndex(
	id []byte,
	tagsIter ident.TagIterator,
	tagsEncoder serialize.TagEncoder,
	entry indexEntry,
) (int64, error) {
	var encodedTags []byte
	if numTags := tagsIter.Remaining(); numTags > 0 {
		tagsEncoder.Reset()
		if err := tagsEncoder.Encode(tagsIter); err != nil {
			return 0, err
		}

		encodedTagsData, ok := tagsEncoder.Data()
		if !ok {
			return 0, errWriterEncodeTagsDataNotAccessible
		}

		encodedTags = encodedTagsData.Bytes()
	}

	return w.writeIndexWithEncodedTags(id, encodedTags, entry)
}

func (w *writer) writeIndexWithEncodedTags(
	id []byte,
	encodedTags ts.EncodedTags,
	entry indexEntry,
) (int64, error) {
	e := schema.IndexEntry{
		Index:        entry.index,
		ID:           id,
		Size:         int64(entry.size),
		Offset:       entry.dataFileOffset,
		DataChecksum: int64(entry.dataChecksum),
		EncodedTags:  encodedTags,
	}

	w.encoder.Reset()
	if err := w.encoder.EncodeIndexEntry(e); err != nil {
		return 0, err
	}

	data := w.encoder.Bytes()
	if _, err := w.indexFdWithDigest.Write(data); err != nil {
		return 0, err
	}

	return int64(len(data)), nil
}

func (w *writer) writeSummariesFileContents(
	summaryEvery int,
) (int, error) {
	summaries := 0
	for i := range w.indexEntries {
		if i%summaryEvery != 0 {
			continue
		}
		err := w.writeSummariesEntry(w.indexEntries[i].metadata.BytesID(), w.indexEntries[i].entry)
		if err != nil {
			return 0, err
		}
		summaries++
	}

	return summaries, nil
}

func (w *writer) writeSummariesEntry(
	id ident.BytesID,
	entry indexEntry,
) error {
	summary := schema.IndexSummary{
		Index:            entry.index,
		ID:               id,
		IndexEntryOffset: entry.indexFileOffset,
	}

	w.encoder.Reset()
	if err := w.encoder.EncodeIndexSummary(summary); err != nil {
		return err
	}

	data := w.encoder.Bytes()
	if _, err := w.summariesFdWithDigest.Write(data); err != nil {
		return err
	}

	return nil
}

func (w *writer) writeBloomFilterFileContents(
	bloomFilter *bloom.BloomFilter,
) error {
	return bloomFilter.BitSet().Write(w.bloomFilterFdWithDigest)
}
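
// The info file written below records what a reader needs to interpret the
// rest of the volume: block start and size, volume index, entry count, how
// many summaries were written, and the bloom filter's m/k parameters.
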
func (w *writer) writeInfoFileContents(
	bloomFilter *bloom.BloomFilter,
	summaries int,
	entriesCount int64,
) error {
	snapshotBytes, err := w.snapshotID.MarshalBinary()
	if err != nil {
		return fmt.Errorf("error marshaling snapshot ID into bytes: %v", err)
	}

	info := schema.IndexInfo{
		BlockStart:   int64(w.start),
		VolumeIndex:  w.volumeIndex,
		SnapshotTime: int64(w.snapshotTime),
		SnapshotID:   snapshotBytes,
		BlockSize:    int64(w.blockSize),
		Entries:      entriesCount,
		MajorVersion: schema.MajorVersion,
		MinorVersion: schema.MinorVersion,
		Summaries: schema.IndexSummariesInfo{
			Summaries: int64(summaries),
		},
		BloomFilter: schema.IndexBloomFilterInfo{
			NumElementsM: int64(bloomFilter.M()),
			NumHashesK:   int64(bloomFilter.K()),
		},
	}

	w.encoder.Reset()
	if err := w.encoder.EncodeIndexInfo(info); err != nil {
		return err
	}

	_, err = w.infoFdWithDigest.Write(w.encoder.Bytes())
	return err
}

func writeCheckpointFile(
	checkpointFilePath string,
	digestChecksum uint32,
	digestBuf digest.Buffer,
	newFileMode os.FileMode,
) error {
	fd, err := OpenWritable(checkpointFilePath, newFileMode)
	if err != nil {
		return err
	}
	if err := digestBuf.WriteDigestToFile(fd, digestChecksum); err != nil {
		// NB(prateek): intentionally skipping fd.Close() error, as failure
		// to write takes precedence over failure to close the file
		fd.Close()
		return err
	}
	return fd.Close()
}
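
// Note the checkpoint file holds only the digest-of-digests written above:
// a single uint32, which is why CheckpointFileSizeBytes is 4. Its presence
// and correct size are what mark a volume as complete.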