github.com/m3db/m3@v1.5.1-0.20231129193456-75a402aa583b/src/dbnode/persist/fs/streaming_write.go

// Copyright (c) 2020 Uber Technologies, Inc.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.

package fs

import (
	"bytes"
	"fmt"
	"io"
	"math"
	"time"

	"github.com/m3db/m3/src/dbnode/persist"
	"github.com/m3db/m3/src/dbnode/ts"
	"github.com/m3db/m3/src/x/ident"
	xtime "github.com/m3db/m3/src/x/time"

	"github.com/m3db/bloom/v4"
)

// StreamingWriter writes into the data fileset without intermediate buffering.
// Writes must be lexicographically ordered by the id.
type StreamingWriter interface {
	io.Closer

	// Open opens the files for writing data to the given shard in the given namespace.
	Open(opts StreamingWriterOpenOptions) error

	// WriteAll writes the id and all byte slices and returns an error on a write error.
	// Callers must call this method with strictly lexicographically increasing ID values.
	WriteAll(id ident.BytesID, encodedTags ts.EncodedTags, data [][]byte, dataChecksum uint32) error

	// Abort closes the file descriptors without writing out a checkpoint file.
	Abort() error
}
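
// A minimal caller-side sketch of the intended lifecycle (illustration only;
// the concrete option values and the seriesID/encodedTags/dataSegments/checksum
// names below are assumptions, not part of this file):
//
//	w, err := NewStreamingWriter(NewOptions())
//	if err != nil { ... }
//	err = w.Open(StreamingWriterOpenOptions{
//		NamespaceID:         ident.StringID("metrics"),
//		ShardID:             42,
//		BlockStart:          blockStart, // xtime.UnixNano, block-aligned
//		BlockSize:           2 * time.Hour,
//		VolumeIndex:         0,
//		PlannedRecordsCount: 100000, // estimate; must be > 0
//	})
//	// IDs must be strictly lexicographically increasing across calls.
//	err = w.WriteAll(seriesID, encodedTags, dataSegments, checksum)
//	err = w.Close() // writes the bloom filter, info, and checkpoint files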

// StreamingWriterOpenOptions are the options for the StreamingWriter.
type StreamingWriterOpenOptions struct {
	NamespaceID ident.ID
	ShardID     uint32
	BlockStart  xtime.UnixNano
	BlockSize   time.Duration
	VolumeIndex int

	// PlannedRecordsCount is an estimate of the number of series to be written.
	// Must be greater than 0.
	PlannedRecordsCount uint
}

type streamingWriter struct {
	writer       *writer
	options      Options
	currIdx      int64
	prevIDBytes  []byte
	summaryEvery int64
	bloomFilter  *bloom.BloomFilter
	indexOffset  int64
	summaries    int
}

// NewStreamingWriter creates a new streaming writer that writes into the data
// fileset without buffering.
func NewStreamingWriter(opts Options) (StreamingWriter, error) {
	w, err := NewWriter(opts)
	if err != nil {
		return nil, err
	}

	return &streamingWriter{writer: w.(*writer), options: opts}, nil
}

func (w *streamingWriter) Open(opts StreamingWriterOpenOptions) error {
	// PlannedRecordsCount is a uint, so only the zero value is invalid.
	if opts.PlannedRecordsCount == 0 {
		return fmt.Errorf(
			"PlannedRecordsCount must be positive, got %d", opts.PlannedRecordsCount)
	}

	writerOpts := DataWriterOpenOptions{
		BlockSize: opts.BlockSize,
		Identifier: FileSetFileIdentifier{
			Namespace:   opts.NamespaceID,
			Shard:       opts.ShardID,
			BlockStart:  opts.BlockStart,
			VolumeIndex: opts.VolumeIndex,
		},
		FileSetType: persist.FileSetFlushType,
	}

	// Size the bloom filter from the estimated series count and the configured
	// target false positive rate.
	m, k := bloom.EstimateFalsePositiveRate(
		opts.PlannedRecordsCount,
		w.options.IndexBloomFilterFalsePositivePercent(),
	)
	w.bloomFilter = bloom.NewBloomFilter(m, k)

	// Write a summaries entry for roughly every summaryEvery-th index entry,
	// so that summaries make up about IndexSummariesPercent of the index.
	summariesApprox := float64(opts.PlannedRecordsCount) * w.options.IndexSummariesPercent()
	w.summaryEvery = 1
	if summariesApprox > 0 {
		w.summaryEvery = int64(math.Max(1,
			math.Floor(float64(opts.PlannedRecordsCount)/summariesApprox)))
	}

	if err := w.writer.Open(writerOpts); err != nil {
		return err
	}

	w.currIdx = 0
	w.indexOffset = 0
	w.summaries = 0
	w.prevIDBytes = nil

	return nil
}
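
// Worked example of the sizing math above (numbers are illustrative only):
// with PlannedRecordsCount = 100,000 and IndexSummariesPercent() = 0.03,
// summariesApprox = 100,000 * 0.03 = 3,000, so
// summaryEvery = floor(100,000 / 3,000) = 33, and roughly every 33rd index
// entry also gets a summaries entry. The bloom filter is sized from the same
// estimate via bloom.EstimateFalsePositiveRate; since bloom filters have no
// false negatives, an underestimated PlannedRecordsCount inflates the
// realized false positive rate but does not break correctness.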

func (w *streamingWriter) WriteAll(
	id ident.BytesID,
	encodedTags ts.EncodedTags,
	data [][]byte,
	dataChecksum uint32,
) error {
	// Need to check if w.prevIDBytes != nil, otherwise we can never write an
	// empty string ID.
	if w.prevIDBytes != nil && bytes.Compare(id, w.prevIDBytes) <= 0 {
		return fmt.Errorf(
			"ids must be written in lexicographic order, no duplicates, but got %s followed by %s",
			w.prevIDBytes, id)
	}
	w.prevIDBytes = append(w.prevIDBytes[:0], id...)

	entry, ok, err := w.writeData(data, dataChecksum)
	if err != nil {
		return err
	}

	if ok {
		return w.writeIndexRelated(id, encodedTags, entry)
	}

	return nil
}

func (w *streamingWriter) writeData(
	data [][]byte,
	dataChecksum uint32,
) (indexEntry, bool, error) {
	var size int64
	for _, d := range data {
		size += int64(len(d))
	}
	// Skip empty entries; they get neither an index nor a summaries entry.
	if size == 0 {
		return indexEntry{}, false, nil
	}

	entry := indexEntry{
		index:          w.currIdx,
		dataFileOffset: w.writer.currOffset,
		size:           uint32(size),
		dataChecksum:   dataChecksum,
	}
	for _, d := range data {
		if err := w.writer.writeData(d); err != nil {
			return indexEntry{}, false, err
		}
	}

	w.currIdx++

	return entry, true, nil
}

func (w *streamingWriter) writeIndexRelated(
	id ident.BytesID,
	encodedTags ts.EncodedTags,
	entry indexEntry,
) error {
	// Add to the bloom filter; note this must be zero alloc or else it will
	// cause heavy GC churn as we flush millions of series at the end of each
	// time window.
	w.bloomFilter.Add(id)

	writeSummary := w.summaryEvery == 0 || entry.index%w.summaryEvery == 0
	if writeSummary {
		// Capture the offset for when we write this summary back; only capture
		// for every summary we'll actually write to avoid a few memcopies.
		entry.indexFileOffset = w.indexOffset
	}

	length, err := w.writer.writeIndexWithEncodedTags(id, encodedTags, entry)
	if err != nil {
		return err
	}
	w.indexOffset += length

	if writeSummary {
		err = w.writer.writeSummariesEntry(id, entry)
		if err != nil {
			return err
		}
		w.summaries++
	}

	return nil
}

func (w *streamingWriter) Close() error {
	// Write the bloom filter bitset out.
	if err := w.writer.writeBloomFilterFileContents(w.bloomFilter); err != nil {
		return err
	}

	if err := w.writer.writeInfoFileContents(w.bloomFilter, w.summaries, w.currIdx); err != nil {
		return err
	}

	w.bloomFilter = nil

	err := w.writer.closeWOIndex()
	if err != nil {
		w.writer.err = err
		return err
	}

	// NB(xichen): only write out the checkpoint file if there are no errors
	// encountered between calling writer.Open() and writer.Close().
	if err := writeCheckpointFile(
		w.writer.checkpointFilePath,
		w.writer.digestFdWithDigestContents.Digest().Sum32(),
		w.writer.digestBuf,
		w.writer.newFileMode,
	); err != nil {
		w.writer.err = err
		return err
	}

	return nil
}

func (w *streamingWriter) Abort() error {
	err := w.writer.closeWOIndex()
	if err != nil {
		w.writer.err = err
		return err
	}

	return nil
}
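
// A hedged caller-side sketch of pairing Open with Abort (the flushShard and
// writeSeries helpers are hypothetical): on any failure after Open, calling
// Abort instead of Close means no checkpoint file is written, so the partial
// fileset is never treated as complete.
//
//	func flushShard(w StreamingWriter, openOpts StreamingWriterOpenOptions) error {
//		if err := w.Open(openOpts); err != nil {
//			return err
//		}
//		if err := writeSeries(w); err != nil {
//			w.Abort() // closes descriptors without a checkpoint file
//			return err
//		}
//		return w.Close()
//	}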