github.com/dolthub/dolt/go@v0.40.5-0.20240520175717-68db7794bea6/store/nbs/journal_record.go (about) 1 // Copyright 2022 Dolthub, Inc. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package nbs 16 17 import ( 18 "bufio" 19 "context" 20 "encoding/binary" 21 "errors" 22 "fmt" 23 "io" 24 "time" 25 26 "github.com/dolthub/dolt/go/store/d" 27 "github.com/dolthub/dolt/go/store/hash" 28 ) 29 30 // journalRec is a record in a chunk journal. Its serialization format uses 31 // uint8 tag prefixes to identify fields and allow for format evolution. 32 // 33 // There are two kinds of journalRecs: chunk records and root hash records. 34 // Chunk records store chunks from persisted memTables. Root hash records 35 // store root hash updates to the manifest state. 36 // Future records kinds may include other updates to manifest state such as 37 // updates to GC generation or the table set lock hash. 38 // 39 // +-----------------+-------+---------+-----+-------------------+ 40 // | length (uint32) | tag 0 | field 0 | ... | checksum (uint32) | 41 // +-----------------+-------+---------+-----+-------------------+ 42 // 43 // Currently, the payload field is always written as the penultimate field, 44 // followed only by the fixed-width record checksum. This allows the payload 45 // to be extracted from the journalRec using only the record length and payload 46 // offset. See recLookup for more detail. 
type journalRec struct {
	length    uint32         // total serialized record size in bytes, including this length field
	kind      journalRecKind // record type: chunk record or root hash record
	address   hash.Hash      // chunk address (chunk records) or root hash (root hash records)
	payload   []byte         // snappy-encoded chunk bytes; empty for root hash records
	timestamp time.Time      // record write time; only written for root hash records
	checksum  uint32         // crc over the serialized record bytes preceding the checksum field
}

// payloadOffset returns the journalOffset of the payload within the record
// assuming only the checksum field follows the payload.
func (r journalRec) payloadOffset() uint32 {
	return r.length - uint32(len(r.payload)+journalRecChecksumSz)
}

// uncompressedPayloadSize returns the uncompressed size of the payload.
func (r journalRec) uncompressedPayloadSize() (sz uint64) {
	// |r.payload| is snappy-encoded and starts with
	// the uvarint-encoded uncompressed data size
	sz, _ = binary.Uvarint(r.payload)
	return
}

// journalRecKind identifies the type of a journalRec.
type journalRecKind uint8

const (
	unknownJournalRecKind  journalRecKind = 0
	rootHashJournalRecKind journalRecKind = 1
	chunkJournalRecKind    journalRecKind = 2
)

// journalRecTag identifies a field within a serialized journalRec.
type journalRecTag uint8

const (
	unknownJournalRecTag   journalRecTag = 0
	kindJournalRecTag      journalRecTag = 1
	addrJournalRecTag      journalRecTag = 2
	payloadJournalRecTag   journalRecTag = 3
	timestampJournalRecTag journalRecTag = 4
)

// Serialized sizes, in bytes, of the fixed-width journalRec fields.
const (
	journalRecTagSz       = 1
	journalRecLenSz       = 4
	journalRecKindSz      = 1
	journalRecAddrSz      = 20
	journalRecChecksumSz  = 4
	journalRecTimestampSz = 8

	// todo(andy): less arbitrary
	journalRecMaxSz = 128 * 1024
)

// journalRecordTimestampGenerator returns the current time in Unix epoch seconds. This function is stored in a
// variable so that unit tests can override it to ensure the journal record timestamps are a known, expected value.
102 var journalRecordTimestampGenerator = func() uint64 { 103 return uint64(time.Now().Unix()) 104 } 105 106 func chunkRecordSize(c CompressedChunk) (recordSz, payloadOff uint32) { 107 recordSz += journalRecLenSz 108 recordSz += journalRecTagSz + journalRecKindSz 109 recordSz += journalRecTagSz + journalRecAddrSz 110 recordSz += journalRecTagSz // payload tag 111 payloadOff = recordSz 112 recordSz += uint32(len(c.FullCompressedChunk)) 113 recordSz += journalRecChecksumSz 114 return 115 } 116 117 func rootHashRecordSize() (recordSz int) { 118 recordSz += journalRecLenSz 119 recordSz += journalRecTagSz + journalRecKindSz 120 recordSz += journalRecTagSz + journalRecAddrSz 121 recordSz += journalRecTagSz + journalRecTimestampSz 122 recordSz += journalRecChecksumSz 123 return 124 } 125 126 func writeChunkRecord(buf []byte, c CompressedChunk) (n uint32) { 127 // length 128 l, _ := chunkRecordSize(c) 129 writeUint32(buf[:journalRecLenSz], l) 130 n += journalRecLenSz 131 // kind 132 buf[n] = byte(kindJournalRecTag) 133 n += journalRecTagSz 134 buf[n] = byte(chunkJournalRecKind) 135 n += journalRecKindSz 136 // address 137 buf[n] = byte(addrJournalRecTag) 138 n += journalRecTagSz 139 copy(buf[n:], c.H[:]) 140 n += journalRecAddrSz 141 // payload 142 buf[n] = byte(payloadJournalRecTag) 143 n += journalRecTagSz 144 copy(buf[n:], c.FullCompressedChunk) 145 n += uint32(len(c.FullCompressedChunk)) 146 // checksum 147 writeUint32(buf[n:], crc(buf[:n])) 148 n += journalRecChecksumSz 149 d.PanicIfFalse(l == n) 150 return 151 } 152 153 func writeRootHashRecord(buf []byte, root hash.Hash) (n uint32) { 154 // length 155 l := rootHashRecordSize() 156 writeUint32(buf[:journalRecLenSz], uint32(l)) 157 n += journalRecLenSz 158 159 // kind 160 buf[n] = byte(kindJournalRecTag) 161 n += journalRecTagSz 162 buf[n] = byte(rootHashJournalRecKind) 163 n += journalRecKindSz 164 165 // timestamp 166 buf[n] = byte(timestampJournalRecTag) 167 n += journalRecTagSz 168 writeUint64(buf[n:], 
journalRecordTimestampGenerator()) 169 n += journalRecTimestampSz 170 171 // address 172 buf[n] = byte(addrJournalRecTag) 173 n += journalRecTagSz 174 copy(buf[n:], root[:]) 175 n += journalRecAddrSz 176 177 // empty payload 178 179 // checksum 180 writeUint32(buf[n:], crc(buf[:n])) 181 n += journalRecChecksumSz 182 return 183 } 184 185 func readJournalRecord(buf []byte) (rec journalRec, err error) { 186 rec.length = readUint32(buf) 187 buf = buf[journalRecLenSz:] 188 for len(buf) > journalRecChecksumSz { 189 tag := journalRecTag(buf[0]) 190 buf = buf[journalRecTagSz:] 191 switch tag { 192 case kindJournalRecTag: 193 rec.kind = journalRecKind(buf[0]) 194 buf = buf[journalRecKindSz:] 195 case addrJournalRecTag: 196 copy(rec.address[:], buf) 197 buf = buf[journalRecAddrSz:] 198 case timestampJournalRecTag: 199 unixSeconds := readUint64(buf) 200 rec.timestamp = time.Unix(int64(unixSeconds), 0) 201 buf = buf[journalRecTimestampSz:] 202 case payloadJournalRecTag: 203 sz := len(buf) - journalRecChecksumSz 204 rec.payload = buf[:sz] 205 buf = buf[sz:] 206 case unknownJournalRecTag: 207 fallthrough 208 default: 209 err = fmt.Errorf("unknown record field tag: %d", tag) 210 return 211 } 212 } 213 rec.checksum = readUint32(buf[:journalRecChecksumSz]) 214 return 215 } 216 217 func validateJournalRecord(buf []byte) bool { 218 if len(buf) < (journalRecLenSz + journalRecChecksumSz) { 219 return false 220 } 221 off := readUint32(buf) 222 if int(off) > len(buf) { 223 return false 224 } 225 off -= indexRecChecksumSz 226 return crc(buf[:off]) == readUint32(buf[off:]) 227 } 228 229 // processJournalRecords iterates over a chunk journal's records by reading from disk using |r|, starting at 230 // offset |off|, and calls the callback function |cb| with each journal record. The offset where reading was stopped 231 // is returned, or any error encountered along the way. 
232 func processJournalRecords(ctx context.Context, r io.ReadSeeker, off int64, cb func(o int64, r journalRec) error) (int64, error) { 233 var ( 234 buf []byte 235 err error 236 ) 237 238 // start processing records from |off| 239 if _, err = r.Seek(off, io.SeekStart); err != nil { 240 return 0, err 241 } 242 243 rdr := bufio.NewReaderSize(r, journalWriterBuffSize) 244 for { 245 // peek to read next record size 246 if buf, err = rdr.Peek(uint32Size); err != nil { 247 break 248 } 249 250 l := readUint32(buf) 251 if l > journalRecMaxSz { 252 break 253 } else if buf, err = rdr.Peek(int(l)); err != nil { 254 break 255 } 256 257 if !validateJournalRecord(buf) { 258 break // stop if we can't validate |rec| 259 } 260 261 var rec journalRec 262 if rec, err = readJournalRecord(buf); err != nil { 263 break // failed to read valid record 264 } 265 if err = cb(off, rec); err != nil { 266 break 267 } 268 269 // advance |rdr| state by |l| bytes 270 if _, err = io.ReadFull(rdr, buf); err != nil { 271 break 272 } 273 off += int64(len(buf)) 274 } 275 if err != nil && err != io.EOF { 276 return 0, err 277 } 278 // reset the file pointer to end of the last 279 // successfully processed journal record 280 if _, err = r.Seek(off, 0); err != nil { 281 return 0, err 282 } 283 return off, nil 284 } 285 286 func peekRootHashAt(journal io.ReaderAt, offset int64) (root hash.Hash, err error) { 287 expSz := rootHashRecordSize() 288 buf := make([]byte, expSz) // assumes len(rec) is exactly rootHashRecordSize 289 n, err := journal.ReadAt(buf, offset) 290 if errors.Is(err, io.EOF) { 291 err = nil // EOF is expected for last record 292 } else if err != nil { 293 return 294 } else if n != expSz { 295 err = fmt.Errorf("invalid root hash record at %d: %d", offset, n) 296 return 297 } 298 sz := readUint32(buf) 299 if sz > uint32(expSz) { 300 err = fmt.Errorf("invalid root hash record size at %d", offset) 301 return 302 } 303 buf = buf[:sz] 304 if !validateIndexRecord(buf) { 305 err = 
fmt.Errorf("failed to validate root hash record at %d", offset) 306 return 307 } 308 var rec journalRec 309 if rec, err = readJournalRecord(buf); err != nil { 310 return 311 } else if rec.kind != rootHashJournalRecKind { 312 err = fmt.Errorf("expected root hash record, got kind: %d", rec.kind) 313 return 314 } 315 return hash.Hash(rec.address), nil 316 } 317 318 func readUint32(buf []byte) uint32 { 319 return binary.BigEndian.Uint32(buf) 320 } 321 322 func writeUint32(buf []byte, u uint32) { 323 binary.BigEndian.PutUint32(buf, u) 324 } 325 326 func readUint64(buf []byte) uint64 { 327 return binary.BigEndian.Uint64(buf) 328 } 329 330 func writeUint64(buf []byte, u uint64) { 331 binary.BigEndian.PutUint64(buf, u) 332 }