// Source: github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/storage/batch.go

// Copyright 2014 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

package storage

import (
	"encoding/binary"

	"github.com/cockroachdb/cockroach/pkg/storage/enginepb"
	"github.com/cockroachdb/cockroach/pkg/util/hlc"
	"github.com/cockroachdb/errors"
	"github.com/cockroachdb/pebble"
)

// BatchType represents the type of an entry in an encoded RocksDB batch.
type BatchType byte

// These constants come from rocksdb/db/dbformat.h. The commented-out entries
// are kept only to document the values that are not declared here.
const (
	BatchTypeDeletion BatchType = 0x0
	BatchTypeValue    BatchType = 0x1
	BatchTypeMerge    BatchType = 0x2
	BatchTypeLogData  BatchType = 0x3
	// BatchTypeColumnFamilyDeletion       BatchType = 0x4
	// BatchTypeColumnFamilyValue          BatchType = 0x5
	// BatchTypeColumnFamilyMerge          BatchType = 0x6
	BatchTypeSingleDeletion BatchType = 0x7
	// BatchTypeColumnFamilySingleDeletion BatchType = 0x8
	// BatchTypeBeginPrepareXID            BatchType = 0x9
	// BatchTypeEndPrepareXID              BatchType = 0xA
	// BatchTypeCommitXID                  BatchType = 0xB
	// BatchTypeRollbackXID                BatchType = 0xC
	// BatchTypeNoop                       BatchType = 0xD
	// BatchTypeColumnFamilyRangeDeletion  BatchType = 0xE
	BatchTypeRangeDeletion BatchType = 0xF
	// BatchTypeColumnFamilyBlobIndex      BatchType = 0x10
	// BatchTypeBlobIndex                  BatchType = 0x11
	// BatchMaxValue                       BatchType = 0x7F
)

const (
	// The batch header is composed of an 8-byte sequence number (all zeroes) and
	// 4-byte count of the number of entries in the batch. headerSize is the
	// total header length in bytes and countPos is the byte offset at which the
	// count begins (i.e. the count occupies bytes [countPos, headerSize)).
	headerSize       int = 12
	countPos             = 8
	initialBatchSize     = 1 << 10 // 1 KB
)

// RocksDBBatchBuilder is used to construct the RocksDB batch representation.
// From the RocksDB code, the representation of a batch is:
//
//   WriteBatch::rep_ :=
//      sequence: fixed64
//      count: fixed32
//      data: record[count]
//   record :=
//      kTypeValue varstring varstring
//      kTypeDeletion varstring
//      [...] (see BatchType)
//   varstring :=
//      len: varint32
//      data: uint8[len]
//
// The RocksDBBatchBuilder code currently only supports kTypeValue
// (BatchTypeValue), kTypeDeletion (BatchTypeDeletion), kTypeMerge
// (BatchTypeMerge), and kTypeSingleDeletion (BatchTypeSingleDeletion)
// operations. Before a batch is written to the RocksDB write-ahead-log,
// the sequence number is 0. The "fixed32" format is little endian.
//
// The keys encoded into the batch are MVCC keys: a string key with a timestamp
// suffix. MVCC keys are encoded as:
//
//   <key>[<wall_time>[<logical>]]<#timestamp-bytes>
//
// The <wall_time> and <logical> portions of the key are encoded as 64 and
// 32-bit big-endian integers. A custom RocksDB comparator is used to maintain
// the desired ordering as these keys do not sort lexicographically correctly.
// Note that the encoding of these keys needs to match up with the encoding in
// rocksdb/db.cc:EncodeKey().
type RocksDBBatchBuilder struct {
	// batch accumulates the operations added through the builder's methods;
	// its Repr is what Finish, Len and getRepr expose.
	batch pebble.Batch
	// logData is set to true by LogData and cleared by reset.
	logData bool
}

// reset clears the underlying batch and the logData flag so that the builder
// can be used to construct another batch.
func (b *RocksDBBatchBuilder) reset() {
	b.batch.Reset()
	b.logData = false
}

// Finish returns the constructed batch representation. After calling Finish,
// the builder may be used to construct another batch, but the returned []byte
// is only valid until the next builder method is called.
100 func (b *RocksDBBatchBuilder) Finish() []byte { 101 repr := b.batch.Repr() 102 b.reset() 103 104 return repr 105 } 106 107 // Len returns the number of bytes currently in the under construction repr. 108 func (b *RocksDBBatchBuilder) Len() int { 109 return len(b.batch.Repr()) 110 } 111 112 var _ = (*RocksDBBatchBuilder).Len 113 114 // getRepr constructs the batch representation and returns it. 115 func (b *RocksDBBatchBuilder) getRepr() []byte { 116 return b.batch.Repr() 117 } 118 119 // Put sets the given key to the value provided. 120 // 121 // It is safe to modify the contents of the arguments after Put returns. 122 func (b *RocksDBBatchBuilder) Put(key MVCCKey, value []byte) { 123 keyLen := key.Len() 124 deferredOp := b.batch.SetDeferred(keyLen, len(value)) 125 encodeKeyToBuf(deferredOp.Key, key, keyLen) 126 copy(deferredOp.Value, value) 127 // NB: the batch is not indexed, obviating the need to call 128 // deferredOp.Finish. 129 } 130 131 // Merge is a high-performance write operation used for values which are 132 // accumulated over several writes. Multiple values can be merged sequentially 133 // into a single key; a subsequent read will return a "merged" value which is 134 // computed from the original merged values. 135 // 136 // It is safe to modify the contents of the arguments after Merge returns. 137 func (b *RocksDBBatchBuilder) Merge(key MVCCKey, value []byte) { 138 keyLen := key.Len() 139 deferredOp := b.batch.MergeDeferred(keyLen, len(value)) 140 encodeKeyToBuf(deferredOp.Key, key, keyLen) 141 copy(deferredOp.Value, value) 142 // NB: the batch is not indexed, obviating the need to call 143 // deferredOp.Finish. 144 } 145 146 // Clear removes the item from the db with the given key. 147 // 148 // It is safe to modify the contents of the arguments after Clear returns. 
149 func (b *RocksDBBatchBuilder) Clear(key MVCCKey) { 150 keyLen := key.Len() 151 deferredOp := b.batch.DeleteDeferred(keyLen) 152 encodeKeyToBuf(deferredOp.Key, key, keyLen) 153 // NB: the batch is not indexed, obviating the need to call 154 // deferredOp.Finish. 155 } 156 157 // SingleClear removes the most recent item from the db with the given key. 158 // 159 // It is safe to modify the contents of the arguments after SingleClear returns. 160 func (b *RocksDBBatchBuilder) SingleClear(key MVCCKey) { 161 keyLen := key.Len() 162 deferredOp := b.batch.SingleDeleteDeferred(keyLen) 163 encodeKeyToBuf(deferredOp.Key, key, keyLen) 164 // NB: the batch is not indexed, obviating the need to call 165 // deferredOp.Finish. 166 } 167 168 // LogData adds a blob of log data to the batch. It will be written to the WAL, 169 // but otherwise uninterpreted by RocksDB. 170 // 171 // It is safe to modify the contents of the arguments after LogData returns. 172 func (b *RocksDBBatchBuilder) LogData(data []byte) { 173 _ = b.batch.LogData(data, nil) 174 b.logData = true 175 } 176 177 // ApplyRepr applies the mutations in repr to the current batch. 178 // 179 // It is safe to modify the contents of the arguments after ApplyRepr 180 // returns. 181 func (b *RocksDBBatchBuilder) ApplyRepr(repr []byte) error { 182 b2 := &pebble.Batch{} 183 if err := b2.SetRepr(repr); err != nil { 184 return err 185 } 186 187 return b.batch.Apply(b2, nil) 188 } 189 190 // Count returns the count of memtable-modifying operations in this batch. 191 func (b *RocksDBBatchBuilder) Count() uint32 { 192 return b.batch.Count() 193 } 194 195 // EncodeKey encodes an engine.MVCC key into the RocksDB representation. This 196 // encoding must match with the encoding in engine/db.cc:EncodeKey(). 
197 func EncodeKey(key MVCCKey) []byte { 198 keyLen := key.Len() 199 buf := make([]byte, keyLen) 200 encodeKeyToBuf(buf, key, keyLen) 201 return buf 202 } 203 204 // EncodeKeyToBuf encodes an engine.MVCC key into the RocksDB representation. 205 // This encoding must match with the encoding in engine/db.cc:EncodeKey(). 206 func EncodeKeyToBuf(buf []byte, key MVCCKey) []byte { 207 keyLen := key.Len() 208 if cap(buf) < keyLen { 209 buf = make([]byte, keyLen) 210 } else { 211 buf = buf[:keyLen] 212 } 213 encodeKeyToBuf(buf, key, keyLen) 214 return buf 215 } 216 217 func encodeKeyToBuf(buf []byte, key MVCCKey, keyLen int) { 218 const ( 219 timestampSentinelLen = 1 220 walltimeEncodedLen = 8 221 logicalEncodedLen = 4 222 ) 223 224 copy(buf, key.Key) 225 226 pos := len(key.Key) 227 timestampLength := keyLen - pos - 1 228 if timestampLength > 0 { 229 buf[pos] = 0 230 pos += timestampSentinelLen 231 binary.BigEndian.PutUint64(buf[pos:], uint64(key.Timestamp.WallTime)) 232 pos += walltimeEncodedLen 233 if key.Timestamp.Logical != 0 { 234 binary.BigEndian.PutUint32(buf[pos:], uint32(key.Timestamp.Logical)) 235 pos += logicalEncodedLen 236 } 237 } 238 buf[len(buf)-1] = byte(timestampLength) 239 } 240 241 func encodeTimestamp(ts hlc.Timestamp) []byte { 242 _, encodedTS, _ := enginepb.SplitMVCCKey(EncodeKey(MVCCKey{Timestamp: ts})) 243 return encodedTS 244 } 245 246 // DecodeMVCCKey decodes an engine.MVCCKey from its serialized representation. This 247 // decoding must match engine/db.cc:DecodeKey(). 248 func DecodeMVCCKey(encodedKey []byte) (MVCCKey, error) { 249 k, ts, err := enginepb.DecodeKey(encodedKey) 250 return MVCCKey{k, ts}, err 251 } 252 253 // Decode the header of RocksDB batch repr, returning both the count of the 254 // entries in the batch and the suffix of data remaining in the batch. 
255 func rocksDBBatchDecodeHeader(repr []byte) (count int, orepr pebble.BatchReader, err error) { 256 if len(repr) < headerSize { 257 return 0, nil, errors.Errorf("batch repr too small: %d < %d", len(repr), headerSize) 258 } 259 seq := binary.LittleEndian.Uint64(repr[:countPos]) 260 if seq != 0 { 261 return 0, nil, errors.Errorf("bad sequence: expected 0, but found %d", seq) 262 } 263 count = int(binary.LittleEndian.Uint32(repr[countPos:headerSize])) 264 return count, pebble.MakeBatchReader(repr), nil 265 } 266 267 // RocksDBBatchReader is used to iterate the entries in a RocksDB batch 268 // representation. 269 // 270 // Example: 271 // r, err := NewRocksDBBatchReader(...) 272 // if err != nil { 273 // return err 274 // } 275 // for r.Next() { 276 // switch r.BatchType() { 277 // case BatchTypeDeletion: 278 // fmt.Printf("delete(%x)", r.Key()) 279 // case BatchTypeValue: 280 // fmt.Printf("put(%x,%x)", r.Key(), r.Value()) 281 // case BatchTypeMerge: 282 // fmt.Printf("merge(%x,%x)", r.Key(), r.Value()) 283 // case BatchTypeSingleDeletion: 284 // fmt.Printf("single_delete(%x)", r.Key()) 285 // case BatchTypeRangeDeletion: 286 // fmt.Printf("delete_range(%x,%x)", r.Key(), r.Value()) 287 // } 288 // } 289 // if err := r.Error(); err != nil { 290 // return err 291 // } 292 type RocksDBBatchReader struct { 293 batchReader pebble.BatchReader 294 295 // The error encountered during iterator, if any 296 err error 297 298 // The total number of entries, decoded from the batch header 299 count int 300 301 // The following all represent the current entry and are updated by Next. 302 // `value` is not applicable for BatchTypeDeletion or BatchTypeSingleDeletion. 303 // `value` indicates the end key for BatchTypeRangeDeletion. 304 typ BatchType 305 key []byte 306 value []byte 307 } 308 309 // NewRocksDBBatchReader creates a RocksDBBatchReader from the given repr and 310 // verifies the header. 
311 func NewRocksDBBatchReader(repr []byte) (*RocksDBBatchReader, error) { 312 count, batchReader, err := rocksDBBatchDecodeHeader(repr) 313 if err != nil { 314 return nil, err 315 } 316 return &RocksDBBatchReader{batchReader: batchReader, count: count}, nil 317 } 318 319 // Count returns the declared number of entries in the batch. 320 func (r *RocksDBBatchReader) Count() int { 321 return r.count 322 } 323 324 // Error returns the error, if any, which the iterator encountered. 325 func (r *RocksDBBatchReader) Error() error { 326 return r.err 327 } 328 329 // BatchType returns the type of the current batch entry. 330 func (r *RocksDBBatchReader) BatchType() BatchType { 331 return r.typ 332 } 333 334 // Key returns the key of the current batch entry. 335 func (r *RocksDBBatchReader) Key() []byte { 336 return r.key 337 } 338 339 func decodeMVCCKey(k []byte) (MVCCKey, error) { 340 k, ts, err := enginepb.DecodeKey(k) 341 return MVCCKey{k, ts}, err 342 } 343 344 // MVCCKey returns the MVCC key of the current batch entry. 345 func (r *RocksDBBatchReader) MVCCKey() (MVCCKey, error) { 346 return decodeMVCCKey(r.Key()) 347 } 348 349 // Value returns the value of the current batch entry. Value panics if the 350 // BatchType is BatchTypeDeleted. 351 func (r *RocksDBBatchReader) Value() []byte { 352 if r.typ == BatchTypeDeletion || r.typ == BatchTypeSingleDeletion { 353 panic("cannot call Value on a deletion entry") 354 } 355 return r.value 356 } 357 358 // MVCCEndKey returns the MVCC end key of the current batch entry. 359 func (r *RocksDBBatchReader) MVCCEndKey() (MVCCKey, error) { 360 if r.typ != BatchTypeRangeDeletion { 361 panic("cannot only call Value on a range deletion entry") 362 } 363 return decodeMVCCKey(r.Value()) 364 } 365 366 // Next advances to the next entry in the batch, returning false when the batch 367 // is empty. 
368 func (r *RocksDBBatchReader) Next() bool { 369 kind, ukey, value, ok := r.batchReader.Next() 370 371 r.typ = BatchType(kind) 372 r.key = ukey 373 r.value = value 374 375 return ok 376 } 377 378 // RocksDBBatchCount provides an efficient way to get the count of mutations 379 // in a RocksDB Batch representation. 380 func RocksDBBatchCount(repr []byte) (int, error) { 381 if len(repr) < headerSize { 382 return 0, errors.Errorf("batch repr too small: %d < %d", len(repr), headerSize) 383 } 384 return int(binary.LittleEndian.Uint32(repr[countPos:headerSize])), nil 385 }