github.com/zuoyebang/bitalostable@v1.0.1-0.20240229032404-e3b99a834294/sstable/suffix_rewriter.go (about) 1 package sstable 2 3 import ( 4 "bytes" 5 "math" 6 "os" 7 "sync" 8 "time" 9 10 "github.com/cespare/xxhash/v2" 11 "github.com/cockroachdb/errors" 12 "github.com/zuoyebang/bitalostable/internal/base" 13 "github.com/zuoyebang/bitalostable/internal/rangekey" 14 ) 15 16 // RewriteKeySuffixes copies the content of the passed SSTable bytes to a new 17 // sstable, written to `out`, in which the suffix `from` is replaced with 18 // `to` in every key. The input sstable must consist of only Sets or RangeKeySets 19 // and every key must have `from` as its suffix as determined by the Split 20 // function of the Comparer in the passed WriterOptions. Range deletes must not 21 // exist in this sstable, as they will be ignored. 22 // 23 // Data blocks are rewritten in parallel by `concurrency` workers and then 24 // assembled into a final SST. Filters are copied from the original SST without 25 // modification as they are not affected by the suffix, while block and table 26 // properties are only minimally recomputed. 27 // 28 // Any block and table property collectors configured in the WriterOptions must 29 // implement SuffixReplaceableTableCollector/SuffixReplaceableBlockCollector.
30 func RewriteKeySuffixes( 31 sst []byte, 32 rOpts ReaderOptions, 33 out writeCloseSyncer, 34 o WriterOptions, 35 from, to []byte, 36 concurrency int, 37 ) (*WriterMetadata, error) { 38 r, err := NewMemReader(sst, rOpts) 39 if err != nil { 40 return nil, err 41 } 42 defer r.Close() 43 return rewriteKeySuffixesInBlocks(r, out, o, from, to, concurrency) 44 } 45 46 func rewriteKeySuffixesInBlocks( 47 r *Reader, out writeCloseSyncer, o WriterOptions, from, to []byte, concurrency int, 48 ) (*WriterMetadata, error) { 49 if o.Comparer == nil || o.Comparer.Split == nil { 50 return nil, errors.New("a valid splitter is required to define suffix to replace replace suffix") 51 } 52 if concurrency < 1 { 53 return nil, errors.New("concurrency must be >= 1") 54 } 55 56 w := NewWriter(out, o) 57 defer w.Close() 58 59 for _, c := range w.propCollectors { 60 if _, ok := c.(SuffixReplaceableTableCollector); !ok { 61 return nil, errors.Errorf("property collector %s does not support suffix replacement", c.Name()) 62 } 63 } 64 for _, c := range w.blockPropCollectors { 65 if _, ok := c.(SuffixReplaceableBlockCollector); !ok { 66 return nil, errors.Errorf("block property collector %s does not support suffix replacement", c.Name()) 67 } 68 } 69 70 l, err := r.Layout() 71 if err != nil { 72 return nil, errors.Wrap(err, "reading layout") 73 } 74 75 if err := rewriteDataBlocksToWriter(r, w, l.Data, from, to, w.split, concurrency); err != nil { 76 return nil, errors.Wrap(err, "rewriting data blocks") 77 } 78 79 // Copy over the range key block and replace suffixes in it if it exists. 80 if err := rewriteRangeKeyBlockToWriter(r, w, from, to); err != nil { 81 return nil, errors.Wrap(err, "rewriting range key blocks") 82 } 83 84 // Copy over the filter block if it exists (rewriteDataBlocksToWriter will 85 // already have ensured this is valid if it exists). 
86 if w.filter != nil && l.Filter.Length > 0 { 87 filterBlock, _, err := readBlockBuf(r, l.Filter, nil) 88 if err != nil { 89 return nil, errors.Wrap(err, "reading filter") 90 } 91 w.filter = copyFilterWriter{ 92 origPolicyName: w.filter.policyName(), origMetaName: w.filter.metaName(), data: filterBlock, 93 } 94 } 95 96 if err := w.Close(); err != nil { 97 return nil, err 98 } 99 100 return w.Metadata() 101 } 102 103 var errBadKind = errors.New("key does not have expected kind (set)") 104 105 type blockWithSpan struct { 106 start, end InternalKey 107 data []byte 108 } 109 110 func rewriteBlocks( 111 r *Reader, 112 restartInterval int, 113 checksumType ChecksumType, 114 compression Compression, 115 input []BlockHandleWithProperties, 116 output []blockWithSpan, 117 totalWorkers, worker int, 118 from, to []byte, 119 split Split, 120 ) error { 121 bw := blockWriter{ 122 restartInterval: restartInterval, 123 } 124 buf := blockBuf{checksummer: checksummer{checksumType: checksumType}} 125 if checksumType == ChecksumTypeXXHash { 126 buf.checksummer.xxHasher = xxhash.New() 127 } 128 129 var blockAlloc []byte 130 var keyAlloc []byte 131 var scratch InternalKey 132 133 var inputBlock, inputBlockBuf []byte 134 135 iter := &blockIter{} 136 137 // We'll assume all blocks are _roughly_ equal so round-robin static partition 138 // of each worker doing every ith block is probably enough. 
139 for i := worker; i < len(input); i += totalWorkers { 140 bh := input[i] 141 142 var err error 143 inputBlock, inputBlockBuf, err = readBlockBuf(r, bh.BlockHandle, inputBlockBuf) 144 if err != nil { 145 return err 146 } 147 if err := iter.init(r.Compare, inputBlock, r.Properties.GlobalSeqNum); err != nil { 148 return err 149 } 150 151 if cap(bw.restarts) < int(iter.restarts) { 152 bw.restarts = make([]uint32, 0, iter.restarts) 153 } 154 if cap(bw.buf) == 0 { 155 bw.buf = make([]byte, 0, len(inputBlock)) 156 } 157 if cap(bw.restarts) < int(iter.numRestarts) { 158 bw.restarts = make([]uint32, 0, iter.numRestarts) 159 } 160 161 for key, val := iter.First(); key != nil; key, val = iter.Next() { 162 if key.Kind() != InternalKeyKindSet { 163 return errBadKind 164 } 165 si := split(key.UserKey) 166 oldSuffix := key.UserKey[si:] 167 if !bytes.Equal(oldSuffix, from) { 168 err := errors.Errorf("key has suffix %q, expected %q", oldSuffix, from) 169 return err 170 } 171 newLen := si + len(to) 172 if cap(scratch.UserKey) < newLen { 173 scratch.UserKey = make([]byte, 0, len(key.UserKey)*2+len(to)-len(from)) 174 } 175 176 scratch.Trailer = key.Trailer 177 scratch.UserKey = scratch.UserKey[:newLen] 178 copy(scratch.UserKey, key.UserKey[:si]) 179 copy(scratch.UserKey[si:], to) 180 181 bw.add(scratch, val) 182 if output[i].start.UserKey == nil { 183 keyAlloc, output[i].start = cloneKeyWithBuf(scratch, keyAlloc) 184 } 185 } 186 *iter = iter.resetForReuse() 187 188 keyAlloc, output[i].end = cloneKeyWithBuf(scratch, keyAlloc) 189 190 finished := compressAndChecksum(bw.finish(), compression, &buf) 191 192 // copy our finished block into the output buffer. 
193 sz := len(finished) + blockTrailerLen 194 if cap(blockAlloc) < sz { 195 blockAlloc = make([]byte, sz*128) 196 } 197 output[i].data = blockAlloc[:sz:sz] 198 blockAlloc = blockAlloc[sz:] 199 copy(output[i].data, finished) 200 copy(output[i].data[len(finished):], buf.tmp[:blockTrailerLen]) 201 } 202 return nil 203 } 204 205 func rewriteDataBlocksToWriter( 206 r *Reader, 207 w *Writer, 208 data []BlockHandleWithProperties, 209 from, to []byte, 210 split Split, 211 concurrency int, 212 ) error { 213 if r.Properties.NumEntries == 0 { 214 // No point keys. 215 return nil 216 } 217 blocks := make([]blockWithSpan, len(data)) 218 219 if w.filter != nil { 220 if r.Properties.FilterPolicyName != w.filter.policyName() { 221 return errors.New("mismatched filters") 222 } 223 if was, is := r.Properties.ComparerName, w.props.ComparerName; was != is { 224 return errors.Errorf("mismatched Comparer %s vs %s, replacement requires same splitter to copy filters", was, is) 225 } 226 } 227 228 g := &sync.WaitGroup{} 229 g.Add(concurrency) 230 errCh := make(chan error, concurrency) 231 for i := 0; i < concurrency; i++ { 232 worker := i 233 go func() { 234 defer g.Done() 235 err := rewriteBlocks( 236 r, 237 w.dataBlockBuf.dataBlock.restartInterval, 238 w.blockBuf.checksummer.checksumType, 239 w.compression, 240 data, 241 blocks, 242 concurrency, 243 worker, 244 from, to, 245 split, 246 ) 247 if err != nil { 248 errCh <- err 249 } 250 }() 251 } 252 g.Wait() 253 close(errCh) 254 if err, ok := <-errCh; ok { 255 return err 256 } 257 258 for _, p := range w.propCollectors { 259 if err := p.(SuffixReplaceableTableCollector).UpdateKeySuffixes(r.Properties.UserProperties, from, to); err != nil { 260 return err 261 } 262 } 263 264 var decoder blockPropertiesDecoder 265 var oldShortIDs []shortID 266 var oldProps [][]byte 267 if len(w.blockPropCollectors) > 0 { 268 oldProps = make([][]byte, len(w.blockPropCollectors)) 269 oldShortIDs = make([]shortID, math.MaxUint8) 270 for i, p := range 
w.blockPropCollectors { 271 if prop, ok := r.Properties.UserProperties[p.Name()]; ok { 272 was, is := shortID(byte(prop[0])), shortID(i) 273 oldShortIDs[was] = is 274 } 275 } 276 } 277 278 for i := range blocks { 279 // Write the rewritten block to the file. 280 n, err := w.writer.Write(blocks[i].data) 281 if err != nil { 282 return err 283 } 284 285 bh := BlockHandle{Offset: w.meta.Size, Length: uint64(n) - blockTrailerLen} 286 // Update the overall size. 287 w.meta.Size += uint64(n) 288 289 // Load any previous values for our prop collectors into oldProps. 290 for i := range oldProps { 291 oldProps[i] = nil 292 } 293 decoder.props = data[i].Props 294 for !decoder.done() { 295 id, val, err := decoder.next() 296 if err != nil { 297 return err 298 } 299 oldProps[oldShortIDs[id]] = val 300 } 301 302 for i, p := range w.blockPropCollectors { 303 if err := p.(SuffixReplaceableBlockCollector).UpdateKeySuffixes(oldProps[i], from, to); err != nil { 304 return err 305 } 306 } 307 308 var bhp BlockHandleWithProperties 309 if bhp, err = w.maybeAddBlockPropertiesToBlockHandle(bh); err != nil { 310 return err 311 } 312 var nextKey InternalKey 313 if i+1 < len(blocks) { 314 nextKey = blocks[i+1].start 315 } 316 if err = w.addIndexEntrySync(blocks[i].end, nextKey, bhp, w.dataBlockBuf.tmp[:]); err != nil { 317 return err 318 } 319 } 320 321 w.meta.updateSeqNum(blocks[0].start.SeqNum()) 322 w.props.NumEntries = r.Properties.NumEntries 323 w.props.RawKeySize = r.Properties.RawKeySize 324 w.props.RawValueSize = r.Properties.RawValueSize 325 w.meta.SetSmallestPointKey(blocks[0].start) 326 w.meta.SetLargestPointKey(blocks[len(blocks)-1].end) 327 return nil 328 } 329 330 func rewriteRangeKeyBlockToWriter(r *Reader, w *Writer, from, to []byte) error { 331 iter, err := r.NewRawRangeKeyIter() 332 if err != nil { 333 return err 334 } 335 if iter == nil { 336 // No range keys. 
337 return nil 338 } 339 defer iter.Close() 340 341 for s := iter.First(); s != nil; s = iter.Next() { 342 if !s.Valid() { 343 break 344 } 345 for i := range s.Keys { 346 if s.Keys[i].Kind() != base.InternalKeyKindRangeKeySet { 347 return errBadKind 348 } 349 if !bytes.Equal(s.Keys[i].Suffix, from) { 350 return errors.Errorf("key has suffix %q, expected %q", s.Keys[i].Suffix, from) 351 } 352 s.Keys[i].Suffix = to 353 } 354 355 err := rangekey.Encode(s, func(k base.InternalKey, v []byte) error { 356 // Calling AddRangeKey instead of addRangeKeySpan bypasses the fragmenter. 357 // This is okay because the raw fragments off of `iter` are already 358 // fragmented, and suffix replacement should not affect fragmentation. 359 return w.AddRangeKey(k, v) 360 }) 361 if err != nil { 362 return err 363 } 364 } 365 366 return nil 367 } 368 369 type copyFilterWriter struct { 370 origMetaName string 371 origPolicyName string 372 data []byte 373 } 374 375 func (copyFilterWriter) addKey(key []byte) { panic("unimplemented") } 376 func (c copyFilterWriter) finish() ([]byte, error) { return c.data, nil } 377 func (c copyFilterWriter) metaName() string { return c.origMetaName } 378 func (c copyFilterWriter) policyName() string { return c.origPolicyName } 379 380 // RewriteKeySuffixesViaWriter is similar to RewriteKeySuffixes but uses just a 381 // single loop over the Reader that writes each key to the Writer with the new 382 // suffix. 
The is significantly slower than the parallelized rewriter, and does 383 // more work to rederive filters, props, etc, however re-doing that work makes 384 // it less restrictive -- props no longer need to 385 func RewriteKeySuffixesViaWriter( 386 r *Reader, out writeCloseSyncer, o WriterOptions, from, to []byte, 387 ) (*WriterMetadata, error) { 388 if o.Comparer == nil || o.Comparer.Split == nil { 389 return nil, errors.New("a valid splitter is required to define suffix to replace replace suffix") 390 } 391 392 w := NewWriter(out, o) 393 i, err := r.NewIter(nil, nil) 394 if err != nil { 395 return nil, err 396 } 397 defer i.Close() 398 399 k, v := i.First() 400 var scratch InternalKey 401 for k != nil { 402 if k.Kind() != InternalKeyKindSet { 403 return nil, errors.New("invalid key type") 404 } 405 oldSuffix := k.UserKey[r.Split(k.UserKey):] 406 if !bytes.Equal(oldSuffix, from) { 407 return nil, errors.Errorf("key has suffix %q, expected %q", oldSuffix, from) 408 } 409 scratch.UserKey = append(scratch.UserKey[:0], k.UserKey[:len(k.UserKey)-len(from)]...) 410 scratch.UserKey = append(scratch.UserKey, to...) 411 scratch.Trailer = k.Trailer 412 413 if w.addPoint(scratch, v); err != nil { 414 return nil, err 415 } 416 k, v = i.Next() 417 } 418 if err := rewriteRangeKeyBlockToWriter(r, w, from, to); err != nil { 419 return nil, err 420 } 421 if err := w.Close(); err != nil { 422 return nil, err 423 } 424 return &w.meta, nil 425 } 426 427 // NewMemReader opens a reader over the SST stored in the passed []byte. 
428 func NewMemReader(sst []byte, o ReaderOptions) (*Reader, error) { 429 return NewReader(memReader{sst, bytes.NewReader(sst), sizeOnlyStat(int64(len(sst)))}, o) 430 } 431 432 func readBlockBuf(r *Reader, bh BlockHandle, buf []byte) ([]byte, []byte, error) { 433 raw := r.file.(memReader).b[bh.Offset : bh.Offset+bh.Length+blockTrailerLen] 434 if err := checkChecksum(r.checksumType, raw, bh, 0); err != nil { 435 return nil, buf, err 436 } 437 typ := blockType(raw[bh.Length]) 438 raw = raw[:bh.Length] 439 if typ == noCompressionBlockType { 440 return raw, buf, nil 441 } 442 decompressedLen, prefix, err := decompressedLen(typ, raw) 443 if err != nil { 444 return nil, buf, err 445 } 446 if cap(buf) < decompressedLen { 447 buf = make([]byte, decompressedLen) 448 } 449 res, err := decompressInto(typ, raw[prefix:], buf[:decompressedLen]) 450 return res, buf, err 451 } 452 453 // memReader is a thin wrapper around a []byte such that it can be passed to an 454 // sstable.Reader. It supports concurrent use, and does so without locking in 455 // contrast to the heavier read/write vfs.MemFile. 456 type memReader struct { 457 b []byte 458 r *bytes.Reader 459 s sizeOnlyStat 460 } 461 462 var _ ReadableFile = memReader{} 463 464 // ReadAt implements io.ReaderAt. 465 func (m memReader) ReadAt(p []byte, off int64) (n int, err error) { return m.r.ReadAt(p, off) } 466 467 // Close implements io.Closer. 468 func (memReader) Close() error { return nil } 469 470 // Stat implements ReadableFile. 
471 func (m memReader) Stat() (os.FileInfo, error) { return m.s, nil } 472 473 type sizeOnlyStat int64 474 475 func (s sizeOnlyStat) Size() int64 { return int64(s) } 476 func (sizeOnlyStat) IsDir() bool { panic(errors.AssertionFailedf("unimplemented")) } 477 func (sizeOnlyStat) ModTime() time.Time { panic(errors.AssertionFailedf("unimplemented")) } 478 func (sizeOnlyStat) Mode() os.FileMode { panic(errors.AssertionFailedf("unimplemented")) } 479 func (sizeOnlyStat) Name() string { panic(errors.AssertionFailedf("unimplemented")) } 480 func (sizeOnlyStat) Sys() interface{} { panic(errors.AssertionFailedf("unimplemented")) }