github.com/coocood/badger@v1.5.1-0.20200528065104-c02ac3616d04/table/sstable/table.go

/*
 * Copyright 2017 Dgraph Labs, Inc. and Contributors
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package sstable

import (
	"fmt"
	"io"
	"math"
	"os"
	"path"
	"path/filepath"
	"strconv"
	"strings"
	"sync"
	"sync/atomic"
	"unsafe"

	"github.com/coocood/badger/cache"
	"github.com/coocood/badger/fileutil"
	"github.com/coocood/badger/options"
	"github.com/coocood/badger/surf"
	"github.com/coocood/badger/y"
	"github.com/coocood/bbloom"
	"github.com/pingcap/errors"
)

const (
	fileSuffix    = ".sst"
	idxFileSuffix = ".idx"

	intSize = int(unsafe.Sizeof(int(0)))
)

// IndexFilename returns the index file path for the given table file path.
func IndexFilename(tableFilename string) string { return tableFilename + idxFileSuffix }

type tableIndex struct {
	blockEndOffsets []uint32
	baseKeys        entrySlice
	bf              *bbloom.Bloom
	hIdx            *hashIndex
	surf            *surf.SuRF
}

// Table represents a loaded table file with the info we have about it.
type Table struct {
	sync.Mutex

	fd      *os.File // Own fd.
	indexFd *os.File

	globalTs          uint64
	tableSize         int64
	numBlocks         int
	smallest, biggest y.Key
	id                uint64

	blockCache *cache.Cache
	blocksMmap []byte

	indexCache *cache.Cache
	index      *tableIndex
	indexOnce  sync.Once
	indexMmap  []byte

	compacting int32

	compression options.CompressionType

	oldBlockLen int64
	oldBlock    []byte
}

// CompressionType returns the compression algorithm used for block compression.
func (t *Table) CompressionType() options.CompressionType {
	return t.compression
}

// Delete deletes the table's files (data and index) from disk.
func (t *Table) Delete() error {
	if t.blockCache != nil {
		for blk := 0; blk < t.numBlocks; blk++ {
			t.blockCache.Del(t.blockCacheKey(blk))
		}
	}
	if t.indexCache != nil {
		t.indexCache.Del(t.id)
	}
	if len(t.blocksMmap) != 0 {
		y.Munmap(t.blocksMmap)
	}
	t.index = nil
	if len(t.indexMmap) != 0 {
		y.Munmap(t.indexMmap)
	}
	if err := t.fd.Truncate(0); err != nil {
		// This is very important to let the FS know that the file is deleted.
		return err
	}
	filename := t.fd.Name()
	if err := t.fd.Close(); err != nil {
		return err
	}
	if err := os.Remove(filename); err != nil {
		return err
	}
	return os.Remove(filename + idxFileSuffix)
}
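// The on-disk layout pairs every table with an index sidecar: the data file is
// named from the table ID and the index file is the data file name plus
// idxFileSuffix. A minimal sketch of the naming (the directory is made up):
//
//	sst := NewFilename(0xab, "/data/badger") // "/data/badger/000000ab.sst"
//	idx := IndexFilename(sst)                // "/data/badger/000000ab.sst.idx"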
// OpenTable assumes file has only one table and opens it. Takes ownership of fd upon function
// entry. Returns a table with one reference count on it (decrementing which may delete the file!
// -- consider t.Close() instead). The fd has to be writeable because we call Truncate on it
// before deleting.
func OpenTable(filename string, blockCache *cache.Cache, indexCache *cache.Cache) (*Table, error) {
	id, ok := ParseFileID(filename)
	if !ok {
		return nil, errors.Errorf("Invalid filename: %s", filename)
	}

	// TODO: after we support cache of L2 storage, we will open block data file in cache manager.
	fd, err := y.OpenExistingFile(filename, 0)
	if err != nil {
		return nil, err
	}

	indexFd, err := y.OpenExistingFile(filename+idxFileSuffix, 0)
	if err != nil {
		return nil, err
	}

	t := &Table{
		fd:         fd,
		indexFd:    indexFd,
		id:         id,
		blockCache: blockCache,
		indexCache: indexCache,
	}

	if err := t.initTableInfo(); err != nil {
		t.Close()
		return nil, err
	}
	if blockCache == nil || t.oldBlockLen > 0 {
		t.blocksMmap, err = y.Mmap(fd, false, t.Size())
		if err != nil {
			t.Close()
			return nil, y.Wrapf(err, "Unable to map file")
		}
		t.oldBlock = t.blocksMmap[t.tableSize-t.oldBlockLen : t.tableSize]
	}
	return t, nil
}

// Close closes the open table. (Releases resources back to the OS.)
func (t *Table) Close() error {
	if t.fd != nil {
		t.fd.Close()
	}
	if t.indexFd != nil {
		if len(t.indexMmap) != 0 {
			y.Munmap(t.indexMmap)
		}
		t.indexFd.Close()
	}
	return nil
}

// NewIterator returns an iterator over the table.
func (t *Table) NewIterator(reversed bool) y.Iterator {
	return t.newIterator(reversed)
}

// Get looks up the key in the table. It first tries a point lookup through the hash index
// (or SuRF) and falls back to a regular seek when the point lookup cannot give a definite
// answer.
func (t *Table) Get(key y.Key, keyHash uint64) (y.ValueStruct, error) {
	resultKey, resultVs, ok, err := t.pointGet(key, keyHash)
	if err != nil {
		return y.ValueStruct{}, err
	}
	if !ok {
		it := t.NewIterator(false)
		it.Seek(key.UserKey)
		if !it.Valid() {
			return y.ValueStruct{}, nil
		}
		if !key.SameUserKey(it.Key()) {
			return y.ValueStruct{}, nil
		}
		resultKey, resultVs = it.Key(), it.Value()
	} else if resultKey.IsEmpty() {
		return y.ValueStruct{}, nil
	}
	result := resultVs
	result.Version = resultKey.Version
	return result, nil
}
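// A hedged usage sketch of Get (tbl, key and keyHash are assumed to exist; keyHash
// must be the same 64-bit hash of key.UserKey that the table builder fed into the
// bloom filter and hash index -- the exact hash function is not shown here):
//
//	vs, err := tbl.Get(key, keyHash)
//	if err != nil {
//		// I/O or decompression error while reading the table.
//	}
//	// Get returns the zero y.ValueStruct{} when the key is not present in this
//	// table; on a hit, vs.Version carries the entry's version.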
// pointGet tries to look up the key and its value via the table's hash index (or SuRF).
// If it hits a hash collision, the last return value is false, which means the caller
// should fall back to a seek-based search; otherwise it is true.
// If the index proves the key is absent, the returned key is empty.
func (t *Table) pointGet(key y.Key, keyHash uint64) (y.Key, y.ValueStruct, bool, error) {
	idx, err := t.getIndex()
	if err != nil {
		return y.Key{}, y.ValueStruct{}, false, err
	}
	if idx.bf != nil && !idx.bf.Has(keyHash) {
		return y.Key{}, y.ValueStruct{}, true, nil
	}

	blkIdx, offset := uint32(resultFallback), uint8(0)
	if idx.hIdx != nil {
		blkIdx, offset = idx.hIdx.lookup(keyHash)
	} else if idx.surf != nil {
		v, ok := idx.surf.Get(key.UserKey)
		if !ok {
			blkIdx = resultNoEntry
		} else {
			var pos entryPosition
			pos.decode(v)
			blkIdx, offset = uint32(pos.blockIdx), pos.offset
		}
	}
	if blkIdx == resultFallback {
		return y.Key{}, y.ValueStruct{}, false, nil
	}
	if blkIdx == resultNoEntry {
		return y.Key{}, y.ValueStruct{}, true, nil
	}

	it := t.newIterator(false)
	it.seekFromOffset(int(blkIdx), int(offset), key.UserKey)

	if !it.Valid() || !key.SameUserKey(it.Key()) {
		return y.Key{}, y.ValueStruct{}, true, it.Error()
	}
	if !y.SeekToVersion(it, key.Version) {
		return y.Key{}, y.ValueStruct{}, true, it.Error()
	}
	return it.Key(), it.Value(), true, nil
}

func (t *Table) read(off int, sz int) ([]byte, error) {
	if len(t.blocksMmap) > 0 {
		if len(t.blocksMmap[off:]) < sz {
			return nil, y.ErrEOF
		}
		return t.blocksMmap[off : off+sz], nil
	}
	res := make([]byte, sz)
	_, err := t.fd.ReadAt(res, int64(off))
	return res, err
}

func (t *Table) initTableInfo() error {
	d, err := t.loadIndexData(false)
	if err != nil {
		return err
	}

	t.compression = d.compression
	t.globalTs = d.globalTS

	for ; d.valid(); d.next() {
		switch d.currentId() {
		case idSmallest:
			if k := d.decode(); len(k) != 0 {
				t.smallest = y.KeyWithTs(y.Copy(k), math.MaxUint64)
			}
		case idBiggest:
			if k := d.decode(); len(k) != 0 {
				t.biggest = y.KeyWithTs(y.Copy(k), 0)
			}
		case idBlockEndOffsets:
			offsets := bytesToU32Slice(d.decode())
			t.tableSize = int64(offsets[len(offsets)-1])
			t.numBlocks = len(offsets)
		case idOldBlockLen:
			t.oldBlockLen = int64(bytesToU32(d.decode()))
			t.tableSize += t.oldBlockLen
		}
	}
	return nil
}

func (t *Table) readTableIndex(d *metaDecoder) *tableIndex {
	idx := new(tableIndex)
	for ; d.valid(); d.next() {
		switch d.currentId() {
		case idBaseKeysEndOffs:
			idx.baseKeys.endOffs = bytesToU32Slice(d.decode())
		case idBaseKeys:
			idx.baseKeys.data = d.decode()
		case idBlockEndOffsets:
			idx.blockEndOffsets = bytesToU32Slice(d.decode())
		case idBloomFilter:
			if d := d.decode(); len(d) != 0 {
				idx.bf = new(bbloom.Bloom)
				idx.bf.BinaryUnmarshal(d)
			}
		case idHashIndex:
			if d := d.decode(); len(d) != 0 {
				idx.hIdx = new(hashIndex)
				idx.hIdx.readIndex(d)
			}
		case idSuRFIndex:
			if d := d.decode(); len(d) != 0 {
				idx.surf = new(surf.SuRF)
				idx.surf.Unmarshal(d)
			}
		}
	}
	return idx
}

// getIndex returns the table's decoded index, loading it lazily: without an index cache it is
// decoded once from the mmapped index file; with a cache it is loaded through the cache and
// charged by the size of its raw encoded buffer.
func (t *Table) getIndex() (*tableIndex, error) {
	if t.indexCache == nil {
		var err error
		t.indexOnce.Do(func() {
			var d *metaDecoder
			d, err = t.loadIndexData(true)
			if err != nil {
				return
			}
			t.index = t.readTableIndex(d)
		})
		return t.index, err
	}

	index, err := t.indexCache.GetOrCompute(t.id, func() (interface{}, int64, error) {
		d, err := t.loadIndexData(false)
		if err != nil {
			return nil, 0, err
		}
		return t.readTableIndex(d), int64(len(d.buf)), nil
	})
	if err != nil {
		return nil, err
	}
	return index.(*tableIndex), nil
}
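// Both the index cache (above) and the block cache (below) use the same
// cache.Cache.GetOrCompute contract: the callback returns the value to cache,
// the size to charge it with, and an error, which is returned to the caller of
// GetOrCompute. A minimal sketch of that contract with a hypothetical loader:
//
//	v, err := c.GetOrCompute(key, func() (interface{}, int64, error) {
//		data, err := load() // hypothetical expensive load
//		if err != nil {
//			return nil, 0, err
//		}
//		return data, int64(len(data)), nil
//	})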
func (t *Table) loadIndexData(useMmap bool) (*metaDecoder, error) {
	fstat, err := t.indexFd.Stat()
	if err != nil {
		return nil, err
	}
	var idxData []byte

	if useMmap {
		idxData, err = y.Mmap(t.indexFd, false, fstat.Size())
		if err != nil {
			return nil, err
		}
		t.indexMmap = idxData
	} else {
		idxData = make([]byte, fstat.Size())
		if _, err = t.indexFd.ReadAt(idxData, 0); err != nil {
			return nil, err
		}
	}

	decoder, err := newMetaDecoder(idxData)
	if err != nil {
		return nil, err
	}
	if decoder.compression != options.None && useMmap {
		y.Munmap(idxData)
		t.indexMmap = nil
	}
	return decoder, nil
}

type block struct {
	offset  int
	data    []byte
	baseKey []byte
}

func (b *block) size() int64 {
	return int64(intSize + len(b.data))
}

func (t *Table) block(idx int, index *tableIndex) (block, error) {
	y.Assert(idx >= 0)

	if idx >= len(index.blockEndOffsets) {
		return block{}, io.EOF
	}

	if t.blockCache == nil {
		return t.loadBlock(idx, index)
	}

	key := t.blockCacheKey(idx)
	blk, err := t.blockCache.GetOrCompute(key, func() (interface{}, int64, error) {
		b, e := t.loadBlock(idx, index)
		if e != nil {
			return nil, 0, e
		}
		return b, int64(len(b.data)), nil
	})
	if err != nil {
		return block{}, err
	}
	return blk.(block), nil
}

func (t *Table) loadBlock(idx int, index *tableIndex) (block, error) {
	var startOffset int
	if idx > 0 {
		startOffset = int(index.blockEndOffsets[idx-1])
	}
	blk := block{
		offset: startOffset,
	}
	endOffset := int(index.blockEndOffsets[idx])
	dataLen := endOffset - startOffset
	var err error
	if blk.data, err = t.read(blk.offset, dataLen); err != nil {
		return block{}, errors.Wrapf(err,
			"failed to read from file: %s at offset: %d, len: %d", t.fd.Name(), blk.offset, dataLen)
	}

	blk.data, err = t.compression.Decompress(blk.data)
	if err != nil {
		return block{}, errors.Wrapf(err,
			"failed to decode compressed data in file: %s at offset: %d, len: %d",
			t.fd.Name(), blk.offset, dataLen)
	}
	blk.baseKey = index.baseKeys.getEntry(idx)
	return blk, nil
}

// HasGlobalTs reports whether the table has a global ts set.
func (t *Table) HasGlobalTs() bool {
	return t.globalTs != 0
}
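// Block boundaries are derived from the cumulative end offsets stored in the
// index: block i occupies the byte range [blockEndOffsets[i-1], blockEndOffsets[i])
// of the data file, with block 0 starting at offset 0. A minimal sketch of that
// arithmetic, mirroring loadBlock above:
//
//	func blockRange(endOffs []uint32, i int) (start, end int) {
//		if i > 0 {
//			start = int(endOffs[i-1])
//		}
//		return start, int(endOffs[i])
//	}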
// SetGlobalTs updates the global ts of externally ingested tables.
func (t *Table) SetGlobalTs(ts uint64) error {
	if _, err := t.indexFd.WriteAt(u64ToBytes(ts), 0); err != nil {
		return err
	}
	if err := fileutil.Fsync(t.indexFd); err != nil {
		return err
	}
	t.globalTs = ts
	return nil
}

// MarkCompacting marks or unmarks the table as being compacted.
func (t *Table) MarkCompacting(flag bool) {
	if flag {
		atomic.StoreInt32(&t.compacting, 1)
		return
	}
	atomic.StoreInt32(&t.compacting, 0)
}

func (t *Table) IsCompacting() bool {
	return atomic.LoadInt32(&t.compacting) == 1
}

func (t *Table) blockCacheKey(idx int) uint64 {
	y.Assert(t.ID() < math.MaxUint32)
	y.Assert(idx < math.MaxUint32)
	return (t.ID() << 32) | uint64(idx)
}

// Size is its file size in bytes.
func (t *Table) Size() int64 { return t.tableSize }

// Smallest is its smallest key, or nil if there are none.
func (t *Table) Smallest() y.Key { return t.smallest }

// Biggest is its biggest key, or nil if there are none.
func (t *Table) Biggest() y.Key { return t.biggest }

// Filename is NOT the file name. Just kidding, it is.
func (t *Table) Filename() string { return t.fd.Name() }

// ID is the table's ID number (used to make the file name).
func (t *Table) ID() uint64 { return t.id }

// HasOverlap reports whether the table's key range overlaps [start, end]
// ([start, end) when includeEnd is false).
func (t *Table) HasOverlap(start, end y.Key, includeEnd bool) bool {
	if start.Compare(t.Biggest()) > 0 {
		return false
	}

	if cmp := end.Compare(t.Smallest()); cmp < 0 {
		return false
	} else if cmp == 0 {
		return includeEnd
	}

	idx, err := t.getIndex()
	if err != nil {
		return true
	}

	if idx.surf != nil {
		return idx.surf.HasOverlap(start.UserKey, end.UserKey, includeEnd)
	}

	// If an error occurs during seeking, we assume the table overlaps the
	// range to prevent data loss.
	it := t.newIteratorWithIdx(false, idx)
	it.Seek(start.UserKey)
	if !it.Valid() {
		return it.Error() != nil
	}
	if cmp := it.Key().Compare(end); cmp > 0 {
		return false
	} else if cmp == 0 {
		return includeEnd
	}
	return true
}

// ParseFileID reads the file id out of a filename.
func ParseFileID(name string) (uint64, bool) {
	name = path.Base(name)
	if !strings.HasSuffix(name, fileSuffix) {
		return 0, false
	}
	// suffix := name[len(fileSuffix):]
	name = strings.TrimSuffix(name, fileSuffix)
	id, err := strconv.ParseUint(name, 16, 64)
	if err != nil {
		return 0, false
	}
	return id, true
}

// IDToFilename does the inverse of ParseFileID.
func IDToFilename(id uint64) string {
	return fmt.Sprintf("%08x", id) + fileSuffix
}

// NewFilename should be named TableFilepath -- it combines the dir with the ID to make a table
// filepath.
func NewFilename(id uint64, dir string) string {
	return filepath.Join(dir, IDToFilename(id))
}
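// ParseFileID and IDToFilename round-trip for well-formed names. A small
// sketch (the id value is made up):
//
//	name := IDToFilename(0x2a)          // "0000002a.sst"
//	id, ok := ParseFileID(name)         // id == 0x2a, ok == true
//	_, ok = ParseFileID("0000002a.idx") // ok == false: wrong suffix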