// Copyright 2019 Michael J. Fromberger. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package file

import (
	"context"
	"errors"
	"io"

	"github.com/creachadair/ffs/blob"
	"github.com/creachadair/ffs/block"
	"github.com/creachadair/ffs/file/wiretype"
	"github.com/creachadair/mds/mbits"
)

// A data value represents an ordered sequence of bytes stored in a blob.Store.
// Other than length, no metadata are preserved. File data are recorded as a
// flat array of discontiguous extents.
type fileData struct {
	sc         *block.SplitConfig // how content is split into blocks
	totalBytes int64              // total logical size, including unstored (zero) ranges
	extents    []*extent          // stored extents, in offset order

	// Cache of last successfully-read block. This helps avoid reloading the
	// same block repeatedly during incremental reads.
	lastKey  string
	lastData []byte
}

// getBlock returns the contents of the block stored under key, consulting the
// single-entry cache first and updating it after a successful fetch.
func (d *fileData) getBlock(ctx context.Context, s blob.CAS, key string) ([]byte, error) {
	if key == d.lastKey {
		return d.lastData, nil
	}
	data, err := s.Get(ctx, key)
	if err == nil {
		// Cache only successful reads, so a transient failure does not poison
		// subsequent lookups of the same key.
		d.lastKey = key
		d.lastData = data
	}
	return data, err
}

// isSingleBlock reports whether d can be represented as a single-block node.
55 func (d *fileData) isSingleBlock() bool { 56 return len(d.extents) == 1 && d.extents[0].base == 0 && // one extent starting at offset 0 57 len(d.extents[0].blocks) == 1 && // it contains exactly one block 58 d.extents[0].blocks[0].bytes == d.totalBytes // that block is the entire content 59 } 60 61 // toWireType converts d to wire encoding. 62 func (d *fileData) toWireType() *wiretype.Index { 63 if d.totalBytes == 0 && len(d.extents) == 0 { 64 // No data in this file. 65 return nil 66 } 67 68 // Many small files contain just one block of data spanning the entire file. 69 // When that occurs, just store the key of that block. No normalization is 70 // required in this case and we save a few bytes. 71 if d.isSingleBlock() { 72 return &wiretype.Index{ 73 TotalBytes: uint64(d.totalBytes), 74 Single: []byte(d.extents[0].blocks[0].key), 75 } 76 } 77 78 // At this point we have multiple blocks and/or a weird shape (e.g., sparse 79 // extents), so we actually have to do some work to pack and normalize them. 80 w := &wiretype.Index{ 81 TotalBytes: uint64(d.totalBytes), 82 Extents: make([]*wiretype.Extent, len(d.extents)), 83 } 84 for i, ext := range d.extents { 85 x := &wiretype.Extent{ 86 Base: uint64(ext.base), 87 Bytes: uint64(ext.bytes), 88 Blocks: make([]*wiretype.Block, len(ext.blocks)), 89 } 90 for j, blk := range ext.blocks { 91 x.Blocks[j] = &wiretype.Block{ 92 Bytes: uint64(blk.bytes), 93 Key: []byte(blk.key), 94 } 95 } 96 w.Extents[i] = x 97 } 98 w.Normalize() 99 return w 100 } 101 102 // fromWireType replaces the contents of d from the wire encoding pb. 
func (d *fileData) fromWireType(pb *wiretype.Index) error {
	// A nil index denotes an empty file; leave d in its zero state.
	if pb == nil {
		return nil
	}

	d.totalBytes = int64(pb.TotalBytes)
	if len(pb.Single) != 0 {
		// Single-block form: one block spanning the entire file. It is
		// mutually exclusive with the extent list.
		if len(pb.Extents) != 0 {
			return errors.New("invalid index: single-block and extents both set")
		}
		d.extents = []*extent{{
			base:   0,
			bytes:  d.totalBytes,
			blocks: []cblock{{key: string(pb.Single), bytes: d.totalBytes}},
		}}
		return nil
	}

	// General form: normalize first, then copy the extents and their blocks
	// into the in-memory representation.
	pb.Normalize()
	d.extents = make([]*extent, len(pb.Extents))
	for i, ext := range pb.Extents {
		d.extents[i] = &extent{
			base:   int64(ext.Base),
			bytes:  int64(ext.Bytes),
			blocks: make([]cblock, len(ext.Blocks)),
		}
		for j, blk := range ext.Blocks {
			d.extents[i].blocks[j] = cblock{
				bytes: int64(blk.Bytes),
				key:   string(blk.Key),
			}
		}
	}
	return nil
}

// size reports the size of the data in bytes.
func (d *fileData) size() int64 { return d.totalBytes }

// blocks calls f once for each block used by d, giving the size in bytes and
// the storage key of the blob, in that order. If the same blob is repeated, f
// will be called multiple times for the same key.
func (d *fileData) blocks(f func(int64, string)) {
	for _, ext := range d.extents {
		for _, blk := range ext.blocks {
			f(blk.bytes, blk.key)
		}
	}
}

// truncate modifies the length of the file to end at offset, extending or
// contracting it as necessary. Contraction may require splitting a block.
func (d *fileData) truncate(ctx context.Context, s blob.CAS, offset int64) error {
	// Extension requires no block edits: the added range is unstored and
	// reads back as zeroes.
	if offset >= d.totalBytes {
		d.totalBytes = offset
		return nil
	}
	pre, span, _ := d.splitSpan(0, offset)
	if len(span) != 0 {
		n := len(span) - 1
		last := span[n]
		span = span[:n]

		// If the offset transects a block, read that block and write back its
		// prefix. If the offset is exactly at the start of the block, we can
		// skip that step and discard the whole block.
		//
		// NOTE(review): when the condition below is false, all of last is
		// dropped, including any blocks that precede offset. That is only
		// correct if offset can never land exactly on an interior block
		// boundary of last (or past its end) on this path — confirm against
		// the splitSpan/splitter invariants.
		if i, pos := last.findBlock(offset); i >= 0 && offset > pos {
			keep := last.blocks[:i]
			bits, err := s.Get(ctx, last.blocks[i].key)
			if err != nil {
				return err
			}
			// Re-split the retained prefix of the transected block so block
			// sizes remain within the splitter's bounds.
			blks, err := d.splitBlobs(ctx, s, bits[:int(offset-pos)])
			if err != nil {
				return err
			}
			span = append(span, splitExtent(&extent{
				base:   last.base,
				bytes:  offset - last.base,
				blocks: append(keep, blks...),
			})...)
		}
	}
	d.extents = append(pre, span...)
	d.totalBytes = offset
	return nil
}

// writeAt writes the contents of data at the specified offset in d. It
// returns the number of bytes successfully written, and satisfies the
// semantics of io.WriterAt.
func (d *fileData) writeAt(ctx context.Context, s blob.CAS, data []byte, offset int64) (int, error) {
	if len(data) == 0 {
		return 0, nil
	}
	end := offset + int64(len(data))
	pre, span, post := d.splitSpan(offset, end)

	var left, right []cblock // blocks of partly-overlapped extents kept verbatim
	var parts [][]byte       // byte segments to be re-split into new blocks
	newBase := offset        // base offset of the merged extent being assembled
	newEnd := end            // end offset of the merged extent being assembled

	// If this write does not span any existing extents, create a new one
	// containing just this write.
	if len(span) == 0 {
		parts = append(parts, data)
	} else {
		if span[0].base < newBase {
			// The first extent starts before the write. Find the first block
			// split by or contiguous to the write, preserve everything before
			// that, and read in the contents to set up the split.
			newBase = span[0].base

			pos := span[0].base
			for _, blk := range span[0].blocks {
				next := pos + blk.bytes
				if next < offset {
					// This block ends before the write; keep it unchanged.
					left = append(left, blk)
					pos = next
					continue
				}

				// This block is split by (or abuts) the write; its prefix up
				// to offset becomes part of the data to re-split.
				bits, err := s.Get(ctx, blk.key)
				if err != nil {
					return 0, err
				}
				parts = append(parts, bits[:int(offset-pos)])
				break
			}
		}

		// Insert the main body of the write.
		parts = append(parts, data)

		if last := span[len(span)-1]; last.base+last.bytes >= newEnd {
			// The last extent ends after the write. Find the last block split by
			// or contiguous to the write, preserve everything after that, and
			// read in the contents to set up the split.
			newEnd = last.base + last.bytes

			pos := last.base
			for i, blk := range last.blocks {
				if pos > end {
					// Preserve the rest of this extent unchanged.
					right = append(right, last.blocks[i:]...)
					break
				}
				next := pos + blk.bytes
				if next <= end {
					pos = next
					continue // skip overwritten block
				}

				// This block is split by the write; its suffix past end
				// becomes part of the data to re-split.
				bits, err := s.Get(ctx, blk.key)
				if err != nil {
					return 0, err
				}

				parts = append(parts, bits[int(end-pos):])
				pos = next
			}
		}
	}

	// Now write out the combined data and assemble the new index.
	body, err := d.splitBlobs(ctx, s, parts...)
	if err != nil {
		return 0, err
	}

	// N.B. It is possible that this write has created contiguous extents.
	// Rather than fix it here, we rely on the normalization that happens during
	// conversion to wire format, which includes this merge check.

	d.extents = make([]*extent, 0, len(pre)+1+len(post))
	//
	// d.extents = [ ...pre... | ...merged ... | ...post... ]
	//
	d.extents = append(d.extents, pre...)
	d.extents = append(d.extents, splitExtent(&extent{
		base:   newBase,
		bytes:  newEnd - newBase,
		blocks: append(left, append(body, right...)...),
	})...)
	d.extents = append(d.extents, post...)
	if end > d.totalBytes {
		d.totalBytes = end
	}

	return len(data), nil
}

// readAt reads the content of d into data from the specified offset, returning
// the number of bytes successfully read. It satisfies the semantics of the
// io.ReaderAt interface.
func (d *fileData) readAt(ctx context.Context, s blob.CAS, data []byte, offset int64) (int, error) {
	if offset > d.totalBytes {
		return 0, io.EOF
	}
	// Clamp the requested range to the logical end of the file.
	end := offset + int64(len(data))
	if end > d.totalBytes {
		end = d.totalBytes
	}
	_, span, _ := d.splitSpan(offset, end)

	// If the entire requested range is unstored, zero as much as we can
	// attribute given the total file size. Note that io.ReaderAt requires we
	// report an error if the total is less than requested.
	if len(span) == 0 {
		nr := mbits.Zero(data[:int(end-offset)])
		if nr < len(data) {
			return nr, io.EOF
		}
		return nr, nil
	}

	// At this point, at least some of the data overlap a stored range. Walk
	// through the extents copying data into the output till we have enough or
	// we run out of space.
	nr := 0
walkSpan:
	for _, ext := range span {
		// This extent starts after the current offset: zero-fill up to the
		// beginning of the extent, or until we run out of space.
		if offset < ext.base {
			cp := min(int(ext.base-offset), len(data)-nr)
			nr += mbits.Zero(data[nr : nr+cp])
			if nr == len(data) {
				break walkSpan
			}
			offset += int64(cp)
		}

		// The output is not full, and offset at or past the start of this extent.
		// Find the first block containing offset and walk forward.
		i, base := ext.findBlock(offset)
		if i < 0 {
			continue
		}
		for _, blk := range ext.blocks[i:] {
			if base > end {
				break walkSpan
			}

			// Fetch the block contents and copy whatever we can.
			bits, err := d.getBlock(ctx, s, blk.key)
			if err != nil {
				return 0, err
			}

			pos := int(offset - base)
			cp := min(len(bits)-pos, len(data)-nr)
			nr += copy(data[nr:], bits[pos:pos+cp])
			if nr == len(data) {
				break walkSpan
			}
			offset += int64(cp)
			base += blk.bytes
		}

		// Reaching here, data is not yet full and we have not yet gone past the
		// end of the requested range. Go back for another extent, if there is one.
	}

	// At this point we have all the stored data we can take. If there is still
	// space in the output, the remaining portions of the range are unstored.
	if nr < len(data) && end > offset {
		cp := int(end - offset)
		if max := len(data) - nr; cp > max {
			cp = max
		}
		nr += mbits.Zero(data[nr : nr+cp])
	}

	if nr < len(data) {
		return nr, io.EOF
	}
	return nr, nil
}

// splitBlobs re-blocks the concatenation of the specified blobs and returns
// the resulting blocks. Zero-valued blocks are not stored; the caller can
// detect this by looking for a key of "".
func (d *fileData) splitBlobs(ctx context.Context, s blob.CAS, blobs ...[]byte) ([]cblock, error) {
	data := newBlockReader(blobs)

	var blks []cblock
	if err := block.NewSplitter(data, d.sc).Split(func(blk []byte) error {
		// We do not store blocks of zeroes. They count against the total file
		// size, but we do not explicitly record them.
		zhead, ztail, n := zeroCheck(blk)
		if zhead == n {
			// This block is all zeroes.
			blks = append(blks, cblock{bytes: int64(len(blk))})
			return nil
		}

		if isWorthTrimming(zhead, n) {
			// There is a tranche of zeroes at the head. Inject a "fake" zero block
			// for the prefix, and remove it from the block to be stored.
			blks = append(blks, cblock{bytes: int64(zhead)})
			blk = blk[zhead:]
		}
		wantTail := isWorthTrimming(ztail, n)
		if wantTail {
			// There is a block of zeroes at the tail. Remove the suffix from the
			// block to be stored, and store a fake block for the suffix after it.
			blk = blk[:len(blk)-ztail]
		}

		// Store the (possibly trimmed) block content-addressably.
		key, err := s.CASPut(ctx, blk)
		if err != nil {
			return err
		}
		blks = append(blks, cblock{bytes: int64(len(blk)), key: key})

		if wantTail {
			// Inject a "fake" zero block for the suffix.
			blks = append(blks, cblock{bytes: int64(ztail)})
		}
		return nil
	}); err != nil {
		return nil, err
	}
	return blks, nil
}

// splitSpan returns three subslices of the extents of d: those which end
// entirely before offset lo, those which overlap or touch the range from lo
// to hi, and those which begin entirely after offset hi.
//
// If span is empty, the range fully spans unstored data. Otherwise, the first
// and last elements of span may be "split" by the range.
func (d *fileData) splitSpan(lo, hi int64) (pre, span, post []*extent) {
	for i, ext := range d.extents {
		if lo > ext.base+ext.bytes {
			pre = append(pre, ext)
		} else if hi < ext.base {
			post = append(post, d.extents[i:]...)
			break // nothing more to do; everything else is bigger
		} else {
			span = append(span, ext)
		}
	}

	return
}

// newFileData constructs a new fileData value containing exactly the data from
// s. For each data block, newFileData calls put to store the block and return
// its key. An error from put stops construction and is reported to the caller.
func newFileData(s *block.Splitter, put func([]byte) (string, error)) (fileData, error) {
	fd := fileData{sc: s.Config()}

	// push closes out the current extent (if it has any blocks) and starts a
	// fresh one based at the current total length.
	ext := new(extent)
	push := func() {
		if len(ext.blocks) != 0 {
			fd.extents = append(fd.extents, ext)
		}
		ext = &extent{base: fd.totalBytes}
	}

	err := s.Split(func(data []byte) error {
		dlen := int64(len(data))

		zhead, ztail, n := zeroCheck(data)
		// A block of zeroes ends the current extent. We count the block against
		// the total file size, but do not explicitly store it.
		if zhead == n {
			// N.B. We have to update the total length first, so that push will
			// see the correct new value for the next extent.
			fd.totalBytes += dlen
			push()
			return nil
		}

		// If a block has a lot of zeroes at its head or tail, chop them. We
		// define "a lot" as a fraction of the block size.
		//
		// NOTE(review): this path uses zhead*zhead >= n directly, without the
		// nz >= 13 clause that isWorthTrimming applies on the rewrite path
		// (splitBlobs) — confirm the asymmetry is intentional.
		if zhead*zhead >= n {
			fd.totalBytes += int64(zhead)
			push()
			data = data[zhead:]
			dlen = int64(len(data))
		}

		// Update the total length regardless whether we have trailing zeroes to
		// remove from the block. Do this BEFORE adjusting the block.
		fd.totalBytes += dlen
		if ztail*ztail >= n {
			data = data[:len(data)-ztail]
			dlen = int64(len(data))
			defer push() // start a new extent after this block
		}
		ext.bytes += dlen

		key, err := put(data)
		if err != nil {
			return err
		}
		ext.blocks = append(ext.blocks, cblock{
			bytes: dlen,
			key:   key,
		})

		return nil
	})
	if err != nil {
		return fileData{}, err
	}
	push() // flush any trailing extent

	return fd, nil
}

// An extent represents a single contiguous stored subrange of a file. The
// blocks record the offsets and block storage keys for the extent.
type extent struct {
	base   int64    // offset of the first byte within the file
	bytes  int64    // number of bytes in the extent
	blocks []cblock // contiguous extent blocks
	starts []int64  // block starting offsets, for search
}

// findBlock returns the index and base offset of the first block in e that
// contains offset. It returns -1, -1 if no block in e contains offset.
func (e *extent) findBlock(offset int64) (int, int64) {
	// The starts cache is invalidated (by length mismatch) whenever the block
	// list changes; rebuild it with one linear pass before searching.
	if len(e.starts) != len(e.blocks) {
		e.starts = make([]int64, len(e.blocks))
		next := e.base
		for i := range e.blocks {
			e.starts[i] = next
			next += e.blocks[i].bytes
		}
	}

	// Binary search for the block whose half-open range
	// [start, start+bytes) covers offset.
	i, j := 0, len(e.starts)
	for i < j {
		m := i + (j-i)/2
		start := e.starts[m]
		switch {
		case offset < start:
			j = m
		case offset >= start+e.blocks[m].bytes:
			i = m + 1
		default:
			return m, start
		}
	}
	return -1, -1
}

// A cblock represents a single content-addressable block of file data.
type cblock struct {
	bytes int64  // number of bytes in the block
	key   string // storage key for this block
}

// isWorthTrimming reports whether a prefix or suffix of nz zeroes is worth
// removing from a block of length n.
//
// Trimming a prefix or suffix induces an extent split, so it only pays off
// when the data saved is at least the overhead of recording another extent.
// Ignoring the blocks (which take the same amount of space regardless how
// many extents they are split over), that overhead is the type tag and three
// varints (message length, base, and byte count): about 13 bytes in a
// reasonable "expected worst case" of 4-byte (28-bit) varints.
//
// However, the smaller the block, the smaller the cost of an extent, with a
// minimum baseline of 1 byte for the byte count; splitting a long extent also
// shortens the byte count, giving a typical cost of 4-6 bytes. To account for
// this, the square root of the block size serves as the lower cutoff — it is
// cheaper than a log, and accuracy does not matter at short sizes.
func isWorthTrimming(nz, n int) bool {
	if nz >= 13 {
		return true
	}
	return nz*nz >= n
}