github.com/Schaudge/hts@v0.0.0-20240223063651-737b4d69d68c/bgzf/reader.go

// Copyright ©2012 The bíogo Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package bgzf

import (
	"bufio"
	"bytes"
	"compress/flate"
	"compress/gzip"
	"io"
	"runtime"
	"sync"

	"github.com/Schaudge/grailbase/compress/libdeflate"
)

// countReader wraps flate.Reader, adding support for querying current offset.
type countReader struct {
	// Underlying Reader.
	fr flate.Reader

	// Offset within the underlying reader.
	off int64
}

// newCountReader returns a new countReader.
func newCountReader(r io.Reader) *countReader {
	switch r := r.(type) {
	case *countReader:
		panic("bgzf: illegal use of internal type")
	case flate.Reader:
		return &countReader{fr: r}
	default:
		return &countReader{fr: bufio.NewReader(r)}
	}
}

// Read is required to satisfy flate.Reader.
func (r *countReader) Read(p []byte) (int, error) {
	n, err := r.fr.Read(p)
	r.off += int64(n)
	return n, err
}

// ReadByte is required to satisfy flate.Reader.
func (r *countReader) ReadByte() (byte, error) {
	b, err := r.fr.ReadByte()
	if err == nil {
		r.off++
	}
	return b, err
}

// offset returns the current offset in the underlying reader.
func (r *countReader) offset() int64 { return r.off }

// seek moves the countReader to the specified offset using rs as the
// underlying reader.
func (r *countReader) seek(rs io.ReadSeeker, off int64) error {
	_, err := rs.Seek(off, 0)
	if err != nil {
		return err
	}

	type reseter interface {
		Reset(io.Reader)
	}
	switch cr := r.fr.(type) {
	case reseter:
		cr.Reset(rs)
	default:
		r.fr = newCountReader(rs)
	}
	r.off = off

	return nil
}

// buffer is a flate.Reader used by a decompressor to store read-ahead data.
type buffer struct {
	// Buffered compressed data from read ahead.
	off  int // Current position in buffered data.
	size int // Total size of buffered data.
	data [MaxBlockSize]byte
}

// Read provides the flate.Decompressor Read method.
func (r *buffer) Read(b []byte) (int, error) {
	if r.off >= r.size {
		return 0, io.EOF
	}
	if n := r.size - r.off; len(b) > n {
		b = b[:n]
	}
	n := copy(b, r.data[r.off:])
	r.off += n
	return n, nil
}

// ReadByte provides the flate.Decompressor ReadByte method.
func (r *buffer) ReadByte() (byte, error) {
	if r.off == r.size {
		return 0, io.EOF
	}
	b := r.data[r.off]
	r.off++
	return b, nil
}

// reset makes the buffer available to store data.
func (r *buffer) reset() { r.size = 0 }

// hasData returns whether the buffer has any data buffered.
func (r *buffer) hasData() bool { return r.size != 0 }

// readLimited reads n bytes into the buffer from the given source.
func (r *buffer) readLimited(n int, src *countReader) error {
	if r.hasData() {
		panic("bgzf: read into non-empty buffer")
	}
	r.off = 0
	if n < 0 || n > len(r.data) {
		return ErrCorrupt
	}
	var err error
	r.size, err = io.ReadFull(src, r.data[:n])
	return err
}

// decompressor is a gzip member decompressor worker.
type decompressor struct {
	owner *Reader

	gz gzip.Reader

	cr *countReader

	// Current block size.
	blockSize int

	// Buffered compressed data from read ahead.
	buf buffer

	// Decompressed data.
	wg  sync.WaitGroup
	blk Block

	err error
}

// Read provides the Read method for the decompressor's gzip.Reader.
func (d *decompressor) Read(b []byte) (int, error) {
	if d.buf.hasData() {
		return d.buf.Read(b)
	}
	return d.cr.Read(b)
}

// ReadByte provides the ReadByte method for the decompressor's gzip.Reader.
func (d *decompressor) ReadByte() (byte, error) {
	if d.buf.hasData() {
		return d.buf.ReadByte()
	}
	return d.cr.ReadByte()
}

// lazyBlock conditionally creates a ready to use Block.
func (d *decompressor) lazyBlock() {
	if d.blk == nil {
		if w, ok := d.owner.cache.(Wrapper); ok {
			d.blk = w.Wrap(&block{owner: d.owner})
		} else {
			d.blk = &block{owner: d.owner}
		}
		return
	}
	if !d.blk.ownedBy(d.owner) {
		d.blk.setOwner(d.owner)
	}
}

// acquireHead gains the read head from the decompressor's owner.
func (d *decompressor) acquireHead() {
	d.wg.Add(1)
	d.cr = <-d.owner.head
}

// releaseHead releases the read head back to the decompressor's owner.
func (d *decompressor) releaseHead() {
	d.owner.head <- d.cr
	d.cr = nil // Defensively zero the reader.
}

// wait waits for the current member to be decompressed or fail, and returns
// the resulting error state.
func (d *decompressor) wait() (Block, error) {
	d.wg.Wait()
	blk := d.blk
	d.blk = nil
	return blk, d.err
}

// using sets the Block for the decompressor to work with.
func (d *decompressor) using(b Block) *decompressor { d.blk = b; return d }

// nextBlockAt makes the decompressor ready for reading decompressed data
// from its Block. It checks if there is a cached Block for the nextBase,
// otherwise it seeks to the correct location if the decompressor is not
// correctly positioned, and then reads the compressed data and fills
// the decompressed Block.
// After nextBlockAt returns without error, the decompressor's Block
// holds a valid gzip.Header and base offset.
func (d *decompressor) nextBlockAt(off int64, rs io.ReadSeeker) *decompressor {
	d.err = nil
	for {
		exists, next := d.owner.cacheHasBlockFor(off)
		if !exists {
			break
		}
		off = next
	}

	d.lazyBlock()

	d.acquireHead()

	if d.cr.offset() != off {
		if rs == nil {
			// It should not be possible for the expected next block base
			// to be out of register with the count reader unless Seek
			// has been called, so we know the base reader must be an
			// io.ReadSeeker.
			var ok bool
			rs, ok = d.owner.r.(io.ReadSeeker)
			if !ok {
				d.err = ErrCorrupt
				d.wg.Done()
				d.releaseHead()
				return d
			}
		}
		d.err = d.cr.seek(rs, off)
		if d.err != nil {
			d.wg.Done()
			d.releaseHead()
			return d
		}
	}

	d.blk.setBase(d.cr.offset())
	d.err = d.readMember()
	if d.err != nil {
		d.wg.Done()
		d.releaseHead()
		return d
	}
	d.blk.setHeader(d.gz.Header)
	d.gz.Header = gzip.Header{} // Prevent retention of header field in next use.

	// Decompress data into the decompressor's Block.
	go func() {
		// Possible todo: use a pool of preallocated libdeflate.Decompressor
		// objects instead.
		var dd libdeflate.Decompressor
		d.err = dd.Init()
		if d.err == nil {
			d.err = d.blk.readBuf(d.buf.data[:d.buf.size], dd)
			dd.Cleanup()
		}
		d.releaseHead()
		d.wg.Done()
	}()
	return d
}

// expectedMemberSize returns the size of the BGZF conformant gzip member.
// It returns -1 if no BGZF block size field is found.
func expectedMemberSize(h gzip.Header) int {
	i := bytes.Index(h.Extra, bgzfExtraPrefix)
	if i < 0 || i+5 >= len(h.Extra) {
		return -1
	}
	return (int(h.Extra[i+4]) | int(h.Extra[i+5])<<8) + 1
}
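
// exampleBGZFHeader is an illustrative sketch that is not part of the
// original source; the name is hypothetical. It builds a gzip.Header
// carrying the BGZF "BC" extra subfield in the layout that
// expectedMemberSize parses above: the subfield identifiers 'B' and 'C',
// a two-byte subfield length of 2, and then the total compressed member
// size minus one as a little-endian uint16, so that, for example,
// expectedMemberSize(exampleBGZFHeader(0x1234)) returns 0x1234.
func exampleBGZFHeader(memberSize int) gzip.Header {
	bsize := uint16(memberSize - 1)
	return gzip.Header{
		Extra: []byte{
			'B', 'C', // Subfield identifiers SI1 and SI2.
			2, 0, // SLEN: two bytes of subfield data follow.
			byte(bsize), byte(bsize >> 8), // BSIZE-1, little endian.
		},
	}
}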

// readMember buffers the gzip member starting the current decompressor offset.
func (d *decompressor) readMember() error {
	// Set the decompressor to Read from the underlying flate.Reader
	// and mark the starting offset from which the underlying reader
	// was used.
	d.buf.reset()
	mark := d.cr.offset()

	err := d.gz.Reset(d)
	if err != nil {
		d.blockSize = -1
		return err
	}

	d.blockSize = expectedMemberSize(d.gz.Header)
	if d.blockSize < 0 {
		return ErrNoBlockSize
	}
	skipped := int(d.cr.offset() - mark)
	need := d.blockSize - skipped
	if need == 0 {
		return io.EOF
	} else if need < 0 {
		return ErrCorrupt
	}

	// Read compressed data into the decompressor buffer until the
	// underlying flate.Reader is positioned at the end of the gzip
	// member in which the readMember call was made.
	return d.buf.readLimited(d.blockSize-skipped, d.cr)
}

// Offset is a BGZF virtual offset.
type Offset struct {
	File  int64
	Block uint16
}

// Chunk is a region of a BGZF file.
type Chunk struct {
	Begin Offset
	End   Offset
}
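
// The two helpers below are an illustrative sketch and are not part of the
// original source; the names makeVirtualOffset and splitVirtualOffset are
// hypothetical. BAM and tabix indices store an Offset packed into a single
// 64-bit virtual file offset, with the compressed file offset in the high
// 48 bits and the offset within the uncompressed block in the low 16 bits.
func makeVirtualOffset(o Offset) uint64 {
	return uint64(o.File)<<16 | uint64(o.Block)
}

func splitVirtualOffset(v uint64) Offset {
	return Offset{File: int64(v >> 16), Block: uint16(v & 0xffff)}
}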

// Reader implements BGZF blocked gzip decompression.
type Reader struct {
	gzip.Header
	r io.Reader

	// head serialises access to the underlying
	// io.Reader.
	head chan *countReader

	// lastChunk is the virtual file offset
	// interval of the last successful read
	// or seek operation.
	lastChunk Chunk

	// Blocked specifies the behaviour of the
	// Reader at the end of a BGZF member.
	// If the Reader is Blocked, a Read that
	// reaches the end of a BGZF block will
	// return io.EOF. This error is not sticky,
	// so a subsequent Read will progress to
	// the next block if it is available.
	Blocked bool

	// Non-concurrent work decompressor.
	dec *decompressor

	// Concurrent work fields.
	waiting chan *decompressor
	working chan *decompressor
	control chan int64
	done    chan struct{}

	current Block

	// cache is the Reader block cache. If cache is not nil,
	// the cache is queried for blocks before an attempt to
	// read from the underlying io.Reader.
	mu    sync.RWMutex
	cache Cache

	err error
}

// NewReader returns a new BGZF reader.
//
// The number of concurrent read decompressors is specified by rd.
// If rd is 0, GOMAXPROCS concurrent decompressors will be created.
// The returned Reader should be closed after use to avoid leaking
// resources.
func NewReader(r io.Reader, rd int) (*Reader, error) {
	if rd == 0 {
		rd = runtime.GOMAXPROCS(0)
	}
	bg := &Reader{
		r: r,

		head: make(chan *countReader, 1),
	}
	bg.head <- newCountReader(r)

	// Make work loop control structures.
	if rd > 1 {
		bg.waiting = make(chan *decompressor, rd)
		bg.working = make(chan *decompressor, rd)
		bg.control = make(chan int64, 1)
		bg.done = make(chan struct{})
		for ; rd > 1; rd-- {
			bg.waiting <- &decompressor{owner: bg}
		}
	}

	// Read the first block now so we can fail before
	// the first Read call if there is a problem.
	bg.dec = &decompressor{owner: bg}
	blk, err := bg.dec.nextBlockAt(0, nil).wait()
	if err != nil {
		return nil, err
	}
	bg.current = blk
	bg.Header = bg.current.header()

	// Set up work loop if rd was > 1.
	if bg.control != nil {
		bg.waiting <- bg.dec
		bg.dec = nil
		next := blk.NextBase()
		go func() {
			defer func() {
				bg.mu.Lock()
				bg.cache = nil
				bg.mu.Unlock()
				close(bg.done)
			}()
			for dec := range bg.waiting {
				var open bool
				if next < 0 {
					next, open = <-bg.control
					if !open {
						return
					}
				} else {
					select {
					case next, open = <-bg.control:
						if !open {
							return
						}
					default:
					}
				}
				dec.nextBlockAt(next, nil)
				next = dec.blk.NextBase()
				bg.working <- dec
			}
		}()
	}

	return bg, nil
}

// SetCache sets the cache to be used by the Reader.
func (bg *Reader) SetCache(c Cache) {
	bg.mu.Lock()
	bg.cache = c
	bg.mu.Unlock()
}

// Seek performs a seek operation to the given virtual offset.
func (bg *Reader) Seek(off Offset) error {
	rs, ok := bg.r.(io.ReadSeeker)
	if !ok {
		return ErrNotASeeker
	}

	if off.File != bg.current.Base() || !bg.current.hasData() {
		ok := bg.cacheSwap(off.File)
		if !ok {
			var dec *decompressor
			if bg.dec != nil {
				dec = bg.dec
			} else {
				select {
				case dec = <-bg.waiting:
				case dec = <-bg.working:
					blk, err := dec.wait()
					if err == nil {
						bg.keep(blk)
					}
				}
			}
			bg.current, bg.err = dec.
				using(bg.current).
				nextBlockAt(off.File, rs).
				wait()
			if bg.dec == nil {
				select {
				case <-bg.control:
				default:
				}
				bg.control <- bg.current.NextBase()
				bg.waiting <- dec
			}
			bg.Header = bg.current.header()
			if bg.err != nil {
				return bg.err
			}
		}
	}

	bg.err = bg.current.seek(int64(off.Block))
	if bg.err == nil {
		bg.lastChunk = Chunk{Begin: off, End: off}
	}

	return bg.err
}

// LastChunk returns the region of the BGZF file read by the last
// successful read operation or the resulting virtual offset of
// the last successful seek operation.
func (bg *Reader) LastChunk() Chunk { return bg.lastChunk }

// BlockLen returns the number of bytes remaining to be read from the
// current BGZF block.
func (bg *Reader) BlockLen() int { return bg.current.len() }

// Close closes the reader and releases resources.
func (bg *Reader) Close() error {
	if bg.control != nil {
		close(bg.control)
		close(bg.waiting)
		<-bg.done
	}
	if bg.err == io.EOF {
		return nil
	}
	return bg.err
}

// Read implements the io.Reader interface.
func (bg *Reader) Read(p []byte) (int, error) {
	if bg.err != nil {
		return 0, bg.err
	}

	// Discard leading empty blocks. This is an indexing
	// optimisation to avoid retaining useless members
	// in a BAI/CSI.
	for bg.current.len() == 0 {
		bg.err = bg.nextBlock()
		if bg.err != nil {
			return 0, bg.err
		}
	}

	bg.lastChunk.Begin = bg.current.txOffset()

	var n int
	for n < len(p) && bg.err == nil {
		var _n int
		_n, bg.err = bg.current.Read(p[n:])
		n += _n
		if bg.err == io.EOF {
			if n == len(p) {
				bg.err = nil
				break
			}

			if bg.Blocked {
				bg.err = nil
				bg.lastChunk.End = bg.current.txOffset()
				return n, io.EOF
			}

			bg.err = bg.nextBlock()
			if bg.err != nil {
				break
			}
		}
	}

	bg.lastChunk.End = bg.current.txOffset()
	return n, bg.err
}
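
// blockExtents is an illustrative sketch and not part of the original
// source; the name is hypothetical. It shows the Blocked behaviour
// documented on the Reader type: with Blocked set, Read returns a
// non-sticky io.EOF at each block boundary, so reading with a buffer
// larger than MaxBlockSize consumes exactly one block per call and
// LastChunk then reports that block's virtual-offset extent.
func blockExtents(bg *Reader) ([]Chunk, error) {
	bg.Blocked = true
	buf := make([]byte, MaxBlockSize+1) // Larger than any single block's data.
	var chunks []Chunk
	for {
		n, err := bg.Read(buf)
		if n > 0 {
			chunks = append(chunks, bg.LastChunk())
		}
		switch {
		case err == io.EOF && n == 0:
			// No data with io.EOF: the underlying stream is exhausted.
			return chunks, nil
		case err == io.EOF:
			// Block boundary reached; continue with the next block.
		case err != nil:
			return chunks, err
		}
	}
}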

// nextBlock swaps the current decompressed block for the next
// in the stream. If the block is available from the cache
// no additional work is done, otherwise a decompressor is
// used or waited on.
func (bg *Reader) nextBlock() error {
	base := bg.current.NextBase()
	ok := bg.cacheSwap(base)
	if ok {
		bg.Header = bg.current.header()
		return nil
	}

	var err error
	if bg.dec != nil {
		bg.dec.using(bg.current).nextBlockAt(base, nil)
		bg.current, err = bg.dec.wait()
	} else {
		var ok bool
		for i := 0; i < cap(bg.working); i++ {
			dec := <-bg.working
			bg.current, err = dec.wait()
			bg.waiting <- dec
			if bg.current.Base() == base {
				ok = true
				break
			}
			if err == nil {
				bg.keep(bg.current)
				bg.current = nil
			}
		}
		if !ok {
			panic("bgzf: unexpected block")
		}
	}
	if err != nil {
		return err
	}

	// Only set header if there was no error.
	h := bg.current.header()
	if bg.current.isMagicBlock() {
		// TODO(kortschak): Do this more carefully. It may be that
		// someone actually has extra data in this field that we are
		// clobbering.
		bg.Header.Extra = h.Extra
	} else {
		bg.Header = h
	}

	return nil
}

// cacheSwap attempts to swap the current Block for a cached Block
// for the given base offset. It returns true if successful.
func (bg *Reader) cacheSwap(base int64) bool {
	bg.mu.RLock()
	defer bg.mu.RUnlock()
	if bg.cache == nil {
		return false
	}

	blk, err := bg.cachedBlockFor(base)
	if err != nil {
		return false
	}
	if blk != nil {
		// TODO(kortschak): Under some conditions, e.g. FIFO
		// cache we will be discarding a non-nil evicted Block.
		// Consider retaining these in a sync.Pool.
		bg.cachePut(bg.current)
		bg.current = blk
		return true
	}
	var retained bool
	bg.current, retained = bg.cachePut(bg.current)
	if retained {
		bg.current = nil
	}
	return false
}

// cacheHasBlockFor returns whether the Reader's cache has a block
// for the given base offset. If the requested Block exists, the base
// offset of the following Block is returned.
func (bg *Reader) cacheHasBlockFor(base int64) (exists bool, next int64) {
	bg.mu.RLock()
	defer bg.mu.RUnlock()
	if bg.cache == nil {
		return false, -1
	}
	return bg.cache.Peek(base)
}

// cachedBlockFor returns a non-nil Block if the Reader has access to a
// cache and the cache holds the block with the given base and the
// correct owner, otherwise it returns nil. If the Block's owner is not
// correct, or the Block cannot seek to the start of its data, a non-nil
// error is returned.
func (bg *Reader) cachedBlockFor(base int64) (Block, error) {
	blk := bg.cache.Get(base)
	if blk != nil {
		if !blk.ownedBy(bg) {
			return nil, ErrContaminatedCache
		}
		err := blk.seek(0)
		if err != nil {
			return nil, err
		}
	}
	return blk, nil
}

// cachePut puts the given Block into the cache if it exists. It returns
// the Block that was evicted, or b if it was not retained, and whether
// the Block was retained by the cache.
func (bg *Reader) cachePut(b Block) (evicted Block, retained bool) {
	if b == nil || !b.hasData() {
		return b, false
	}
	return bg.cache.Put(b)
}

// keep puts the given Block into the cache if it exists.
func (bg *Reader) keep(b Block) {
	if b == nil || !b.hasData() {
		return
	}
	bg.mu.RLock()
	defer bg.mu.RUnlock()
	if bg.cache != nil {
		bg.cache.Put(b)
	}
}

// Begin returns a Tx that starts at the current virtual offset.
func (bg *Reader) Begin() Tx { return Tx{begin: bg.lastChunk.End, r: bg} }

// Tx represents a multi-read transaction.
type Tx struct {
	begin Offset
	r     *Reader
}

// End returns the Chunk spanning the transaction. After return the Tx is
// no longer valid.
func (t *Tx) End() Chunk {
	c := Chunk{Begin: t.begin, End: t.r.lastChunk.End}
	t.r = nil
	return c
}
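
// readRegion below is an illustrative sketch and not part of the original
// source; the name is hypothetical and error handling is minimal. It shows
// the intended flow: create a Reader, mark a transaction with Begin, read,
// capture the spanning Chunk with End, and later return to the same region
// with Seek using the Chunk's starting virtual offset.
func readRegion(r io.ReadSeeker, buf []byte) (Chunk, error) {
	bg, err := NewReader(r, 0) // rd of 0 requests GOMAXPROCS decompressors.
	if err != nil {
		return Chunk{}, err
	}
	defer bg.Close()

	tx := bg.Begin() // Transaction starting at the current virtual offset.
	if _, err = io.ReadFull(bg, buf); err != nil {
		return Chunk{}, err
	}
	c := tx.End() // Chunk spanning everything read since Begin.

	// The Chunk's virtual offsets can be stored, for example in an index,
	// and used later to revisit the same region.
	if err = bg.Seek(c.Begin); err != nil {
		return c, err
	}
	_, err = io.ReadFull(bg, buf)
	return c, err
}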