github.com/Schaudge/hts@v0.0.0-20240223063651-737b4d69d68c/bgzf/reader.go (about)

     1  // Copyright ©2012 The bíogo Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package bgzf
     6  
     7  import (
     8  	"bufio"
     9  	"bytes"
    10  	"compress/flate"
    11  	"compress/gzip"
    12  	"io"
    13  	"runtime"
    14  	"sync"
    15  
    16  	"github.com/Schaudge/grailbase/compress/libdeflate"
    17  )
    18  
// countReader wraps a flate.Reader and counts the bytes consumed through
// it, so that the current offset in the underlying stream can be queried.
type countReader struct {
	// Underlying Reader.
	fr flate.Reader

	// Offset within the underlying reader. It is advanced by Read and
	// ReadByte and overwritten by seek.
	off int64
}
    27  
    28  // newCountReader returns a new countReader.
    29  func newCountReader(r io.Reader) *countReader {
    30  	switch r := r.(type) {
    31  	case *countReader:
    32  		panic("bgzf: illegal use of internal type")
    33  	case flate.Reader:
    34  		return &countReader{fr: r}
    35  	default:
    36  		return &countReader{fr: bufio.NewReader(r)}
    37  	}
    38  }
    39  
    40  // Read is required to satisfy flate.Reader.
    41  func (r *countReader) Read(p []byte) (int, error) {
    42  	n, err := r.fr.Read(p)
    43  	r.off += int64(n)
    44  	return n, err
    45  }
    46  
    47  // ReadByte is required to satisfy flate.Reader.
    48  func (r *countReader) ReadByte() (byte, error) {
    49  	b, err := r.fr.ReadByte()
    50  	if err == nil {
    51  		r.off++
    52  	}
    53  	return b, err
    54  }
    55  
    56  // offset returns the current offset in the underlying reader.
    57  func (r *countReader) offset() int64 { return r.off }
    58  
    59  // seek moves the countReader to the specified offset using rs as the
    60  // underlying reader.
    61  func (r *countReader) seek(rs io.ReadSeeker, off int64) error {
    62  	_, err := rs.Seek(off, 0)
    63  	if err != nil {
    64  		return err
    65  	}
    66  
    67  	type reseter interface {
    68  		Reset(io.Reader)
    69  	}
    70  	switch cr := r.fr.(type) {
    71  	case reseter:
    72  		cr.Reset(rs)
    73  	default:
    74  		r.fr = newCountReader(rs)
    75  	}
    76  	r.off = off
    77  
    78  	return nil
    79  }
    80  
// buffer is a flate.Reader used by a decompressor to store read-ahead data.
// The valid bytes are data[:size]; off is the read cursor within them.
type buffer struct {
	// Buffered compressed data from read ahead.
	off  int // Current position in buffered data.
	size int // Total size of buffered data.
	data [MaxBlockSize]byte
}
    88  
    89  // Read provides the flate.Decompressor Read method.
    90  func (r *buffer) Read(b []byte) (int, error) {
    91  	if r.off >= r.size {
    92  		return 0, io.EOF
    93  	}
    94  	if n := r.size - r.off; len(b) > n {
    95  		b = b[:n]
    96  	}
    97  	n := copy(b, r.data[r.off:])
    98  	r.off += n
    99  	return n, nil
   100  }
   101  
   102  // ReadByte provides the flate.Decompressor ReadByte method.
   103  func (r *buffer) ReadByte() (byte, error) {
   104  	if r.off == r.size {
   105  		return 0, io.EOF
   106  	}
   107  	b := r.data[r.off]
   108  	r.off++
   109  	return b, nil
   110  }
   111  
   112  // reset makes the buffer available to store data.
   113  func (r *buffer) reset() { r.size = 0 }
   114  
   115  // hasData returns whether the buffer has any data buffered.
   116  func (r *buffer) hasData() bool { return r.size != 0 }
   117  
   118  // readLimited reads n bytes into the buffer from the given source.
   119  func (r *buffer) readLimited(n int, src *countReader) error {
   120  	if r.hasData() {
   121  		panic("bgzf: read into non-empty buffer")
   122  	}
   123  	r.off = 0
   124  	if n < 0 || n > len(r.data) {
   125  		return ErrCorrupt
   126  	}
   127  	var err error
   128  	r.size, err = io.ReadFull(src, r.data[:n])
   129  	return err
   130  }
   131  
// decompressor is a gzip member decompressor worker.
type decompressor struct {
	// owner is the Reader this decompressor works for; it supplies the
	// read head and the block cache.
	owner *Reader

	// gz is reset onto the decompressor for each member to parse the
	// gzip header.
	gz gzip.Reader

	// cr is the owner's read head. It is only set between acquireHead
	// and releaseHead.
	cr *countReader

	// Current block size.
	blockSize int

	// Buffered compressed data from read ahead.
	buf buffer

	// Decompressed data.
	// wg is incremented by acquireHead and released when the work started
	// by nextBlockAt finishes or fails; wait blocks on it.
	wg  sync.WaitGroup
	blk Block

	// err is the result of the most recent nextBlockAt call.
	err error
}
   152  
   153  // Read provides the Read method for the decompressor's gzip.Reader.
   154  func (d *decompressor) Read(b []byte) (int, error) {
   155  	if d.buf.hasData() {
   156  		return d.buf.Read(b)
   157  	}
   158  	return d.cr.Read(b)
   159  }
   160  
   161  // ReadByte provides the ReadByte method for the decompressor's gzip.Reader.
   162  func (d *decompressor) ReadByte() (byte, error) {
   163  	if d.buf.hasData() {
   164  		return d.buf.ReadByte()
   165  	}
   166  	return d.cr.ReadByte()
   167  }
   168  
   169  // lazyBlock conditionally creates a ready to use Block.
   170  func (d *decompressor) lazyBlock() {
   171  	if d.blk == nil {
   172  		if w, ok := d.owner.cache.(Wrapper); ok {
   173  			d.blk = w.Wrap(&block{owner: d.owner})
   174  		} else {
   175  			d.blk = &block{owner: d.owner}
   176  		}
   177  		return
   178  	}
   179  	if !d.blk.ownedBy(d.owner) {
   180  		d.blk.setOwner(d.owner)
   181  	}
   182  }
   183  
   184  // acquireHead gains the read head from the decompressor's owner.
   185  func (d *decompressor) acquireHead() {
   186  	d.wg.Add(1)
   187  	d.cr = <-d.owner.head
   188  }
   189  
   190  // releaseHead releases the read head back to the decompressor's owner.
   191  func (d *decompressor) releaseHead() {
   192  	d.owner.head <- d.cr
   193  	d.cr = nil // Defensively zero the reader.
   194  }
   195  
   196  // wait waits for the current member to be decompressed or fail, and returns
   197  // the resulting error state.
   198  func (d *decompressor) wait() (Block, error) {
   199  	d.wg.Wait()
   200  	blk := d.blk
   201  	d.blk = nil
   202  	return blk, d.err
   203  }
   204  
   205  // using sets the Block for the decompressor to work with.
   206  func (d *decompressor) using(b Block) *decompressor { d.blk = b; return d }
   207  
// nextBlockAt makes the decompressor ready for reading decompressed data
// from its Block. It first advances off past any Blocks the cache already
// holds, then seeks to the correct location if the decompressor is not
// correctly positioned, and then reads the compressed data and fills
// the decompressed Block.
// After nextBlockAt returns without error, the decompressor's Block
// holds a valid gzip.Header and base offset.
//
// Decompression itself runs in a separate goroutine; callers must use
// wait to obtain the Block and the final error state. If rs is nil and a
// seek is required, the owner's underlying reader is used when it is an
// io.ReadSeeker.
func (d *decompressor) nextBlockAt(off int64, rs io.ReadSeeker) *decompressor {
	d.err = nil
	// Skip over consecutive blocks that are already cached; there is no
	// need to read them from the underlying reader.
	for {
		exists, next := d.owner.cacheHasBlockFor(off)
		if !exists {
			break
		}
		off = next
	}

	d.lazyBlock()

	// From here until releaseHead, this decompressor has exclusive use of
	// the underlying reader, and wait will block until wg.Done is called.
	d.acquireHead()

	if d.cr.offset() != off {
		if rs == nil {
			// It should not be possible for the expected next block base
			// to be out of register with the count reader unless Seek
			// has been called, so we know the base reader must be an
			// io.ReadSeeker.
			var ok bool
			rs, ok = d.owner.r.(io.ReadSeeker)
			if !ok {
				d.err = ErrCorrupt
				d.wg.Done()
				d.releaseHead()
				return d
			}
		}
		d.err = d.cr.seek(rs, off)
		if d.err != nil {
			d.wg.Done()
			d.releaseHead()
			return d
		}
	}

	// Record where this member starts, then parse its header and buffer
	// its compressed payload.
	d.blk.setBase(d.cr.offset())
	d.err = d.readMember()
	if d.err != nil {
		d.wg.Done()
		d.releaseHead()
		return d
	}
	d.blk.setHeader(d.gz.Header)
	d.gz.Header = gzip.Header{} // Prevent retention of header field in next use.

	// Decompress data into the decompressor's Block.
	// NOTE(review): the read head is only released after readBuf returns,
	// although readBuf is handed the already buffered bytes. This looks
	// like it serialises decompression across workers; confirm whether
	// the head could be released before starting the goroutine.
	go func() {
		// Possible todo: use a pool of preallocated libdeflate.Decompressor
		// objects instead.
		var dd libdeflate.Decompressor
		d.err = dd.Init()
		if d.err == nil {
			d.err = d.blk.readBuf(d.buf.data[:d.buf.size], dd)
			dd.Cleanup()
		}
		d.releaseHead()
		d.wg.Done()
	}()
	return d
}
   277  
   278  // expectedMemberSize returns the size of the BGZF conformant gzip member.
   279  // It returns -1 if no BGZF block size field is found.
   280  func expectedMemberSize(h gzip.Header) int {
   281  	i := bytes.Index(h.Extra, bgzfExtraPrefix)
   282  	if i < 0 || i+5 >= len(h.Extra) {
   283  		return -1
   284  	}
   285  	return (int(h.Extra[i+4]) | int(h.Extra[i+5])<<8) + 1
   286  }
   287  
   288  // readMember buffers the gzip member starting the current decompressor offset.
   289  func (d *decompressor) readMember() error {
   290  	// Set the decompressor to Read from the underlying flate.Reader
   291  	// and mark the starting offset from which the underlying reader
   292  	// was used.
   293  	d.buf.reset()
   294  	mark := d.cr.offset()
   295  
   296  	err := d.gz.Reset(d)
   297  	if err != nil {
   298  		d.blockSize = -1
   299  		return err
   300  	}
   301  
   302  	d.blockSize = expectedMemberSize(d.gz.Header)
   303  	if d.blockSize < 0 {
   304  		return ErrNoBlockSize
   305  	}
   306  	skipped := int(d.cr.offset() - mark)
   307  	need := d.blockSize - skipped
   308  	if need == 0 {
   309  		return io.EOF
   310  	} else if need < 0 {
   311  		return ErrCorrupt
   312  	}
   313  
   314  	// Read compressed data into the decompressor buffer until the
   315  	// underlying flate.Reader is positioned at the end of the gzip
   316  	// member in which the readMember call was made.
   317  	return d.buf.readLimited(d.blockSize-skipped, d.cr)
   318  }
   319  
// Offset is a BGZF virtual offset.
type Offset struct {
	// File is the offset of the start of a BGZF block in the underlying
	// compressed stream.
	File int64
	// Block is the offset into the decompressed data of that block.
	Block uint16
}
   325  
// Chunk is a region of a BGZF file, delimited by two virtual offsets.
type Chunk struct {
	Begin Offset
	End   Offset
}
   331  
// Reader implements BGZF blocked gzip decompression.
type Reader struct {
	// Header is the gzip header of the most recently adopted block.
	gzip.Header
	// r is the underlying reader passed to NewReader.
	r io.Reader

	// head serialises access to the underlying
	// io.Reader.
	head chan *countReader

	// lastChunk is the virtual file offset
	// interval of the last successful read
	// or seek operation.
	lastChunk Chunk

	// Blocked specifies the behaviour of the
	// Reader at the end of a BGZF member.
	// If the Reader is Blocked, a Read that
	// reaches the end of a BGZF block will
	// return io.EOF. This error is not sticky,
	// so a subsequent Read will progress to
	// the next block if it is available.
	Blocked bool

	// Non-concurrent work decompressor.
	// It is nil when the concurrent work loop is in use.
	dec *decompressor

	// Concurrent work fields.
	// waiting holds idle decompressors, working holds decompressors that
	// have been started on a block, control carries the next base offset
	// to the work loop, and done is closed when the work loop exits.
	waiting chan *decompressor
	working chan *decompressor
	control chan int64
	done    chan struct{}

	// current is the Block that Read is consuming.
	current Block

	// cache is the Reader block cache. If cache is not nil,
	// the cache is queried for blocks before an attempt to
	// read from the underlying io.Reader.
	// mu guards the cache field.
	mu    sync.RWMutex
	cache Cache

	// err is the sticky error state of the Reader.
	err error
}
   374  
// NewReader returns a new BGZF reader.
//
// The number of concurrent read decompressors is specified by rd.
// If rd is 0, GOMAXPROCS concurrent decompressors will be used. If rd is
// 1 or less after that adjustment, blocks are decompressed one at a time
// on demand. The first block is read before NewReader returns, so a
// malformed stream is reported here rather than by the first Read. The
// returned Reader should be closed after use to avoid leaking resources.
func NewReader(r io.Reader, rd int) (*Reader, error) {
	if rd == 0 {
		rd = runtime.GOMAXPROCS(0)
	}
	bg := &Reader{
		r: r,

		head: make(chan *countReader, 1),
	}
	// Make the read head available to the first decompressor that asks.
	bg.head <- newCountReader(r)

	// Make work loop control structures.
	if rd > 1 {
		bg.waiting = make(chan *decompressor, rd)
		bg.working = make(chan *decompressor, rd)
		bg.control = make(chan int64, 1)
		bg.done = make(chan struct{})
		// Only rd-1 decompressors are queued here; the one used for the
		// first block below joins them afterwards.
		for ; rd > 1; rd-- {
			bg.waiting <- &decompressor{owner: bg}
		}
	}

	// Read the first block now so we can fail before
	// the first Read call if there is a problem.
	bg.dec = &decompressor{owner: bg}
	blk, err := bg.dec.nextBlockAt(0, nil).wait()
	if err != nil {
		return nil, err
	}
	bg.current = blk
	bg.Header = bg.current.header()

	// Set up work loop if rd was > 1.
	if bg.control != nil {
		bg.waiting <- bg.dec
		bg.dec = nil
		next := blk.NextBase()
		go func() {
			// On exit, drop the cache and signal Close that the loop
			// has finished.
			defer func() {
				bg.mu.Lock()
				bg.cache = nil
				bg.mu.Unlock()
				close(bg.done)
			}()
			for dec := range bg.waiting {
				var open bool
				if next < 0 {
					// There is no known next block; block until one is
					// supplied on control or control is closed.
					next, open = <-bg.control
					if !open {
						return
					}
				} else {
					// Pick up a new target offset if one is pending,
					// otherwise carry on from the previous block.
					select {
					case next, open = <-bg.control:
						if !open {
							return
						}
					default:
					}
				}
				dec.nextBlockAt(next, nil)
				next = dec.blk.NextBase()
				bg.working <- dec
			}
		}()
	}

	return bg, nil
}
   449  
   450  // SetCache sets the cache to be used by the Reader.
   451  func (bg *Reader) SetCache(c Cache) {
   452  	bg.mu.Lock()
   453  	bg.cache = c
   454  	bg.mu.Unlock()
   455  }
   456  
// Seek performs a seek operation to the given virtual offset.
//
// It returns ErrNotASeeker if the underlying reader is not an
// io.ReadSeeker. If the current block does not start at off.File, or
// holds no data, the block is taken from the cache when possible and is
// otherwise read and decompressed from the underlying reader. On success
// LastChunk reports a zero-length chunk at off.
func (bg *Reader) Seek(off Offset) error {
	rs, ok := bg.r.(io.ReadSeeker)
	if !ok {
		return ErrNotASeeker
	}

	if off.File != bg.current.Base() || !bg.current.hasData() {
		ok := bg.cacheSwap(off.File)
		if !ok {
			var dec *decompressor
			if bg.dec != nil {
				dec = bg.dec
			} else {
				// Borrow a decompressor from the work loop. One taken
				// from working must finish first; its block is offered
				// to the cache if it decompressed cleanly.
				select {
				case dec = <-bg.waiting:
				case dec = <-bg.working:
					blk, err := dec.wait()
					if err == nil {
						bg.keep(blk)
					}
				}
			}
			bg.current, bg.err = dec.
				using(bg.current).
				nextBlockAt(off.File, rs).
				wait()
			if bg.dec == nil {
				// Discard any stale target offset, tell the work loop
				// where to continue from, and return the decompressor.
				select {
				case <-bg.control:
				default:
				}
				bg.control <- bg.current.NextBase()
				bg.waiting <- dec
			}
			bg.Header = bg.current.header()
			if bg.err != nil {
				return bg.err
			}
		}
	}

	// Position within the decompressed data of the block.
	bg.err = bg.current.seek(int64(off.Block))
	if bg.err == nil {
		bg.lastChunk = Chunk{Begin: off, End: off}
	}

	return bg.err
}
   506  
   507  // LastChunk returns the region of the BGZF file read by the last
   508  // successful read operation or the resulting virtual offset of
   509  // the last successful seek operation.
   510  func (bg *Reader) LastChunk() Chunk { return bg.lastChunk }
   511  
   512  // BlockLen returns the number of bytes remaining to be read from the
   513  // current BGZF block.
   514  func (bg *Reader) BlockLen() int { return bg.current.len() }
   515  
   516  // Close closes the reader and releases resources.
   517  func (bg *Reader) Close() error {
   518  	if bg.control != nil {
   519  		close(bg.control)
   520  		close(bg.waiting)
   521  		<-bg.done
   522  	}
   523  	if bg.err == io.EOF {
   524  		return nil
   525  	}
   526  	return bg.err
   527  }
   528  
// Read implements the io.Reader interface.
//
// Read fills p from the current block, moving on to following blocks as
// each one is exhausted. If the Reader is Blocked, Read stops at the end
// of the current block and returns io.EOF without making that error
// sticky. Any other error is retained and returned by later calls. The
// chunk covered by the call is available from LastChunk.
func (bg *Reader) Read(p []byte) (int, error) {
	if bg.err != nil {
		return 0, bg.err
	}

	// Discard leading empty blocks. This is an indexing
	// optimisation to avoid retaining useless members
	// in a BAI/CSI.
	for bg.current.len() == 0 {
		bg.err = bg.nextBlock()
		if bg.err != nil {
			return 0, bg.err
		}
	}

	bg.lastChunk.Begin = bg.current.txOffset()

	var n int
	for n < len(p) && bg.err == nil {
		var _n int
		_n, bg.err = bg.current.Read(p[n:])
		n += _n
		if bg.err == io.EOF {
			// The block ended exactly as p was filled; this is not an
			// end of stream condition.
			if n == len(p) {
				bg.err = nil
				break
			}

			// In Blocked mode report the end of the block to the
			// caller, but leave the Reader usable for the next call.
			if bg.Blocked {
				bg.err = nil
				bg.lastChunk.End = bg.current.txOffset()
				return n, io.EOF
			}

			bg.err = bg.nextBlock()
			if bg.err != nil {
				break
			}
		}
	}

	bg.lastChunk.End = bg.current.txOffset()
	return n, bg.err
}
   574  
// nextBlock swaps the current decompressed block for the next
// in the stream. If the block is available from the cache
// no additional work is done, otherwise a decompressor is
// used or waited on.
//
// In concurrent mode it panics with "bgzf: unexpected block" if none of
// the working decompressors yields the block that follows the current one.
func (bg *Reader) nextBlock() error {
	base := bg.current.NextBase()
	ok := bg.cacheSwap(base)
	if ok {
		bg.Header = bg.current.header()
		return nil
	}

	var err error
	if bg.dec != nil {
		// Non-concurrent mode: decompress the next block in place,
		// reusing the current Block's storage.
		bg.dec.using(bg.current).nextBlockAt(base, nil)
		bg.current, err = bg.dec.wait()
	} else {
		// Concurrent mode: collect finished decompressors until the one
		// holding the wanted block is found, handing each one back to
		// the work loop. Other cleanly decompressed blocks are offered
		// to the cache.
		var ok bool
		for i := 0; i < cap(bg.working); i++ {
			dec := <-bg.working
			bg.current, err = dec.wait()
			bg.waiting <- dec
			if bg.current.Base() == base {
				ok = true
				break
			}
			if err == nil {
				bg.keep(bg.current)
				bg.current = nil
			}
		}
		if !ok {
			panic("bgzf: unexpected block")
		}
	}
	if err != nil {
		return err
	}

	// Only set header if there was no error.
	h := bg.current.header()
	if bg.current.isMagicBlock() {
		// TODO(kortschak): Do this more carefully. It may be that
		// someone actually has extra data in this field that we are
		// clobbering.
		bg.Header.Extra = h.Extra
	} else {
		bg.Header = h
	}

	return nil
}
   627  
   628  // cacheSwap attempts to swap the current Block for a cached Block
   629  // for the given base offset. It returns true if successful.
   630  func (bg *Reader) cacheSwap(base int64) bool {
   631  	bg.mu.RLock()
   632  	defer bg.mu.RUnlock()
   633  	if bg.cache == nil {
   634  		return false
   635  	}
   636  
   637  	blk, err := bg.cachedBlockFor(base)
   638  	if err != nil {
   639  		return false
   640  	}
   641  	if blk != nil {
   642  		// TODO(kortschak): Under some conditions, e.g. FIFO
   643  		// cache we will be discarding a non-nil evicted Block.
   644  		// Consider retaining these in a sync.Pool.
   645  		bg.cachePut(bg.current)
   646  		bg.current = blk
   647  		return true
   648  	}
   649  	var retained bool
   650  	bg.current, retained = bg.cachePut(bg.current)
   651  	if retained {
   652  		bg.current = nil
   653  	}
   654  	return false
   655  }
   656  
   657  // cacheHasBlockFor returns whether the Reader's cache has a block
   658  // for the given base offset. If the requested Block exists, the base
   659  // offset of the following Block is returned.
   660  func (bg *Reader) cacheHasBlockFor(base int64) (exists bool, next int64) {
   661  	bg.mu.RLock()
   662  	defer bg.mu.RUnlock()
   663  	if bg.cache == nil {
   664  		return false, -1
   665  	}
   666  	return bg.cache.Peek(base)
   667  }
   668  
   669  // cachedBlockFor returns a non-nil Block if the Reader has access to a
   670  // cache and the cache holds the block with the given base and the
   671  // correct owner, otherwise it returns nil. If the Block's owner is not
   672  // correct, or the Block cannot seek to the start of its data, a non-nil
   673  // error is returned.
   674  func (bg *Reader) cachedBlockFor(base int64) (Block, error) {
   675  	blk := bg.cache.Get(base)
   676  	if blk != nil {
   677  		if !blk.ownedBy(bg) {
   678  			return nil, ErrContaminatedCache
   679  		}
   680  		err := blk.seek(0)
   681  		if err != nil {
   682  			return nil, err
   683  		}
   684  	}
   685  	return blk, nil
   686  }
   687  
   688  // cachePut puts the given Block into the cache if it exists, it returns
   689  // the Block that was evicted or b if it was not retained, and whether
   690  // the Block was retained by the cache.
   691  func (bg *Reader) cachePut(b Block) (evicted Block, retained bool) {
   692  	if b == nil || !b.hasData() {
   693  		return b, false
   694  	}
   695  	return bg.cache.Put(b)
   696  }
   697  
   698  // keep puts the given Block into the cache if it exists.
   699  func (bg *Reader) keep(b Block) {
   700  	if b == nil || !b.hasData() {
   701  		return
   702  	}
   703  	bg.mu.RLock()
   704  	defer bg.mu.RUnlock()
   705  	if bg.cache != nil {
   706  		bg.cache.Put(b)
   707  	}
   708  }
   709  
   710  // Begin returns a Tx that starts at the current virtual offset.
   711  func (bg *Reader) Begin() Tx { return Tx{begin: bg.lastChunk.Begin, r: bg} }
   712  
// Tx represents a multi-read transaction.
type Tx struct {
	// begin is the virtual offset at which the transaction started.
	begin Offset
	// r is the Reader the transaction belongs to; End sets it to nil.
	r *Reader
}
   718  
   719  // End returns the Chunk spanning the transaction. After return the Tx is
   720  // no longer valid.
   721  func (t *Tx) End() Chunk {
   722  	c := Chunk{Begin: t.begin, End: t.r.lastChunk.End}
   723  	t.r = nil
   724  	return c
   725  }