github.com/Schaudge/hts@v0.0.0-20240223063651-737b4d69d68c/bam/reader.go (about)

     1  // Copyright ©2012 The bíogo Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package bam
     6  
     7  import (
     8  	"encoding/binary"
     9  	"errors"
    10  	"fmt"
    11  	"io"
    12  	"reflect"
    13  	"unsafe"
    14  
    15  	"github.com/Schaudge/hts/bgzf"
    16  	"github.com/Schaudge/hts/sam"
    17  )
    18  
    19  // Reader implements BAM data reading.
    20  type Reader struct {
    21  	r *bgzf.Reader
    22  	h *sam.Header
    23  	c *bgzf.Chunk
    24  
    25  	// references is cached header
    26  	// reference count.
    27  	references int32
    28  
    29  	// omit specifies how much of the
    30  	// record should be omitted during
    31  	// a read of the BAM input.
    32  	omit int
    33  
    34  	lastChunk bgzf.Chunk
    35  
    36  	// sizeBuf and sizeStorage are used to read the block size of each record
    37  	// without having to allocate new storage and a slice everytime.
    38  	sizeStorage [4]byte
    39  	sizeBuf     []byte
    40  }
    41  
    42  const maxBAMRecordSize = 0xffffff
    43  
    44  // NewReader returns a new Reader using the given io.Reader
    45  // and setting the read concurrency to rd. If rd is zero
    46  // concurrency is set to GOMAXPROCS. The returned Reader
    47  // should be closed after use to avoid leaking resources.
    48  func NewReader(r io.Reader, rd int) (*Reader, error) {
    49  	bg, err := bgzf.NewReader(r, rd)
    50  	if err != nil {
    51  		return nil, err
    52  	}
    53  	h, _ := sam.NewHeader(nil, nil)
    54  	br := &Reader{
    55  		r: bg,
    56  		h: h,
    57  
    58  		references: int32(len(h.Refs())),
    59  	}
    60  	err = br.h.DecodeBinary(br.r)
    61  	if err != nil {
    62  		return nil, err
    63  	}
    64  	br.lastChunk.End = br.r.LastChunk().End
    65  	br.sizeBuf = br.sizeStorage[:]
    66  	return br, nil
    67  }
    68  
    69  // Header returns the SAM Header held by the Reader.
    70  func (br *Reader) Header() *sam.Header {
    71  	return br.h
    72  }
    73  
    74  // BAM record layout.
    75  type bamRecordFixed struct {
    76  	blockSize int32
    77  	refID     int32
    78  	pos       int32
    79  	nLen      uint8
    80  	mapQ      uint8
    81  	bin       uint16
    82  	nCigar    uint16
    83  	flags     sam.Flags
    84  	lSeq      int32
    85  	nextRefID int32
    86  	nextPos   int32
    87  	tLen      int32
    88  }
    89  
    90  var (
    91  	lenFieldSize      = binary.Size(bamRecordFixed{}.blockSize)
    92  	bamFixedRemainder = binary.Size(bamRecordFixed{}) - lenFieldSize
    93  )
    94  
    95  func vOffset(o bgzf.Offset) int64 {
    96  	return o.File<<16 | int64(o.Block)
    97  }
    98  
    99  // Omit specifies what portions of the Record to omit reading.
   100  // When o is None, a full sam.Record is returned by Read, when o
   101  // is AuxTags the auxiliary tag data is omitted and when o is
   102  // AllVariableLengthData, sequence, quality and auxiliary data
   103  // is omitted.
   104  func (br *Reader) Omit(o int) {
   105  	br.omit = o
   106  }
   107  
   108  // None, AuxTags and AllVariableLengthData are values taken
   109  // by the Reader Omit method.
   110  const (
   111  	None                  = iota // Omit no field data from the record.
   112  	AuxTags                      // Omit auxiliary tag data.
   113  	AllVariableLengthData        // Omit sequence, quality and auxiliary data.
   114  )
   115  
   116  // Read returns the next sam.Record in the BAM stream.
   117  //
   118  // The sam.Record returned will not contain the sequence, quality or
   119  // auxiliary tag data if Omit(AllVariableLengthData) has been called
   120  // prior to the Read call and will not contain the auxiliary tag data
   121  // is Omit(AuxTags) has been called.
   122  func (br *Reader) Read() (*sam.Record, error) {
   123  	if br.c != nil && vOffset(br.r.LastChunk().End) >= vOffset(br.c.End) {
   124  		return nil, io.EOF
   125  	}
   126  	// Use a pool of buffer's to share buffers between concurrent clients
   127  	// and hence reduce the number of allocations required.
   128  	buf := bufPool.Get().([]byte)
   129  	if err := readAlignment(br, &buf); err != nil {
   130  		bufPool.Put(buf)
   131  		return nil, err
   132  	}
   133  	rec, err := unmarshal(buf, br.h, br.omit)
   134  	bufPool.Put(buf)
   135  	return rec, err
   136  }
   137  
   138  // Unmarshal a serialized record.  Parameter omit is the value of Reader.Omit().
   139  // Most callers should pass zero as omit.
   140  func unmarshal(b []byte, header *sam.Header, omit int) (*sam.Record, error) {
   141  	rec := sam.GetFromFreePool()
   142  	if len(b) < 32 {
   143  		return nil, errors.New("bam: record too short")
   144  	}
   145  	// Need to use int(int32(uint32)) to ensure 2's complement extension of -1.
   146  	refID := int(int32(binary.LittleEndian.Uint32(b)))
   147  	rec.Pos = int(int32(binary.LittleEndian.Uint32(b[4:])))
   148  	nLen := int(b[8])
   149  	rec.MapQ = b[9]
   150  	nCigar := int(binary.LittleEndian.Uint16(b[12:]))
   151  	rec.Flags = sam.Flags(binary.LittleEndian.Uint16(b[14:]))
   152  	lSeq := int(binary.LittleEndian.Uint32(b[16:]))
   153  	nextRefID := int(int32(binary.LittleEndian.Uint32(b[20:])))
   154  	rec.MatePos = int(int32(binary.LittleEndian.Uint32(b[24:])))
   155  	rec.TempLen = int(int32(binary.LittleEndian.Uint32(b[28:])))
   156  
   157  	// Read variable length data.
   158  	pos := 32
   159  
   160  	blen := len(b) - pos
   161  	cigarOffset := alignOffset(blen)                     // store the cigar int32s here
   162  	auxOffset := alignOffset(cigarOffset + (nCigar * 4)) // store the AuxFields here
   163  
   164  	nDoubletBytes := (lSeq + 1) >> 1
   165  	bAuxOffset := pos + nLen + (nCigar * 4) + nDoubletBytes + lSeq
   166  	if len(b) < bAuxOffset {
   167  		return nil, fmt.Errorf("Corrupt BAM aux record: len(b)=%d, auxoffset=%d", len(b), bAuxOffset)
   168  	}
   169  	nAuxFields, err := countAuxFields(b[bAuxOffset:])
   170  	if err != nil {
   171  		return nil, err
   172  	}
   173  	shadowSize := auxOffset + (nAuxFields * sizeofSliceHeader)
   174  
   175  	// shadowBuf is used as an 'arena' from which all objects/slices
   176  	// required to store the result of parsing the bam alignment record.
   177  	// This reduces the load on GC and consequently allows for better
   178  	// scalability with the number of cores used by clients of this package.
   179  	shadowOffset := 0
   180  	resizeScratch(&rec.Scratch, shadowSize)
   181  	shadowBuf := rec.Scratch
   182  	copy(shadowBuf, b[pos:])
   183  
   184  	bufHdr := (*reflect.SliceHeader)(unsafe.Pointer(&shadowBuf))
   185  
   186  	// Note that rec.Name now points to the shadow buffer
   187  	hdr := (*reflect.StringHeader)(unsafe.Pointer(&rec.Name))
   188  	hdr.Data = uintptr(unsafe.Pointer(bufHdr.Data))
   189  	hdr.Len = nLen - 1 // drop trailing '\0'
   190  	shadowOffset += nLen
   191  
   192  	var sliceHdr *reflect.SliceHeader
   193  
   194  	if nCigar > 0 {
   195  		for i := 0; i < nCigar; i++ {
   196  			*(*uint32)(unsafe.Pointer(&shadowBuf[cigarOffset+(i*4)])) = binary.LittleEndian.Uint32(shadowBuf[shadowOffset+(i*4):])
   197  		}
   198  		sliceHdr = (*reflect.SliceHeader)(unsafe.Pointer(&rec.Cigar))
   199  		sliceHdr.Data = bufHdr.Data + uintptr(cigarOffset)
   200  		sliceHdr.Len = nCigar
   201  		sliceHdr.Cap = sliceHdr.Len
   202  		shadowOffset += nCigar * 4
   203  	} else {
   204  		sliceHdr = (*reflect.SliceHeader)(unsafe.Pointer(&rec.Cigar))
   205  		sliceHdr.Data = uintptr(0)
   206  		sliceHdr.Len = 0
   207  		sliceHdr.Cap = 0
   208  	}
   209  
   210  	if omit >= AllVariableLengthData {
   211  		goto done
   212  	}
   213  
   214  	rec.Seq.Length = lSeq
   215  
   216  	sliceHdr = (*reflect.SliceHeader)(unsafe.Pointer(&rec.Seq.Seq))
   217  	sliceHdr.Data = uintptr(unsafe.Pointer(bufHdr.Data + uintptr(shadowOffset)))
   218  	sliceHdr.Len = nDoubletBytes
   219  	sliceHdr.Cap = sliceHdr.Len
   220  	shadowOffset += nDoubletBytes
   221  
   222  	if omit >= AuxTags {
   223  		goto done
   224  	}
   225  
   226  	sliceHdr = (*reflect.SliceHeader)(unsafe.Pointer(&rec.Qual))
   227  	sliceHdr.Data = uintptr(unsafe.Pointer(bufHdr.Data + uintptr(shadowOffset)))
   228  	sliceHdr.Len = lSeq
   229  	sliceHdr.Cap = sliceHdr.Len
   230  
   231  	shadowOffset += lSeq
   232  
   233  	if nAuxFields > 0 {
   234  		// Clear the array before updating rec.AuxFields. GC will be
   235  		// confused otherwise.
   236  		for i := auxOffset; i < auxOffset+nAuxFields*sizeofSliceHeader; i++ {
   237  			shadowBuf[i] = 0
   238  		}
   239  		sliceHdr = (*reflect.SliceHeader)(unsafe.Pointer(&rec.AuxFields))
   240  		sliceHdr.Data = uintptr(unsafe.Pointer(bufHdr.Data + uintptr(auxOffset)))
   241  		sliceHdr.Len = nAuxFields
   242  		sliceHdr.Cap = sliceHdr.Len
   243  		parseAux(shadowBuf[shadowOffset:blen], rec.AuxFields)
   244  	}
   245  
   246  done:
   247  	refs := len(header.Refs())
   248  	if refID != -1 {
   249  		if refID < -1 || refID >= refs {
   250  			return nil, errors.New("bam: reference id out of range")
   251  		}
   252  		rec.Ref = header.Refs()[refID]
   253  	}
   254  	if nextRefID != -1 {
   255  		if refID == nextRefID {
   256  			rec.MateRef = rec.Ref
   257  			return rec, nil
   258  		}
   259  		if nextRefID < -1 || nextRefID >= refs {
   260  			return nil, errors.New("bam: mate reference id out of range")
   261  		}
   262  		rec.MateRef = header.Refs()[nextRefID]
   263  	}
   264  	return rec, nil
   265  }
   266  
   267  // SetCache sets the cache to be used by the Reader.
   268  func (bg *Reader) SetCache(c bgzf.Cache) {
   269  	bg.r.SetCache(c)
   270  }
   271  
   272  // Seek performs a seek to the specified bgzf.Offset.
   273  func (br *Reader) Seek(off bgzf.Offset) error {
   274  	return br.r.Seek(off)
   275  }
   276  
   277  // SetChunk sets a limited range of the underlying BGZF file to read, after
   278  // seeking to the start of the given chunk. It may be used to iterate over
   279  // a defined genomic interval.
   280  func (br *Reader) SetChunk(c *bgzf.Chunk) error {
   281  	if c != nil {
   282  		err := br.r.Seek(c.Begin)
   283  		if err != nil {
   284  			return err
   285  		}
   286  	}
   287  	br.c = c
   288  	return nil
   289  }
   290  
   291  // LastChunk returns the bgzf.Chunk corresponding to the last Read operation.
   292  // The bgzf.Chunk returned is only valid if the last Read operation returned a
   293  // nil error.
   294  func (br *Reader) LastChunk() bgzf.Chunk {
   295  	return br.lastChunk
   296  }
   297  
   298  // Close closes the Reader.
   299  func (br *Reader) Close() error {
   300  	return br.r.Close()
   301  }
   302  
   303  // Iterator wraps a Reader to provide a convenient loop interface for reading BAM data.
   304  // Successive calls to the Next method will step through the features of the provided
   305  // Reader. Iteration stops unrecoverably at EOF or the first error.
   306  type Iterator struct {
   307  	r *Reader
   308  
   309  	chunks []bgzf.Chunk
   310  
   311  	rec *sam.Record
   312  	err error
   313  }
   314  
   315  // NewIterator returns a Iterator to read from r, limiting the reads to the provided
   316  // chunks.
   317  //
   318  //  chunks, err := idx.Chunks(ref, beg, end)
   319  //  if err != nil {
   320  //  	return err
   321  //  }
   322  //  i, err := NewIterator(r, chunks)
   323  //  if err != nil {
   324  //  	return err
   325  //  }
   326  //  for i.Next() {
   327  //  	fn(i.Record())
   328  //  }
   329  //  return i.Close()
   330  //
   331  func NewIterator(r *Reader, chunks []bgzf.Chunk) (*Iterator, error) {
   332  	if len(chunks) == 0 {
   333  		return &Iterator{r: r, err: io.EOF}, nil
   334  	}
   335  	err := r.SetChunk(&chunks[0])
   336  	if err != nil {
   337  		return nil, err
   338  	}
   339  	chunks = chunks[1:]
   340  	return &Iterator{r: r, chunks: chunks}, nil
   341  }
   342  
   343  // Next advances the Iterator past the next record, which will then be available through
   344  // the Record method. It returns false when the iteration stops, either by reaching the end of the
   345  // input or an error. After Next returns false, the Error method will return any error that
   346  // occurred during iteration, except that if it was io.EOF, Error will return nil.
   347  func (i *Iterator) Next() bool {
   348  	if i.err != nil {
   349  		return false
   350  	}
   351  	i.rec, i.err = i.r.Read()
   352  	if len(i.chunks) != 0 && i.err == io.EOF {
   353  		i.err = i.r.SetChunk(&i.chunks[0])
   354  		i.chunks = i.chunks[1:]
   355  		return i.Next()
   356  	}
   357  	return i.err == nil
   358  }
   359  
   360  // Error returns the first non-EOF error that was encountered by the Iterator.
   361  func (i *Iterator) Error() error {
   362  	if i.err == io.EOF {
   363  		return nil
   364  	}
   365  	return i.err
   366  }
   367  
   368  // Record returns the most recent record read by a call to Next.
   369  func (i *Iterator) Record() *sam.Record { return i.rec }
   370  
   371  // Close releases the underlying Reader.
   372  func (i *Iterator) Close() error {
   373  	i.r.SetChunk(nil)
   374  	return i.Error()
   375  }
   376  
   377  var jumps = [256]int{
   378  	'A': 1,
   379  	'c': 1, 'C': 1,
   380  	's': 2, 'S': 2,
   381  	'i': 4, 'I': 4,
   382  	'f': 4,
   383  	'Z': -1,
   384  	'H': -1,
   385  	'B': -1,
   386  }
   387  
   388  var errCorruptAuxField = errors.New("Corrupt aux field")
   389  
   390  // countAuxFields examines the data of a SAM record's OPT field to determine
   391  // the number of auxFields there are.
   392  func countAuxFields(aux []byte) (int, error) {
   393  	naux := 0
   394  	for i := 0; i+2 < len(aux); {
   395  		t := aux[i+2]
   396  		switch j := jumps[t]; {
   397  		case j > 0:
   398  			j += 3
   399  			i += j
   400  			naux++
   401  		case j < 0:
   402  			switch t {
   403  			case 'Z', 'H':
   404  				var (
   405  					j int
   406  					v byte
   407  				)
   408  				for j, v = range aux[i:] {
   409  					if v == 0 { // C string termination
   410  						break // Truncate terminal zero.
   411  					}
   412  				}
   413  				i += j + 1
   414  				naux++
   415  			case 'B':
   416  				if len(aux) < i+8 {
   417  					return -1, errCorruptAuxField
   418  				}
   419  				length := binary.LittleEndian.Uint32(aux[i+4 : i+8])
   420  				j = int(length)*jumps[aux[i+3]] + int(unsafe.Sizeof(length)) + 4
   421  				i += j
   422  				naux++
   423  			}
   424  		default:
   425  			return -1, errCorruptAuxField
   426  		}
   427  	}
   428  	return naux, nil
   429  }
   430  
   431  // parseAux examines the data of a SAM record's OPT fields,
   432  // returning a slice of sam.Aux that are backed by the original data.
   433  func parseAux(aux []byte, aa []sam.Aux) {
   434  	naa := 0
   435  	/*	var sliceHdr *reflect.SliceHeader
   436  		auxSlice := (*reflect.SliceHeader)(unsafe.Pointer(&aux))*/
   437  	for i := 0; i+2 < len(aux); {
   438  		t := aux[i+2]
   439  		switch j := jumps[t]; {
   440  		case j > 0:
   441  			j += 3
   442  			aa[naa] = sam.Aux(aux[i : i+j : i+j])
   443  			naa++
   444  			i += j
   445  		case j < 0:
   446  			switch t {
   447  			case 'Z', 'H':
   448  				var (
   449  					j int
   450  					v byte
   451  				)
   452  				for j, v = range aux[i:] {
   453  					if v == 0 { // C string termination
   454  						break // Truncate terminal zero.
   455  					}
   456  				}
   457  				aa[naa] = sam.Aux(aux[i : i+j : i+j])
   458  				naa++
   459  				i += j + 1
   460  			case 'B':
   461  				length := binary.LittleEndian.Uint32(aux[i+4 : i+8])
   462  				j = int(length)*jumps[aux[i+3]] + int(unsafe.Sizeof(length)) + 4
   463  				aa[naa] = sam.Aux(aux[i : i+j : i+j])
   464  				naa++
   465  				i += j
   466  			}
   467  		default:
   468  			panic(fmt.Sprintf("bam: unrecognised optional field type: %q", t))
   469  		}
   470  	}
   471  }
   472  
   473  // readAlignment reads the alignment record from the Reader's underlying
   474  // bgzf.Reader into the supplied bytes.Buffer and updates the Reader's lastChunk
   475  // field.
   476  func readAlignment(br *Reader, buf *[]byte) error {
   477  	n, err := io.ReadFull(br.r, br.sizeBuf)
   478  	// br.r.Chunk() is only valid after the call the Read(), so this
   479  	// must come after the first read in the record.
   480  	tx := br.r.Begin()
   481  	defer func() {
   482  		br.lastChunk = tx.End()
   483  	}()
   484  	if err != nil {
   485  		return err
   486  	}
   487  	if n != 4 {
   488  		return errors.New("bam: invalid record: short block size")
   489  	}
   490  	size := int(binary.LittleEndian.Uint32(br.sizeBuf))
   491  	if size > maxBAMRecordSize {
   492  		return errors.New("bam: record too large")
   493  	}
   494  	resizeScratch(buf, size)
   495  	nn, err := io.ReadFull(br.r, *buf)
   496  	if err != nil {
   497  		return err
   498  	}
   499  	if nn != size {
   500  		return errors.New("bam: truncated record")
   501  	}
   502  	return nil
   503  }
   504  
   505  // buildAux constructs a single byte slice that represents a slice of sam.Aux.
   506  // *buf should be an empty slice on call, and it is filled with the result on
   507  // return.
   508  func buildAux(aa []sam.Aux, buf *[]byte) {
   509  	for _, a := range aa {
   510  		// TODO: validate each 'a'
   511  		*buf = append(*buf, []byte(a)...)
   512  		switch a.Type() {
   513  		case 'Z', 'H':
   514  			*buf = append(*buf, 0)
   515  		}
   516  	}
   517  }
   518  
   519  type doublets []sam.Doublet
   520  
   521  func (np doublets) Bytes() []byte { return *(*[]byte)(unsafe.Pointer(&np)) }