github.com/Schaudge/hts@v0.0.0-20240223063651-737b4d69d68c/sam/record.go (about)

     1  // Copyright ©2012-2013 The bíogo Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package sam
     6  
     7  import (
     8  	"bytes"
     9  	"errors"
    10  	"fmt"
    11  	"strconv"
    12  	"unsafe"
    13  
    14  	"github.com/Schaudge/grailbase/simd"
    15  	"github.com/Schaudge/grailbio/biosimd"
    16  	"github.com/Schaudge/hts/internal"
    17  )
    18  
    19  // DupType enumerates the different possible values for the Duplicate
    20  // Type (DT) aux tag.
    21  type DupType int
    22  
    23  const (
    24  	// DupTypeNone specifies duplicate type not present.
    25  	DupTypeNone DupType = iota
    26  
    27  	// DupTypeLB specifies "library" or PCR duplicate type.
    28  	DupTypeLB
    29  
    30  	// DupTypeSQ specifies "sequencer" or optical duplicate type.
    31  	DupTypeSQ
    32  )
    33  
    34  // LinearDupState enumerates the different possible Linear Duplicate
    35  // states for a read pair, stored in the LD aux tag.
    36  type LinearDupState int
    37  
    38  const (
    39  	// LinearNone specifies linear dup tag not present.
    40  	LinearNone = iota
    41  
    42  	// LinearPrimary specifies linear primary.
    43  	LinearPrimary
    44  
    45  	// LinearDuplicate specifies linear duplicate.
    46  	LinearDuplicate
    47  )
    48  
    49  // Record represents a SAM/BAM record.
    50  type Record struct {
    51  	Name      string
    52  	Ref       *Reference
    53  	Pos       int
    54  	MapQ      byte
    55  	Cigar     Cigar
    56  	Flags     Flags
    57  	MateRef   *Reference
    58  	MatePos   int
    59  	TempLen   int
    60  	Seq       Seq
    61  	Qual      []byte
    62  	AuxFields AuxFields
    63  
    64  	Scratch []byte
    65  }
    66  
    67  // NewRecord returns a Record, checking for consistency of the provided
    68  // attributes.
    69  func NewRecord(name string, ref, mRef *Reference, p, mPos, tLen int, mapQ byte, co []CigarOp, seq, qual []byte, aux []Aux) (*Record, error) {
    70  	if !(validPos(p) && validPos(mPos) && validTmpltLen(tLen) && validLen(len(seq)) && (qual == nil || validLen(len(qual)))) {
    71  		return nil, errors.New("sam: value out of range")
    72  	}
    73  	if len(name) == 0 || len(name) > 254 {
    74  		return nil, errors.New("sam: name absent or too long")
    75  	}
    76  	if qual != nil && len(qual) != len(seq) {
    77  		return nil, errors.New("sam: sequence/quality length mismatch")
    78  	}
    79  	if ref != nil {
    80  		if ref.id < 0 {
    81  			return nil, errors.New("sam: linking to invalid reference")
    82  		}
    83  	} else {
    84  		if p != -1 {
    85  			return nil, errors.New("sam: specified position != -1 without reference")
    86  		}
    87  	}
    88  	if mRef != nil {
    89  		if mRef.id < 0 {
    90  			return nil, errors.New("sam: linking to invalid mate reference")
    91  		}
    92  	} else {
    93  		if mPos != -1 {
    94  			return nil, errors.New("sam: specified mate position != -1 without mate reference")
    95  		}
    96  	}
    97  	r := GetFromFreePool()
    98  	r.Name = name
    99  	r.Ref = ref
   100  	r.Pos = p
   101  	r.MapQ = mapQ
   102  	r.Cigar = co
   103  	r.Flags = 0
   104  	r.MateRef = mRef
   105  	r.MatePos = mPos
   106  	r.TempLen = tLen
   107  	r.Seq = NewSeq(seq)
   108  	r.Qual = qual
   109  	r.AuxFields = aux
   110  	return r, nil
   111  }
   112  
   113  // IsValidRecord returns whether the record satisfies the conditions that
   114  // it has the Unmapped flag set if it not placed; that the MateUnmapped
   115  // flag is set if it paired its mate is unplaced; that the CIGAR length
   116  // matches the sequence and quality string lengths if they are non-zero; and
   117  // that the Paired, ProperPair, Unmapped and MateUnmapped flags are consistent.
   118  func IsValidRecord(r *Record) bool {
   119  	if (r.Ref == nil || r.Pos == -1) && r.Flags&Unmapped == 0 {
   120  		return false
   121  	}
   122  	if r.Flags&Paired != 0 && (r.MateRef == nil || r.MatePos == -1) && r.Flags&MateUnmapped == 0 {
   123  		return false
   124  	}
   125  	if r.Flags&(Unmapped|ProperPair) == Unmapped|ProperPair {
   126  		return false
   127  	}
   128  	if r.Flags&(Paired|MateUnmapped|ProperPair) == Paired|MateUnmapped|ProperPair {
   129  		return false
   130  	}
   131  	if len(r.Qual) != 0 && r.Seq.Length != len(r.Qual) {
   132  		return false
   133  	}
   134  	if cigarLen := r.Len(); cigarLen < 0 || (r.Seq.Length != 0 && r.Seq.Length != cigarLen) {
   135  		return false
   136  	}
   137  	return true
   138  }
   139  
   140  // Tag returns an Aux tag whose tag ID matches the first two bytes of tag and true.
   141  // If no tag matches, nil and false are returned.
   142  func (r *Record) Tag(tag []byte) (v Aux, ok bool) {
   143  	if len(tag) < 2 {
   144  		panic("sam: tag too short")
   145  	}
   146  	for _, aux := range r.AuxFields {
   147  		if aux.matches(tag) {
   148  			return aux, true
   149  		}
   150  	}
   151  	return nil, false
   152  }
   153  
   154  // RefID returns the reference ID for the Record.
   155  func (r *Record) RefID() int {
   156  	return r.Ref.ID()
   157  }
   158  
   159  // Start returns the lower-coordinate end of the alignment.
   160  func (r *Record) Start() int {
   161  	return r.Pos
   162  }
   163  
   164  // Bin returns the BAM index bin of the record.
   165  func (r *Record) Bin() int {
   166  	if r.Flags&(Unmapped|MateUnmapped) == Unmapped|MateUnmapped {
   167  		return 4680 // reg2bin(-1, 0)
   168  	}
   169  	end := r.End()
   170  
   171  	// If the alignment length is zero (for example, if the read is
   172  	// unmapped), then increment end by 1 and treat the read as length
   173  	// 1 for binning purposes.
   174  	if end == r.Pos {
   175  		end++
   176  	}
   177  
   178  	if !internal.IsValidIndexPos(r.Pos) || !internal.IsValidIndexPos(end) {
   179  		return -1
   180  	}
   181  	return int(internal.BinFor(r.Pos, end))
   182  }
   183  
   184  // Len returns the length of the alignment.
   185  func (r *Record) Len() int {
   186  	return r.End() - r.Start()
   187  }
   188  
   189  func max(a, b int) int {
   190  	if a < b {
   191  		return b
   192  	}
   193  	return a
   194  }
   195  
   196  // End returns the highest query-consuming coordinate end of the alignment.
   197  // The position returned by End is not valid if r.Cigar.IsValid(r.Seq.Length)
   198  // is false.
   199  func (r *Record) End() int {
   200  	if r.Flags&Unmapped != 0 || len(r.Cigar) == 0 {
   201  		return r.Pos + 1
   202  	}
   203  	pos := r.Pos
   204  	end := pos
   205  	for _, co := range r.Cigar {
   206  		pos += co.Len() * co.Type().Consumes().Reference
   207  		end = max(end, pos)
   208  	}
   209  	return end
   210  }
   211  
   212  // Strand returns an int8 indicating the strand of the alignment. A positive return indicates
   213  // alignment in the forward orientation, a negative returns indicates alignment in the reverse
   214  // orientation.
   215  func (r *Record) Strand() int8 {
   216  	if r.Flags&Reverse == Reverse {
   217  		return -1
   218  	}
   219  	return 1
   220  }
   221  
   222  // LessByName returns true if the receiver sorts by record name before other.
   223  func (r *Record) LessByName(other *Record) bool {
   224  	return r.Name < other.Name
   225  }
   226  
   227  // LessByCoordinate returns true if the receiver sorts by coordinate before other
   228  // according to the SAM specification.
   229  func (r *Record) LessByCoordinate(other *Record) bool {
   230  	rRefName := r.Ref.Name()
   231  	oRefName := other.Ref.Name()
   232  	switch {
   233  	case oRefName == "*":
   234  		return true
   235  	case rRefName == "*":
   236  		return false
   237  	}
   238  	return (rRefName < oRefName) || (rRefName == oRefName && r.Pos < other.Pos)
   239  }
   240  
   241  // BagID returns the bag id (given by aux tag "DI") for r. If the DI
   242  // tag is not set, returns (-1, nil). If the tag is present, but malformed,
   243  // returns (-1, err).
   244  func (r *Record) BagID() (int64, error) {
   245  	val, found, err := r.auxInt64Value(bagIDTag)
   246  	if found && val < 0 {
   247  		return -1, fmt.Errorf("bag id: expected bag id >= 0, not %d", val)
   248  	}
   249  	return val, err
   250  }
   251  
   252  // BagSize returns the size of the bag as defined in the "DS" aux
   253  // tag. For a description of the DS tag, please see
   254  // bio-mark-duplicates. If the aux tag is not present, returns (-1,
   255  // nil). If the aux tag is malformed, returns (-1, err).
   256  func (r *Record) BagSize() (int, error) {
   257  	val, found, err := r.auxIntValue(bagSizeTag)
   258  	if found && val <= 0 {
   259  		return -1, fmt.Errorf("bag size: expected bag size >= 1, not %d", val)
   260  	}
   261  	return val, err
   262  }
   263  
   264  // DupType returns (DupTypeSQ, nil) if r has the DT tag, and its value
   265  // is "SQ" (optical). If the DT tag is present, and its value is "LB"
   266  // (PCR), then returns (DupTypeLB, nil). If the DT tag is not present,
   267  // then returns (DupTypeNone, nil). If the aux value is malformed,
   268  // then returns (DupTypeNone, err).
   269  func (r *Record) DupType() (DupType, error) {
   270  	aux, err := r.AuxFields.GetUnique(dupTypeTag)
   271  	if err != nil || aux == nil {
   272  		return DupTypeNone, err
   273  	}
   274  
   275  	s, ok := aux.Value().(string)
   276  	if !ok {
   277  		return DupTypeNone, fmt.Errorf("optical dup: unexpected type: %s", aux.String())
   278  	}
   279  
   280  	switch s {
   281  	case "SQ":
   282  		return DupTypeSQ, nil
   283  	case "LB":
   284  		return DupTypeLB, nil
   285  	}
   286  	return DupTypeNone, fmt.Errorf("optical dup: unexpected value: %s", aux.String())
   287  }
   288  
   289  // LibraryBagSize returns the number of library duplicate fragments in the bag of the given
   290  // record, as defined by the DL tag. For a description of the DL tag and how it relates to
   291  // the DS tag, please see bio-mark-duplicates. If the DL tag is not present (e.g., earlier
   292  // pipeline versions or read pairs with an unmapped read), -1 will be returned without an
   293  // error. If the DL tag is malformed, an error will be returned.
   294  func (r *Record) LibraryBagSize() (int, error) {
   295  	val, found, err := r.auxIntValue(libraryBagSizeTag)
   296  	if found && val < 1 {
   297  		return -1, fmt.Errorf("%s: expected value >= 1, not %d", libraryBagSizeTag, val)
   298  	}
   299  	return val, err
   300  }
   301  
   302  // LinearDup returns the linear duplicate state of the record as
   303  // specified in the LD aux tag. If the LD tag is not present (e.g.,
   304  // earlier pipeline versions or read pairs with an unmapped read, or
   305  // mapq below threshold), returns (LinearNone, nil). If the LD
   306  // tag is present, but has an invalid value, then returns
   307  // (LinearNone, err). Otherwise, returns (LinearPrimary,
   308  // nil) or (LinearDuplicate, nil) depending on the value of the
   309  // LD tag.
   310  func (r *Record) LinearDup() (LinearDupState, error) {
   311  	aux, err := r.AuxFields.GetUnique(linearDupTag)
   312  	if err != nil || aux == nil {
   313  		return LinearNone, err
   314  	}
   315  
   316  	s, ok := aux.Value().(string)
   317  	if !ok {
   318  		return LinearNone, fmt.Errorf("linear dup: unexpected type: %s", aux.String())
   319  	}
   320  	switch s {
   321  	case "primary":
   322  		return LinearPrimary, nil
   323  	case "duplicate":
   324  		return LinearDuplicate, nil
   325  	}
   326  	return LinearNone, fmt.Errorf("linear dup: unexpected value: %s", aux.String())
   327  }
   328  
   329  // LinearBagID returns the linear bag id (given by aux tag "LI") for r. If the LI
   330  // tag is not set, returns (-1, nil). If the tag is present, but malformed,
   331  // returns (-1, err).
   332  func (r *Record) LinearBagID() (int64, error) {
   333  	val, found, err := r.auxInt64Value(linearBagIDTag)
   334  	if found && val < 0 {
   335  		return -1, fmt.Errorf("linear bag id: expected bag id >= 0, not %d", val)
   336  	}
   337  	return val, err
   338  }
   339  
   340  // LinearBagSize returns the size of the linear bag as defined in the "LS" aux
   341  // tag. If the aux tag is not present, returns (-1, nil). If the aux
   342  // tag is malformed, returns (-1, err).
   343  func (r *Record) LinearBagSize() (int, error) {
   344  	val, found, err := r.auxIntValue(linearBagSizeTag)
   345  	if found && val <= 0 {
   346  		return -1, fmt.Errorf("linear bag size: expected bag size >= 1, not %d", val)
   347  	}
   348  	return val, err
   349  }
   350  
   351  // auxIntValue finds the unique specified aux tag. If there is an
   352  // error while looking for the aux tag or the type is not an int,
   353  // return (-1, false, err). If the aux tag is not present, return (-1,
   354  // false, nil). If the aux tag is found, and it is an integer type,
   355  // then return (value, true, nil).
   356  func (r *Record) auxIntValue(tag Tag) (val int, found bool, err error) {
   357  	aux, err := r.AuxFields.GetUnique(tag)
   358  	if err != nil || aux == nil {
   359  		return -1, false, err
   360  	}
   361  
   362  	switch v := aux.Value().(type) {
   363  	case uint8:
   364  		val = int(v)
   365  	case int8:
   366  		val = int(v)
   367  	case int16:
   368  		val = int(v)
   369  	case uint16:
   370  		val = int(v)
   371  	case int32:
   372  		val = int(v)
   373  	default:
   374  		return -1, false, fmt.Errorf("%s: unexpected type: %T", tag, v)
   375  	}
   376  	return val, true, nil
   377  }
   378  
   379  // auxInt64Value finds the unique specified aux tag. It is like
   380  // auxIntValue, but returns an int64 and also converts a string type
   381  // aux tag if it can be parsed as an integer.
   382  //
   383  // If there is an error while looking for the aux tag or the type is
   384  // not an int, return (-1, false, err). If the aux tag is not present,
   385  // return (-1, false, nil). If the aux tag is found, and it is an
   386  // integer type, then return (value, true, nil).
   387  func (r *Record) auxInt64Value(tag Tag) (val int64, found bool, err error) {
   388  	aux, err := r.AuxFields.GetUnique(tag)
   389  	if err != nil || aux == nil {
   390  		return -1, false, err
   391  	}
   392  
   393  	switch v := aux.Value().(type) {
   394  	case uint8:
   395  		val = int64(v)
   396  	case int8:
   397  		val = int64(v)
   398  	case int16:
   399  		val = int64(v)
   400  	case uint16:
   401  		val = int64(v)
   402  	case int32:
   403  		val = int64(v)
   404  	case string:
   405  		val, err = strconv.ParseInt(v, 10, 64)
   406  		if err != nil {
   407  			return -1, false, err
   408  		}
   409  	default:
   410  		return -1, false, fmt.Errorf("%s: unexpected type: %T", tag, v)
   411  	}
   412  	return val, true, nil
   413  }
   414  
   415  // String returns a string representation of the Record.
   416  func (r *Record) String() string {
   417  	end := r.End()
   418  	return fmt.Sprintf("%s %v %v %d %s:%d..%d (%d) %d %s:%d %d %s %v %v",
   419  		r.Name,
   420  		r.Flags,
   421  		r.Cigar,
   422  		r.MapQ,
   423  		r.Ref.Name(),
   424  		r.Pos,
   425  		end,
   426  		r.Bin(),
   427  		end-r.Pos,
   428  		r.MateRef.Name(),
   429  		r.MatePos,
   430  		r.TempLen,
   431  		r.Seq.Expand(),
   432  		r.Qual,
   433  		r.AuxFields,
   434  	)
   435  }
   436  
   437  // UnmarshalText implements the encoding.TextUnmarshaler. It calls UnmarshalSAM with
   438  // a nil Header.
   439  func (r *Record) UnmarshalText(b []byte) error {
   440  	return r.UnmarshalSAM(nil, b)
   441  }
   442  
   443  // UnmarshalSAM parses a SAM format alignment line in the provided []byte, using
   444  // references from the provided Header. If a nil Header is passed to UnmarshalSAM
   445  // and the SAM data include non-empty refence and mate reference names, fake
   446  // references with zero length and an ID of -1 are created to hold the reference
   447  // names.
   448  func (r *Record) UnmarshalSAM(h *Header, b []byte) error {
   449  	f := bytes.Split(b, []byte{'\t'})
   450  	if len(f) < 11 {
   451  		return errors.New("sam: missing SAM fields")
   452  	}
   453  	*r = Record{Name: string(f[0])}
   454  	// TODO(kortschak): Consider parsing string format flags.
   455  	flags, err := strconv.ParseUint(string(f[1]), 0, 16)
   456  	if err != nil {
   457  		return fmt.Errorf("sam: failed to parse flags: %v", err)
   458  	}
   459  	r.Flags = Flags(flags)
   460  	r.Ref, err = referenceForName(h, string(f[2]))
   461  	if err != nil {
   462  		return fmt.Errorf("sam: failed to assign reference: %v", err)
   463  	}
   464  	r.Pos, err = strconv.Atoi(string(f[3]))
   465  	r.Pos--
   466  	if err != nil {
   467  		return fmt.Errorf("sam: failed to parse position: %v", err)
   468  	}
   469  	mapQ, err := strconv.ParseUint(string(f[4]), 10, 8)
   470  	if err != nil {
   471  		return fmt.Errorf("sam: failed to parse map quality: %v", err)
   472  	}
   473  	r.MapQ = byte(mapQ)
   474  	r.Cigar, err = ParseCigar(f[5])
   475  	if err != nil {
   476  		return fmt.Errorf("sam: failed to parse cigar string: %v", err)
   477  	}
   478  	if bytes.Equal(f[2], f[6]) || bytes.Equal(f[6], []byte{'='}) {
   479  		r.MateRef = r.Ref
   480  	} else {
   481  		r.MateRef, err = referenceForName(h, string(f[6]))
   482  		if err != nil {
   483  			return fmt.Errorf("sam: failed to assign mate reference: %v", err)
   484  		}
   485  	}
   486  	r.MatePos, err = strconv.Atoi(string(f[7]))
   487  	r.MatePos--
   488  	if err != nil {
   489  		return fmt.Errorf("sam: failed to parse mate position: %v", err)
   490  	}
   491  	r.TempLen, err = strconv.Atoi(string(f[8]))
   492  	if err != nil {
   493  		return fmt.Errorf("sam: failed to parse template length: %v", err)
   494  	}
   495  	if !bytes.Equal(f[9], []byte{'*'}) {
   496  		r.Seq = NewSeq(f[9])
   497  		if len(r.Cigar) != 0 && !r.Cigar.IsValid(r.Seq.Length) {
   498  			return errors.New("sam: sequence/CIGAR length mismatch")
   499  		}
   500  	}
   501  	if !bytes.Equal(f[10], []byte{'*'}) {
   502  		r.Qual = append(r.Qual, f[10]...)
   503  		simd.AddConst8Inplace(r.Qual, 256-33)
   504  	} else if r.Seq.Length != 0 {
   505  		r.Qual = make([]byte, r.Seq.Length)
   506  		simd.Memset8(r.Qual, 0xff)
   507  	}
   508  	if len(r.Qual) != 0 && len(r.Qual) != r.Seq.Length {
   509  		return errors.New("sam: sequence/quality length mismatch")
   510  	}
   511  	if len(f) > 11 {
   512  		r.AuxFields = make([]Aux, len(f)-11)
   513  		for i, aux := range f[11:] {
   514  			a, err := ParseAux(aux)
   515  			if err != nil {
   516  				return err
   517  			}
   518  			r.AuxFields[i] = a
   519  		}
   520  	}
   521  	return nil
   522  }
   523  
   524  func referenceForName(h *Header, name string) (*Reference, error) {
   525  	if name == "*" {
   526  		return nil, nil
   527  	}
   528  	if h == nil {
   529  		// If we don't have a Header, return a fake Reference.
   530  		return &Reference{
   531  			id:   -1,
   532  			name: name,
   533  		}, nil
   534  	}
   535  
   536  	for _, r := range h.refs {
   537  		if r.Name() == name {
   538  			return r, nil
   539  		}
   540  	}
   541  	return nil, fmt.Errorf("no reference with name %q", name)
   542  }
   543  
   544  // MarshalText implements encoding.TextMarshaler. It calls MarshalSAM with FlagDecimal.
   545  func (r *Record) MarshalText() ([]byte, error) {
   546  	return r.MarshalSAM(0)
   547  }
   548  
   549  // MarshalSAM formats a Record as SAM using the specified flag format. Acceptable
   550  // formats are FlagDecimal, FlagHex and FlagString.
   551  func (r *Record) MarshalSAM(flags int) ([]byte, error) {
   552  	if flags < FlagDecimal || flags > FlagString {
   553  		return nil, errors.New("sam: flag format option out of range")
   554  	}
   555  	if r.Qual != nil && len(r.Qual) != r.Seq.Length {
   556  		return nil, errors.New("sam: sequence/quality length mismatch")
   557  	}
   558  	var buf bytes.Buffer
   559  	fmt.Fprintf(&buf, "%s\t%v\t%s\t%d\t%d\t%s\t%s\t%d\t%d\t%s\t%s",
   560  		r.Name,
   561  		formatFlags(r.Flags, flags),
   562  		r.Ref.Name(),
   563  		r.Pos+1,
   564  		r.MapQ,
   565  		r.Cigar,
   566  		formatMate(r.Ref, r.MateRef),
   567  		r.MatePos+1,
   568  		r.TempLen,
   569  		formatSeq(r.Seq),
   570  		formatQual(r.Qual),
   571  	)
   572  	for _, t := range r.AuxFields {
   573  		fmt.Fprintf(&buf, "\t%v", samAux(t))
   574  	}
   575  	return buf.Bytes(), nil
   576  }
   577  
   578  // Flag format constants.
   579  const (
   580  	FlagDecimal = iota
   581  	FlagHex
   582  	FlagString
   583  )
   584  
   585  func formatFlags(f Flags, format int) interface{} {
   586  	switch format {
   587  	case FlagDecimal:
   588  		return uint16(f)
   589  	case FlagHex:
   590  		return fmt.Sprintf("0x%x", f)
   591  	case FlagString:
   592  		// If 0x01 is unset, no assumptions can be made about 0x02, 0x08, 0x20, 0x40 and 0x80
   593  		const pairedMask = ProperPair | MateUnmapped | MateReverse | MateReverse | Read1 | Read2
   594  		if f&1 == 0 {
   595  			f &^= pairedMask
   596  		}
   597  
   598  		const flags = "pPuUrR12sfdS"
   599  
   600  		b := make([]byte, 0, len(flags))
   601  		for i, c := range flags {
   602  			if f&(1<<uint(i)) != 0 {
   603  				b = append(b, byte(c))
   604  			}
   605  		}
   606  
   607  		return string(b)
   608  	default:
   609  		panic("sam: invalid flag format")
   610  	}
   611  }
   612  
   613  func formatMate(ref, mate *Reference) string {
   614  	if mate != nil && ref == mate {
   615  		return "="
   616  	}
   617  	return mate.Name()
   618  }
   619  
   620  func formatSeq(s Seq) []byte {
   621  	if s.Length == 0 {
   622  		return []byte{'*'}
   623  	}
   624  	return s.Expand()
   625  }
   626  
   627  func formatQual(q []byte) []byte {
   628  	for _, v := range q {
   629  		if v != 0xff {
   630  			a := make([]byte, len(q))
   631  			simd.AddConst8(a, q, 33)
   632  			return a
   633  		}
   634  	}
   635  	return []byte{'*'}
   636  }
   637  
   638  // Doublet is a nybble-encode pair of nucleotide bases.
   639  type Doublet byte
   640  
   641  // Seq is a nybble-encode pair of nucleotide sequence.
   642  type Seq struct {
   643  	Length int
   644  	Seq    []Doublet
   645  }
   646  
   647  var (
   648  	n16TableRev = simd.MakeNibbleLookupTable([16]byte{'=', 'A', 'C', 'M', 'G', 'R', 'S', 'V', 'T', 'W', 'Y', 'H', 'K', 'D', 'B', 'N'})
   649  	n16Table    = [256]Doublet{
   650  		0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf,
   651  		0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf,
   652  		0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf,
   653  		0x1, 0x2, 0x4, 0x8, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0x0, 0xf, 0xf,
   654  		0xf, 0x1, 0xe, 0x2, 0xd, 0xf, 0xf, 0x4, 0xb, 0xf, 0xf, 0xc, 0xf, 0x3, 0xf, 0xf,
   655  		0xf, 0xf, 0x5, 0x6, 0x8, 0xf, 0x7, 0x9, 0xf, 0xa, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf,
   656  		0xf, 0x1, 0xe, 0x2, 0xd, 0xf, 0xf, 0x4, 0xb, 0xf, 0xf, 0xc, 0xf, 0x3, 0xf, 0xf,
   657  		0xf, 0xf, 0x5, 0x6, 0x8, 0xf, 0x7, 0x9, 0xf, 0xa, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf,
   658  		0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf,
   659  		0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf,
   660  		0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf,
   661  		0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf,
   662  		0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf,
   663  		0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf,
   664  		0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf,
   665  		0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf,
   666  	}
   667  )
   668  
   669  // NewSeq returns a new Seq based on the given byte slice.
   670  func NewSeq(s []byte) Seq {
   671  	return Seq{
   672  		Length: len(s),
   673  		Seq:    contract(s),
   674  	}
   675  }
   676  
   677  func contract(s []byte) []Doublet {
   678  	ns := make([]Doublet, (len(s)+1)>>1)
   679  	var np Doublet
   680  	for i, b := range s {
   681  		if i&1 == 0 {
   682  			np = n16Table[b] << 4
   683  		} else {
   684  			ns[i>>1] = np | n16Table[b]
   685  		}
   686  	}
   687  	// We haven't written the last base if the
   688  	// sequence was odd length, so do that now.
   689  	if len(s)&1 != 0 {
   690  		ns[len(ns)-1] = np
   691  	}
   692  	return ns
   693  }
   694  
   695  // Expand returns the byte encoded form of the receiver.
   696  //
   697  // This now has decent performance for ns.Length >= 32 (allocation is now the
   698  // main bottleneck in that case), but it should still be avoided in new code.
   699  // Base/BaseChar() is better if you are just performing a small number of point
   700  // queries.  Direct calls to biosimd.UnpackSeq{Unsafe} or
   701  // UnpackAndReplaceSeq{Unsafe}, which populate preallocated buffers, are better
   702  // when you are iterating through many bases.  (The main advantage of the
   703  // Unsafe functions is great performance for length < 32.)
   704  func (ns Seq) Expand() []byte {
   705  	s := make([]byte, ns.Length)
   706  	nsSeqPtr := (*[]byte)(unsafe.Pointer(&ns.Seq))
   707  	biosimd.UnpackAndReplaceSeq(s, *nsSeqPtr, &n16TableRev)
   708  	return s
   709  }
   710  
   711  // SeqBase is BAM's 4-bit encoding of nucleotide base types. See section 4.2 of
   712  // https://samtools.github.io/hts-specs/SAMv1.pdf
   713  type SeqBase byte
   714  
   715  const (
   716  	// Commonly used SeqBase constants.
   717  	BaseA SeqBase = 1
   718  	BaseC SeqBase = 2
   719  	BaseG SeqBase = 4
   720  	BaseT SeqBase = 8
   721  	BaseS SeqBase = 6
   722  	BaseN SeqBase = 15
   723  
   724  	// NumSeqBaseTypes is number of possible SeqBase values.  SeqBase starts
   725  	// from 0.
   726  	NumSeqBaseTypes = 16
   727  )
   728  
   729  func CharToSeqBase(char byte) SeqBase {
   730  	return SeqBase(n16Table[char])
   731  }
   732  
   733  // Base returns the pos'th base of the sequence.
   734  //
   735  // REQUIRES: 0 <= pos < seq.Length
   736  func (ns Seq) Base(pos int) SeqBase {
   737  	var base SeqBase
   738  	if pos%2 == 0 {
   739  		base = SeqBase(ns.Seq[pos/2] >> 4)
   740  	} else {
   741  		base = SeqBase(ns.Seq[pos/2] & 0xf)
   742  	}
   743  	return base
   744  }
   745  
   746  // BaseChar returns the pos'th base of the as a character, such as 'A', 'T'.
   747  //
   748  // REQUIRES: 0 <= pos < seq.Length
   749  func (ns Seq) BaseChar(pos int) byte {
   750  	return n16TableRev.Get(byte(ns.Base(pos)))
   751  }
   752  
   753  // Char converts a SeqBase to a human-readable character.  For example,
   754  // BaseA.Char() == 'A'.
   755  //
   756  // REQUIRES: 0 <= b < NumSeqBaseTypes
   757  func (b SeqBase) Char() byte {
   758  	return n16TableRev.Get(byte(b))
   759  }