github.com/Schaudge/hts@v0.0.0-20240223063651-737b4d69d68c/sam/cigar.go (about)

     1  // Copyright ©2012 The bíogo Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package sam
     6  
     7  import (
     8  	"bytes"
     9  	"fmt"
    10  )
    11  
    12  // Cigar is a set of CIGAR operations.
    13  type Cigar []CigarOp
    14  
    15  // IsValid returns whether the CIGAR string is valid for a record of the given
    16  // sequence length. Validity is defined by the sum of query consuming operations
    17  // matching the given length, clipping operations only being present at the ends
    18  // of alignments, and that CigarBack operations only result in query-consuming
    19  // positions at or right of the start of the alignment.
    20  func (c Cigar) IsValid(length int) bool {
    21  	var pos int
    22  	for i, co := range c {
    23  		ct := co.Type()
    24  		if ct == CigarHardClipped && i != 0 && i != len(c)-1 {
    25  			return false
    26  		}
    27  		if ct == CigarSoftClipped && i != 0 && i != len(c)-1 {
    28  			if c[i-1].Type() != CigarHardClipped && c[i+1].Type() != CigarHardClipped {
    29  				return false
    30  			}
    31  		}
    32  		con := ct.Consumes()
    33  		if pos < 0 && con.Query != 0 {
    34  			return false
    35  		}
    36  		length -= co.Len() * con.Query
    37  		pos += co.Len() * con.Reference
    38  	}
    39  	return length == 0
    40  }
    41  
    42  // String returns the CIGAR string for c.
    43  func (c Cigar) String() string {
    44  	if len(c) == 0 {
    45  		return "*"
    46  	}
    47  	var b bytes.Buffer
    48  	for _, co := range c {
    49  		fmt.Fprint(&b, co)
    50  	}
    51  	return b.String()
    52  }
    53  
    54  // Lengths returns the number of reference and read bases described by the Cigar.
    55  func (c Cigar) Lengths() (ref, read int) {
    56  	var con Consume
    57  	for _, co := range c {
    58  		con = co.Type().Consumes()
    59  		if co.Type() != CigarBack {
    60  			ref += co.Len() * con.Reference
    61  		}
    62  		read += co.Len() * con.Query
    63  	}
    64  	return ref, read
    65  }
    66  
    67  // CigarOp is a single CIGAR operation including the operation type and the
    68  // length of the operation.
    69  type CigarOp uint32
    70  
    71  // NewCigarOp returns a CIGAR operation of the specified type with length n.
    72  // Due to a limitation of the BAM format, CIGAR operation lengths are limited
    73  // to 2^28-1, and NewCigarOp will panic if n is above this or negative.
    74  func NewCigarOp(t CigarOpType, n int) CigarOp {
    75  	if uint64(n) > 1<<28-1 {
    76  		panic("sam: illegal CIGAR op length")
    77  	}
    78  	return CigarOp(t) | (CigarOp(n) << 4)
    79  }
    80  
    81  // Type returns the type of the CIGAR operation for the CigarOp.
    82  func (co CigarOp) Type() CigarOpType { return CigarOpType(co & 0xf) }
    83  
    84  // Len returns the number of positions affected by the CigarOp CIGAR operation.
    85  func (co CigarOp) Len() int { return int(co >> 4) }
    86  
    87  // String returns the string representation of the CigarOp
    88  func (co CigarOp) String() string { return fmt.Sprintf("%d%s", co.Len(), co.Type().String()) }
    89  
    90  // A CigarOpType represents the type of operation described by a CigarOp.
    91  type CigarOpType byte
    92  
    93  const (
    94  	CigarMatch       CigarOpType = iota // Alignment match (can be a sequence match or mismatch).
    95  	CigarInsertion                      // Insertion to the reference.
    96  	CigarDeletion                       // Deletion from the reference.
    97  	CigarSkipped                        // Skipped region from the reference.
    98  	CigarSoftClipped                    // Soft clipping (clipped sequences present in SEQ).
    99  	CigarHardClipped                    // Hard clipping (clipped sequences NOT present in SEQ).
   100  	CigarPadded                         // Padding (silent deletion from padded reference).
   101  	CigarEqual                          // Sequence match.
   102  	CigarMismatch                       // Sequence mismatch.
   103  	CigarBack                           // Skip backwards.
   104  	lastCigar
   105  )
   106  
   107  var cigarOps = []string{"M", "I", "D", "N", "S", "H", "P", "=", "X", "B", "?"}
   108  
   109  // Consumes returns the CIGAR operation alignment consumption characteristics for the CigarOpType.
   110  //
   111  // The Consume values for each of the CigarOpTypes is as follows:
   112  //
   113  //                    Query  Reference
   114  //  CigarMatch          1        1
   115  //  CigarInsertion      1        0
   116  //  CigarDeletion       0        1
   117  //  CigarSkipped        0        1
   118  //  CigarSoftClipped    1        0
   119  //  CigarHardClipped    0        0
   120  //  CigarPadded         0        0
   121  //  CigarEqual          1        1
   122  //  CigarMismatch       1        1
   123  //  CigarBack           0       -1
   124  //
   125  func (ct CigarOpType) Consumes() Consume { return consume[ct] }
   126  
   127  // String returns the string representation of a CigarOpType.
   128  func (ct CigarOpType) String() string {
   129  	if ct < 0 || ct > lastCigar {
   130  		ct = lastCigar
   131  	}
   132  	return cigarOps[ct]
   133  }
   134  
   135  // Consume describes how CIGAR operations consume alignment bases.
   136  type Consume struct {
   137  	Query, Reference int
   138  }
   139  
   140  // A few years ago, Complete Genomics (CG) proposed to add a new CIGAR
   141  // operator 'B' for an operation of moving backward along the reference
   142  // genome. It is the opposite of the 'N', the reference skip. In a later
   143  // discussion on a separate issue, Fred expressed his preference to a
   144  // negative reference skip which is equivalent to a positive 'B' operation.
   145  // Now the SRA group from NCBI intends to archive the CG alignment in
   146  // the SAM format and raises this request again. I think it may be the
   147  // time to add this backward operation.
   148  //
   149  // The backward operation is designed to describe such an alignment:
   150  //
   151  // REF:: GCATACGATCGACTAGTCACGT
   152  // READ: --ATACGATCGA----------
   153  // READ: ---------CGACTAGTCAC--
   154  //
   155  // i.e. there is an overlap between two segments of a read, which is quite
   156  // frequent in CG data. We are unable to fully describe such an alignment
   157  // with the original CIGAR. In the current spec, we suggest using a CIGAR
   158  // 18M and storing the overlap information in optional tags. This is a
   159  // little clumsy and is not compressed well for the purpose of archiving.
   160  // With 'B', the new CIGAR is "10M3B11M" with no other optional tags.
   161  //
   162  // Using "B" in this case is cleaner, but the major concern is that it breaks
   163  // the compatibility and is also likely to complicate SNP calling and many
   164  // other applications. As I think now, the solution is to implement a
   165  // "remove_B()" routine in samtools. This routine collapses overlapping
   166  // sequences, recalculates base quality in the overlap and gives a CIGAR
   167  // without 'B'. For the example above, remove_B() gives CIGAR 18M. For SNP
   168  // calling, we may call remove_B() immediately after the alignment loaded
   169  // into memory. The downstream pileup engine does not need any changes. Other
   170  // SNP callers can do the same. A new option will be added to "samtools view"
   171  // as a way to remove 'B' operations on the command-line.
   172  //
   173  // The implementation of remove_B() may be quite complicated in the generic
   174  // case - we may be dealing with a multiple-sequence alignment, but it should
   175  // be straightforward in the simple cases such as the example above. Users may
   176  // not need to care too much about how remove_B() is implemented.
   177  //
   178  // http://sourceforge.net/p/samtools/mailman/message/28463294/
   179  var consume = []Consume{
   180  	CigarMatch:       {Query: 1, Reference: 1},
   181  	CigarInsertion:   {Query: 1, Reference: 0},
   182  	CigarDeletion:    {Query: 0, Reference: 1},
   183  	CigarSkipped:     {Query: 0, Reference: 1},
   184  	CigarSoftClipped: {Query: 1, Reference: 0},
   185  	CigarHardClipped: {Query: 0, Reference: 0},
   186  	CigarPadded:      {Query: 0, Reference: 0},
   187  	CigarEqual:       {Query: 1, Reference: 1},
   188  	CigarMismatch:    {Query: 1, Reference: 1},
   189  	CigarBack:        {Query: 0, Reference: -1}, // See notes above.
   190  	lastCigar:        {},
   191  }
   192  
   193  var cigarOpTypeLookup [256]CigarOpType
   194  
   195  func init() {
   196  	for i := range cigarOpTypeLookup {
   197  		cigarOpTypeLookup[i] = lastCigar
   198  	}
   199  	for op, c := range []byte{'M', 'I', 'D', 'N', 'S', 'H', 'P', '=', 'X', 'B'} {
   200  		cigarOpTypeLookup[c] = CigarOpType(op)
   201  	}
   202  }
   203  
   204  var powers = []int64{1e0, 1e1, 1e2, 1e3, 1e4, 1e5, 1e6, 1e7, 1e8, 1e9, 1e10, 1e11, 1e12}
   205  
   206  // atoi returns the integer interpretation of b which must be an ASCII decimal number representation.
   207  func atoi(b []byte) (int, error) {
   208  	if len(b) > len(powers) {
   209  		return 0, fmt.Errorf("sam: integer overflow: %q", b)
   210  	}
   211  	var n int64
   212  	k := len(b) - 1
   213  	for i, v := range b {
   214  		n += int64(v-'0') * powers[k-i]
   215  		if int64(int(n)) != n {
   216  			return 0, fmt.Errorf("sam: integer overflow: %q at %d", b, i)
   217  		}
   218  	}
   219  	return int(n), nil
   220  }
   221  
   222  // ParseCigar returns a Cigar parsed from the provided byte slice.
   223  // ParseCigar will break CIGAR operations longer than 2^28-1 into
   224  // multiple operations summing to the same length.
   225  func ParseCigar(b []byte) (Cigar, error) {
   226  	if len(b) == 1 && b[0] == '*' {
   227  		return nil, nil
   228  	}
   229  	var (
   230  		c   Cigar
   231  		op  CigarOpType
   232  		n   int
   233  		err error
   234  	)
   235  	for i := 0; i < len(b); i++ {
   236  		for j := i; j < len(b); j++ {
   237  			if b[j] < '0' || '9' < b[j] {
   238  				n, err = atoi(b[i:j])
   239  				if err != nil {
   240  					return nil, err
   241  				}
   242  				op = cigarOpTypeLookup[b[j]]
   243  				i = j
   244  				break
   245  			}
   246  		}
   247  		if op == lastCigar {
   248  			return nil, fmt.Errorf("sam: failed to parse cigar string %q: unknown operation %q", b, op)
   249  		}
   250  
   251  		for {
   252  			c = append(c, NewCigarOp(op, minInt(n, 1<<28-1)))
   253  			n -= 1<<28 - 1
   254  			if n <= 0 {
   255  				break
   256  			}
   257  		}
   258  	}
   259  	return c, nil
   260  }
   261  
   262  func minInt(a, b int) int {
   263  	if a < b {
   264  		return a
   265  	}
   266  	return b
   267  }