github.com/Schaudge/hts@v0.0.0-20240223063651-737b4d69d68c/sam/cigar.go (about) 1 // Copyright ©2012 The bíogo Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 package sam 6 7 import ( 8 "bytes" 9 "fmt" 10 ) 11 12 // Cigar is a set of CIGAR operations. 13 type Cigar []CigarOp 14 15 // IsValid returns whether the CIGAR string is valid for a record of the given 16 // sequence length. Validity is defined by the sum of query consuming operations 17 // matching the given length, clipping operations only being present at the ends 18 // of alignments, and that CigarBack operations only result in query-consuming 19 // positions at or right of the start of the alignment. 20 func (c Cigar) IsValid(length int) bool { 21 var pos int 22 for i, co := range c { 23 ct := co.Type() 24 if ct == CigarHardClipped && i != 0 && i != len(c)-1 { 25 return false 26 } 27 if ct == CigarSoftClipped && i != 0 && i != len(c)-1 { 28 if c[i-1].Type() != CigarHardClipped && c[i+1].Type() != CigarHardClipped { 29 return false 30 } 31 } 32 con := ct.Consumes() 33 if pos < 0 && con.Query != 0 { 34 return false 35 } 36 length -= co.Len() * con.Query 37 pos += co.Len() * con.Reference 38 } 39 return length == 0 40 } 41 42 // String returns the CIGAR string for c. 43 func (c Cigar) String() string { 44 if len(c) == 0 { 45 return "*" 46 } 47 var b bytes.Buffer 48 for _, co := range c { 49 fmt.Fprint(&b, co) 50 } 51 return b.String() 52 } 53 54 // Lengths returns the number of reference and read bases described by the Cigar. 55 func (c Cigar) Lengths() (ref, read int) { 56 var con Consume 57 for _, co := range c { 58 con = co.Type().Consumes() 59 if co.Type() != CigarBack { 60 ref += co.Len() * con.Reference 61 } 62 read += co.Len() * con.Query 63 } 64 return ref, read 65 } 66 67 // CigarOp is a single CIGAR operation including the operation type and the 68 // length of the operation. 69 type CigarOp uint32 70 71 // NewCigarOp returns a CIGAR operation of the specified type with length n. 72 // Due to a limitation of the BAM format, CIGAR operation lengths are limited 73 // to 2^28-1, and NewCigarOp will panic if n is above this or negative. 74 func NewCigarOp(t CigarOpType, n int) CigarOp { 75 if uint64(n) > 1<<28-1 { 76 panic("sam: illegal CIGAR op length") 77 } 78 return CigarOp(t) | (CigarOp(n) << 4) 79 } 80 81 // Type returns the type of the CIGAR operation for the CigarOp. 82 func (co CigarOp) Type() CigarOpType { return CigarOpType(co & 0xf) } 83 84 // Len returns the number of positions affected by the CigarOp CIGAR operation. 85 func (co CigarOp) Len() int { return int(co >> 4) } 86 87 // String returns the string representation of the CigarOp 88 func (co CigarOp) String() string { return fmt.Sprintf("%d%s", co.Len(), co.Type().String()) } 89 90 // A CigarOpType represents the type of operation described by a CigarOp. 91 type CigarOpType byte 92 93 const ( 94 CigarMatch CigarOpType = iota // Alignment match (can be a sequence match or mismatch). 95 CigarInsertion // Insertion to the reference. 96 CigarDeletion // Deletion from the reference. 97 CigarSkipped // Skipped region from the reference. 98 CigarSoftClipped // Soft clipping (clipped sequences present in SEQ). 99 CigarHardClipped // Hard clipping (clipped sequences NOT present in SEQ). 100 CigarPadded // Padding (silent deletion from padded reference). 101 CigarEqual // Sequence match. 102 CigarMismatch // Sequence mismatch. 103 CigarBack // Skip backwards. 104 lastCigar 105 ) 106 107 var cigarOps = []string{"M", "I", "D", "N", "S", "H", "P", "=", "X", "B", "?"} 108 109 // Consumes returns the CIGAR operation alignment consumption characteristics for the CigarOpType. 110 // 111 // The Consume values for each of the CigarOpTypes is as follows: 112 // 113 // Query Reference 114 // CigarMatch 1 1 115 // CigarInsertion 1 0 116 // CigarDeletion 0 1 117 // CigarSkipped 0 1 118 // CigarSoftClipped 1 0 119 // CigarHardClipped 0 0 120 // CigarPadded 0 0 121 // CigarEqual 1 1 122 // CigarMismatch 1 1 123 // CigarBack 0 -1 124 // 125 func (ct CigarOpType) Consumes() Consume { return consume[ct] } 126 127 // String returns the string representation of a CigarOpType. 128 func (ct CigarOpType) String() string { 129 if ct < 0 || ct > lastCigar { 130 ct = lastCigar 131 } 132 return cigarOps[ct] 133 } 134 135 // Consume describes how CIGAR operations consume alignment bases. 136 type Consume struct { 137 Query, Reference int 138 } 139 140 // A few years ago, Complete Genomics (CG) proposed to add a new CIGAR 141 // operator 'B' for an operation of moving backward along the reference 142 // genome. It is the opposite of the 'N', the reference skip. In a later 143 // discussion on a separate issue, Fred expressed his preference to a 144 // negative reference skip which is equivalent to a positive 'B' operation. 145 // Now the SRA group from NCBI intends to archive the CG alignment in 146 // the SAM format and raises this request again. I think it may be the 147 // time to add this backward operation. 148 // 149 // The backward operation is designed to describe such an alignment: 150 // 151 // REF:: GCATACGATCGACTAGTCACGT 152 // READ: --ATACGATCGA---------- 153 // READ: ---------CGACTAGTCAC-- 154 // 155 // i.e. there is an overlap between two segments of a read, which is quite 156 // frequent in CG data. We are unable to fully describe such an alignment 157 // with the original CIGAR. In the current spec, we suggest using a CIGAR 158 // 18M and storing the overlap information in optional tags. This is a 159 // little clumsy and is not compressed well for the purpose of archiving. 160 // With 'B', the new CIGAR is "10M3B11M" with no other optional tags. 161 // 162 // Using "B" in this case is cleaner, but the major concern is that it breaks 163 // the compatibility and is also likely to complicate SNP calling and many 164 // other applications. As I think now, the solution is to implement a 165 // "remove_B()" routine in samtools. This routine collapses overlapping 166 // sequences, recalculates base quality in the overlap and gives a CIGAR 167 // without 'B'. For the example above, remove_B() gives CIGAR 18M. For SNP 168 // calling, we may call remove_B() immediately after the alignment loaded 169 // into memory. The downstream pileup engine does not need any changes. Other 170 // SNP callers can do the same. A new option will be added to "samtools view" 171 // as a way to remove 'B' operations on the command-line. 172 // 173 // The implementation of remove_B() may be quite complicated in the generic 174 // case - we may be dealing with a multiple-sequence alignment, but it should 175 // be straightforward in the simple cases such as the example above. Users may 176 // not need to care too much about how remove_B() is implemented. 177 // 178 // http://sourceforge.net/p/samtools/mailman/message/28463294/ 179 var consume = []Consume{ 180 CigarMatch: {Query: 1, Reference: 1}, 181 CigarInsertion: {Query: 1, Reference: 0}, 182 CigarDeletion: {Query: 0, Reference: 1}, 183 CigarSkipped: {Query: 0, Reference: 1}, 184 CigarSoftClipped: {Query: 1, Reference: 0}, 185 CigarHardClipped: {Query: 0, Reference: 0}, 186 CigarPadded: {Query: 0, Reference: 0}, 187 CigarEqual: {Query: 1, Reference: 1}, 188 CigarMismatch: {Query: 1, Reference: 1}, 189 CigarBack: {Query: 0, Reference: -1}, // See notes above. 190 lastCigar: {}, 191 } 192 193 var cigarOpTypeLookup [256]CigarOpType 194 195 func init() { 196 for i := range cigarOpTypeLookup { 197 cigarOpTypeLookup[i] = lastCigar 198 } 199 for op, c := range []byte{'M', 'I', 'D', 'N', 'S', 'H', 'P', '=', 'X', 'B'} { 200 cigarOpTypeLookup[c] = CigarOpType(op) 201 } 202 } 203 204 var powers = []int64{1e0, 1e1, 1e2, 1e3, 1e4, 1e5, 1e6, 1e7, 1e8, 1e9, 1e10, 1e11, 1e12} 205 206 // atoi returns the integer interpretation of b which must be an ASCII decimal number representation. 207 func atoi(b []byte) (int, error) { 208 if len(b) > len(powers) { 209 return 0, fmt.Errorf("sam: integer overflow: %q", b) 210 } 211 var n int64 212 k := len(b) - 1 213 for i, v := range b { 214 n += int64(v-'0') * powers[k-i] 215 if int64(int(n)) != n { 216 return 0, fmt.Errorf("sam: integer overflow: %q at %d", b, i) 217 } 218 } 219 return int(n), nil 220 } 221 222 // ParseCigar returns a Cigar parsed from the provided byte slice. 223 // ParseCigar will break CIGAR operations longer than 2^28-1 into 224 // multiple operations summing to the same length. 225 func ParseCigar(b []byte) (Cigar, error) { 226 if len(b) == 1 && b[0] == '*' { 227 return nil, nil 228 } 229 var ( 230 c Cigar 231 op CigarOpType 232 n int 233 err error 234 ) 235 for i := 0; i < len(b); i++ { 236 for j := i; j < len(b); j++ { 237 if b[j] < '0' || '9' < b[j] { 238 n, err = atoi(b[i:j]) 239 if err != nil { 240 return nil, err 241 } 242 op = cigarOpTypeLookup[b[j]] 243 i = j 244 break 245 } 246 } 247 if op == lastCigar { 248 return nil, fmt.Errorf("sam: failed to parse cigar string %q: unknown operation %q", b, op) 249 } 250 251 for { 252 c = append(c, NewCigarOp(op, minInt(n, 1<<28-1))) 253 n -= 1<<28 - 1 254 if n <= 0 { 255 break 256 } 257 } 258 } 259 return c, nil 260 } 261 262 func minInt(a, b int) int { 263 if a < b { 264 return a 265 } 266 return b 267 }