github.com/vertgenlab/gonomics@v1.0.0/cigar/cigar.go (about) 1 // Package cigar contains functions to manipulate cigar data in the SAM file format. 2 // More information on cigars can be found in http://samtools.github.io/hts-specs/SAMv1.pdf 3 package cigar 4 5 import ( 6 "fmt" 7 "github.com/vertgenlab/gonomics/numbers/parse" 8 "log" 9 "unicode" 10 ) 11 12 // Cigar contains information on the runLength, operation, and DNA sequence associated with a particular cigar character. 13 type Cigar struct { 14 RunLength int 15 Op rune 16 } 17 18 // NumInsertions calculates the number of inserted bases relative to a reference genome for an input Cigar slice. 19 func NumInsertions(input []Cigar) int { 20 var count int 21 if input[0].Op == '*' { 22 log.Panic("Cannot calculate NumInsertions from unaligned reads.") 23 } 24 for i := range input { 25 if !ConsumesReference(input[i].Op) && ConsumesQuery(input[i].Op) { 26 count += input[i].RunLength 27 } 28 } 29 return count 30 } 31 32 // NumDeletions calculates the number of deletions relative to a reference genome for an input Cigar slice. 33 func NumDeletions(input []Cigar) int { 34 var count int 35 if input[0].Op == '*' { 36 log.Panic("Cannot calculate NumDeletions from unaligned reads.") 37 } 38 for i := range input { 39 if ConsumesReference(input[i].Op) && !ConsumesQuery(input[i].Op) { 40 count += input[i].RunLength 41 } 42 } 43 return count 44 } 45 46 // ToString converts a slice of Cigar structs to a string for producing readable outputs for files or standard out. 47 func ToString(c []Cigar) string { 48 if len(c) == 0 { 49 return "*" 50 } 51 var output string = "" 52 for _, v := range c { 53 if v.Op == '*' { 54 output = "*" 55 break 56 } 57 output += fmt.Sprintf("%v%c", v.RunLength, v.Op) 58 } 59 return output 60 } 61 62 // FromString parses an input string into a slice of Cigar structs. 
63 func FromString(input string) []Cigar { 64 var output []Cigar 65 var currentNumber string 66 var currentCigar Cigar 67 if input == "*" || input == "**" { 68 currentCigar = Cigar{RunLength: 0, Op: '*'} 69 return append(output, currentCigar) 70 } 71 72 for _, v := range input { 73 if unicode.IsDigit(v) { 74 currentNumber = currentNumber + fmt.Sprintf("%c", v) 75 } else if validOp(v) { 76 currentCigar := Cigar{RunLength: parse.StringToInt(currentNumber), Op: v} 77 output = append(output, currentCigar) 78 currentNumber = "" 79 } else { 80 log.Panicf("Invalid character: %c", v) 81 } 82 } 83 return output 84 } 85 86 // MatchLength returns the number of bases in a Cigar slice that align to the reference. 87 func MatchLength(c []Cigar) int { 88 var ans int 89 if c[0].Op == '*' { 90 log.Panic("Cannot calculate MatchLength from unaligned reads.") 91 } 92 for _, v := range c { 93 if ConsumesReference(v.Op) && ConsumesQuery(v.Op) { 94 ans = ans + v.RunLength 95 } 96 } 97 return ans 98 } 99 100 // ReferenceLength calculates the number of reference positions that a Cigar slice spans. 101 func ReferenceLength(c []Cigar) int { 102 var ans int 103 if c[0].Op == '*' { 104 log.Panic("Cannot calculate NumInsertions from unaligned reads.") 105 } 106 for _, v := range c { 107 if ConsumesReference(v.Op) { 108 ans = ans + v.RunLength 109 } 110 } 111 return ans 112 } 113 114 // QueryLength calculates the length of the query read from a slice of Cigar structs. 115 func QueryLength(c []Cigar) int { 116 var ans int 117 if c[0].Op == '*' { 118 log.Panic("Cannot calculate NumInsertions from unaligned reads.") 119 } 120 for _, v := range c { 121 if ConsumesQuery(v.Op) { 122 ans = ans + v.RunLength 123 } 124 } 125 return ans 126 } 127 128 // validOp returns true if a particular input rune matches any of the acceptable Cigar operation characters. 
// The accepted operations are M, I, D, N, S, H, P, = and X.
func validOp(r rune) bool {
	for _, op := range "MIDNSHP=X" {
		if op == r {
			return true
		}
	}
	return false
}

// ConsumesReference returns true if the rune is a reference-consuming Cigar operation
// (M, D, N, = or X) and false if it is one of I, S, H or P.
// Any other rune, including '*', triggers log.Panicf.
func ConsumesReference(r rune) bool {
	switch r {
	case 'I', 'S', 'H', 'P':
		return false
	case 'M', 'D', 'N', '=', 'X':
		return true
	default:
		log.Panicf("Invalid rune: %c", r)
	}
	// Not reached: log.Panicf panics. The compiler still requires a final return.
	return false
}

// ConsumesQuery returns true if the rune is a query-consuming Cigar operation
// (M, I, S, = or X) and false if it is one of D, N, H or P.
// Any other rune, including '*', triggers log.Panicf.
func ConsumesQuery(r rune) bool {
	if r == 'M' || r == 'I' || r == 'S' || r == '=' || r == 'X' {
		return true
	}
	if r == 'D' || r == 'N' || r == 'H' || r == 'P' {
		return false
	}
	log.Panicf("Invalid rune: %c", r)
	// Not reached: log.Panicf panics. The compiler still requires a final return.
	return false
}