github.com/vertgenlab/gonomics@v1.0.0/cigar/cigar.go (about)

     1  // Package cigar contains functions to manipulate cigar data in the SAM file format.
     2  // More information on cigars can be found in http://samtools.github.io/hts-specs/SAMv1.pdf
     3  package cigar
     4  
     5  import (
     6  	"fmt"
     7  	"github.com/vertgenlab/gonomics/numbers/parse"
     8  	"log"
     9  	"unicode"
    10  )
    11  
// Cigar is a single cigar element: one operation character and the run length it applies to.
type Cigar struct {
	// RunLength is the count attached to Op (the number written before the operation character).
	RunLength int
	// Op is the operation character. FromString produces Op '*' with RunLength 0 for the
	// inputs "*" and "**", and the length-calculating functions in this package refuse such input.
	Op rune
}
    17  
    18  // NumInsertions calculates the number of inserted bases relative to a reference genome for an input Cigar slice.
    19  func NumInsertions(input []Cigar) int {
    20  	var count int
    21  	if input[0].Op == '*' {
    22  		log.Panic("Cannot calculate NumInsertions from unaligned reads.")
    23  	}
    24  	for i := range input {
    25  		if !ConsumesReference(input[i].Op) && ConsumesQuery(input[i].Op) {
    26  			count += input[i].RunLength
    27  		}
    28  	}
    29  	return count
    30  }
    31  
    32  // NumDeletions calculates the number of deletions relative to a reference genome for an input Cigar slice.
    33  func NumDeletions(input []Cigar) int {
    34  	var count int
    35  	if input[0].Op == '*' {
    36  		log.Panic("Cannot calculate NumDeletions from unaligned reads.")
    37  	}
    38  	for i := range input {
    39  		if ConsumesReference(input[i].Op) && !ConsumesQuery(input[i].Op) {
    40  			count += input[i].RunLength
    41  		}
    42  	}
    43  	return count
    44  }
    45  
    46  // ToString converts a slice of Cigar structs to a string for producing readable outputs for files or standard out.
    47  func ToString(c []Cigar) string {
    48  	if len(c) == 0 {
    49  		return "*"
    50  	}
    51  	var output string = ""
    52  	for _, v := range c {
    53  		if v.Op == '*' {
    54  			output = "*"
    55  			break
    56  		}
    57  		output += fmt.Sprintf("%v%c", v.RunLength, v.Op)
    58  	}
    59  	return output
    60  }
    61  
    62  // FromString parses an input string into a slice of Cigar structs.
    63  func FromString(input string) []Cigar {
    64  	var output []Cigar
    65  	var currentNumber string
    66  	var currentCigar Cigar
    67  	if input == "*" || input == "**" {
    68  		currentCigar = Cigar{RunLength: 0, Op: '*'}
    69  		return append(output, currentCigar)
    70  	}
    71  
    72  	for _, v := range input {
    73  		if unicode.IsDigit(v) {
    74  			currentNumber = currentNumber + fmt.Sprintf("%c", v)
    75  		} else if validOp(v) {
    76  			currentCigar := Cigar{RunLength: parse.StringToInt(currentNumber), Op: v}
    77  			output = append(output, currentCigar)
    78  			currentNumber = ""
    79  		} else {
    80  			log.Panicf("Invalid character: %c", v)
    81  		}
    82  	}
    83  	return output
    84  }
    85  
    86  // MatchLength returns the number of bases in a Cigar slice that align to the reference.
    87  func MatchLength(c []Cigar) int {
    88  	var ans int
    89  	if c[0].Op == '*' {
    90  		log.Panic("Cannot calculate MatchLength from unaligned reads.")
    91  	}
    92  	for _, v := range c {
    93  		if ConsumesReference(v.Op) && ConsumesQuery(v.Op) {
    94  			ans = ans + v.RunLength
    95  		}
    96  	}
    97  	return ans
    98  }
    99  
   100  // ReferenceLength calculates the number of reference positions that a Cigar slice spans.
   101  func ReferenceLength(c []Cigar) int {
   102  	var ans int
   103  	if c[0].Op == '*' {
   104  		log.Panic("Cannot calculate NumInsertions from unaligned reads.")
   105  	}
   106  	for _, v := range c {
   107  		if ConsumesReference(v.Op) {
   108  			ans = ans + v.RunLength
   109  		}
   110  	}
   111  	return ans
   112  }
   113  
   114  // QueryLength calculates the length of the query read from a slice of Cigar structs.
   115  func QueryLength(c []Cigar) int {
   116  	var ans int
   117  	if c[0].Op == '*' {
   118  		log.Panic("Cannot calculate NumInsertions from unaligned reads.")
   119  	}
   120  	for _, v := range c {
   121  		if ConsumesQuery(v.Op) {
   122  			ans = ans + v.RunLength
   123  		}
   124  	}
   125  	return ans
   126  }
   127  
   128  // validOp returns true if a particular input rune matches any of the acceptable Cigar operation characters.
   129  func validOp(r rune) bool {
   130  	switch r {
   131  	case 'M', 'I', 'D', 'N', 'S', 'H', 'P', '=', 'X':
   132  		return true
   133  	default:
   134  		return false
   135  	}
   136  }
   137  
   138  // ConsumesReference returns true of the rune matches an operation character that is reference consuming for Cigars.
   139  func ConsumesReference(r rune) bool {
   140  	switch r {
   141  	case 'M', 'D', 'N', '=', 'X':
   142  		return true
   143  	case 'I', 'S', 'H', 'P':
   144  		return false
   145  	}
   146  	log.Panicf("Invalid rune: %c", r)
   147  	return false
   148  }
   149  
   150  // ConsumesQuery returns true for input runes that match query consuming characters for Cigars.
   151  func ConsumesQuery(r rune) bool {
   152  	switch r {
   153  	case 'M', 'I', 'S', '=', 'X':
   154  		return true
   155  	case 'D', 'N', 'H', 'P':
   156  		return false
   157  	}
   158  	log.Panicf("Invalid rune: %c", r)
   159  	return false
   160  }