github.com/vertgenlab/gonomics@v1.0.0/cigar/byteCigar.go (about)

     1  package cigar
     2  
     3  import (
     4  	"github.com/vertgenlab/gonomics/exception"
     5  	"github.com/vertgenlab/gonomics/numbers/parse"
     6  	"log"
     7  	"strconv"
     8  	"strings"
     9  )
    10  
    11  // ByteCigar struct encodes sequence comparison operations and includes run length info.
    12  type ByteCigar struct {
    13  	RunLen uint16
    14  	Op     byte
    15  }
    16  
    17  // Defined const for byte cigar.
    18  const (
    19  	Match     byte = 'M'
    20  	Insertion byte = 'I'
    21  	Deletion  byte = 'D'
    22  	N         byte = 'N'
    23  	SoftClip  byte = 'S'
    24  	HardClip  byte = 'H'
    25  	Padded    byte = 'P'
    26  	Equal     byte = '='
    27  	Mismatch  byte = 'X'
    28  	Unknown   byte = '*'
    29  )
    30  
    31  // LookUpCigByte is a helper function decode uint32 into a byte cigar struct.
    32  // In the sam/bam specs: CIGAR: op len<<4|op. Hash is as follows: ‘MIDNSHP=X’→‘012345678’.
    33  func lookUpCigByte(op uint32) byte {
    34  	switch op {
    35  	case 0:
    36  		return Match
    37  	case 1:
    38  		return Insertion
    39  	case 2:
    40  		return Deletion
    41  	case 3:
    42  		return N
    43  	case 4:
    44  		return SoftClip
    45  	case 5:
    46  		return HardClip
    47  	case 6:
    48  		return Padded
    49  	case 7:
    50  		return Equal
    51  	case 8:
    52  		return Mismatch
    53  	default:
    54  		log.Fatalf("Error: cound not identify input byte")
    55  		return 0
    56  	}
    57  }
    58  
    59  // lookUpUint32 will return a uint32 representation of a cigar Op.
    60  func lookUpUint32(op byte) uint32 {
    61  	switch op {
    62  	case Match:
    63  		return 0
    64  	case Insertion:
    65  		return 1
    66  	case Deletion:
    67  		return 2
    68  	case N:
    69  		return 3
    70  	case SoftClip:
    71  		return 4
    72  	case HardClip:
    73  		return 5
    74  	case Padded:
    75  		return 6
    76  	case Equal:
    77  		return 7
    78  	case Mismatch:
    79  		return 8
    80  	default:
    81  		log.Fatalf("Error: cound not identify input byte")
    82  		return 0
    83  	}
    84  }
    85  
    86  // ReadToBytesCigar will process a byte slice and define a small.
    87  func ReadToBytesCigar(cigar []byte) []ByteCigar {
    88  	if cigar[0] == '*' {
    89  		return nil
    90  	}
    91  	var ans []ByteCigar = make([]ByteCigar, 0, 1)
    92  	var lastNum int = 0
    93  	for i := 0; i < len(cigar); i++ {
    94  		if IsValidCigar(cigar[i]) {
    95  			ans = append(ans, ByteCigar{RunLen: parse.StringToUint16(string(cigar[lastNum:i])), Op: cigar[i]})
    96  			lastNum = i + 1
    97  		}
    98  	}
    99  	return ans
   100  }
   101  
   102  // IsValidCigar will perform a check to make sure op is a valid byte.
   103  func IsValidCigar(op byte) bool {
   104  	switch op {
   105  	case 'M':
   106  		return true
   107  	case 'I':
   108  		return true
   109  	case 'D':
   110  		return true
   111  	case 'N':
   112  		return true
   113  	case 'S':
   114  		return true
   115  	case 'H':
   116  		return true
   117  	case 'P':
   118  		return true
   119  	case '=':
   120  		return true
   121  	case 'X':
   122  		return true
   123  	case '*':
   124  		return true
   125  	default:
   126  		return false
   127  	}
   128  	return false
   129  }
   130  
   131  // ByteCigarToString will process the cigar byte struct and parse and/or convert the data into a string.
   132  func ByteCigarToString(cigar []ByteCigar) string {
   133  	if len(cigar) == 0 || cigar == nil {
   134  		return "*"
   135  	}
   136  	var str strings.Builder
   137  	var err error
   138  	for _, c := range cigar {
   139  		_, err = str.WriteString(strconv.Itoa(int(c.RunLen)))
   140  		exception.FatalOnErr(err)
   141  		err = str.WriteByte(c.Op)
   142  		exception.FatalOnErr(err)
   143  	}
   144  	return str.String()
   145  }
   146  
   147  // ByteMatrixTrace will trace smith-waterman matrix alignment and return one of 3 cigar Op's.
   148  // M: matches or mismatches, I: insertions, D: for deletions.
   149  func ByteMatrixTrace(a int64, b int64, c int64) (int64, byte) {
   150  	if a >= b && a >= c {
   151  		return a, 'M'
   152  	} else if b >= c {
   153  		return b, 'I'
   154  	} else {
   155  		return c, 'D'
   156  	}
   157  }
   158  
   159  // ReverseBytesCigar cigar will reverse the order of a cigar slice. Typically performed after matrix traceback
   160  // from a local alignment.
   161  func ReverseBytesCigar(alpha []ByteCigar) {
   162  	var i, off int
   163  	for i = len(alpha)/2 - 1; i >= 0; i-- {
   164  		off = len(alpha) - 1 - i
   165  		alpha[i], alpha[off] = alpha[off], alpha[i]
   166  	}
   167  }
   168  
   169  // QueryLength calculates the length of the query read from a slice of Cigar structs.
   170  func QueryRunLen(c []ByteCigar) int {
   171  	if c == nil {
   172  		return 0
   173  	}
   174  	var ans uint16
   175  	for _, v := range c {
   176  		if v.Op == Match || v.Op == Insertion || v.Op == SoftClip || v.Op == Equal || v.Op == Mismatch {
   177  			ans = ans + v.RunLen
   178  		}
   179  	}
   180  	return int(ans)
   181  }
   182  
   183  // CatByteCigar will concatenate two cigar slices into one merged.
   184  func CatByteCigar(cigs []ByteCigar, newCigs []ByteCigar) []ByteCigar {
   185  	if len(newCigs) == 0 || newCigs == nil {
   186  		return cigs
   187  	} else if len(cigs) == 0 {
   188  		return newCigs
   189  	} else {
   190  		cigs = AddCigarByte(cigs, newCigs[0])
   191  		cigs = append(cigs, newCigs[1:]...)
   192  		return cigs
   193  	}
   194  }
   195  
   196  // AddCigarByte will add append a cigar byte to an existing slice. The function
   197  // will perform a check on the tail of the slice and incurment the run length if the
   198  // cigar Op values are the same.
   199  func AddCigarByte(cigs []ByteCigar, newCig ByteCigar) []ByteCigar {
   200  	if len(cigs) == 0 {
   201  		cigs = append(cigs, newCig)
   202  	} else if cigs[len(cigs)-1].Op == newCig.Op {
   203  		cigs[len(cigs)-1].RunLen += newCig.RunLen
   204  	} else {
   205  		cigs = append(cigs, newCig)
   206  	}
   207  	return cigs
   208  }
   209  
   210  // MatrixSetup will allocate memory for smith-waterman matrix to be used with byte cigar opertations and trace back.
   211  func MatrixSetup(size int) ([][]int64, [][]byte) {
   212  	m := make([][]int64, size)
   213  	trace := make([][]byte, size)
   214  	for idx := range m {
   215  		m[idx] = make([]int64, size)
   216  		trace[idx] = make([]byte, size)
   217  	}
   218  	return m, trace
   219  }
   220  
   221  // Uint32ToByteCigar will process a uint32 slice and decode each number into a byte cigar struct.
   222  // CIGAR operation lengths are limited to 2^28-1 in the current sam/bam formats.
   223  func Uint32ToByteCigar(cigar []uint32) []ByteCigar {
   224  	var answer []ByteCigar = make([]ByteCigar, len(cigar))
   225  	for i := 0; i < len(cigar); i++ {
   226  		answer[i] = ByteCigar{RunLen: uint16(cigar[i] >> 4), Op: lookUpCigByte(cigar[i] & 0xf)}
   227  	}
   228  	return answer
   229  }
   230  
   231  // ByteCigarToUint32 will convert a slice of []ByteCigar to a slice of []uint32.
   232  func ByteCigarToUint32(cigar []ByteCigar) []uint32 {
   233  	var answer []uint32 = make([]uint32, len(cigar))
   234  	for i := 0; i < len(cigar); i++ {
   235  		answer[i] = lookUpUint32(cigar[i].Op) | uint32(cigar[i].RunLen)<<4
   236  	}
   237  	return answer
   238  }