github.com/vertgenlab/gonomics@v1.0.0/genomeGraph/toGiraf.go (about)

     1  package genomeGraph
     2  
     3  import (
     4  	"fmt"
     5  	"github.com/vertgenlab/gonomics/cigar"
     6  	"github.com/vertgenlab/gonomics/fastq"
     7  	"github.com/vertgenlab/gonomics/fileio"
     8  	"github.com/vertgenlab/gonomics/giraf"
     9  	"github.com/vertgenlab/gonomics/numbers/parse"
    10  	"github.com/vertgenlab/gonomics/sam"
    11  	"math"
    12  	"strings"
    13  	"sync"
    14  )
    15  
    16  func GraphSmithWatermanToGiraf(gg *GenomeGraph, read fastq.FastqBig, seedHash map[uint64][]uint64, seedLen int, stepSize int, matrix *MatrixAln, scoreMatrix [][]int64, seedPool *sync.Pool, dnaPool *sync.Pool, sk scoreKeeper, dynamicScore dynamicScoreKeeper, seedBuildHelper *seedHelper) *giraf.Giraf {
    17  	var currBest giraf.Giraf = giraf.Giraf{
    18  		QName:     read.Name,
    19  		QStart:    0,
    20  		QEnd:      0,
    21  		PosStrand: true,
    22  		Path:      giraf.Path{},
    23  		Cigar:     nil,
    24  		AlnScore:  0,
    25  		MapQ:      255,
    26  		Seq:       read.Seq,
    27  		Qual:      read.Qual,
    28  		Notes:     []giraf.Note{{Tag: []byte{'X', 'O'}, Type: 'Z', Value: "~"}},
    29  	}
    30  	resetScoreKeeper(sk)
    31  	sk.perfectScore = perfectMatchBig(read, scoreMatrix)
    32  	sk.extension = int(sk.perfectScore/600) + len(read.Seq)
    33  	seeds := seedPool.Get().(*memoryPool)
    34  	seeds.Hits = seeds.Hits[:0]
    35  	seeds.Worker = seeds.Worker[:0]
    36  	seeds.Hits = seedMapMemPool(seedHash, gg.Nodes, &read, seedLen, sk.perfectScore, scoreMatrix, seeds.Hits, seeds.Worker, seedBuildHelper)
    37  	for i := 0; i < len(seeds.Hits) && seedCouldBeBetter(int64(seeds.Hits[i].TotalLength), int64(currBest.AlnScore), sk.perfectScore, int64(len(read.Seq)), 100, 90, -196, -296); i++ {
    38  		sk.currSeed = seeds.Hits[i]
    39  		sk.tailSeed = *getLastPart(&sk.currSeed)
    40  		if sk.currSeed.PosStrand {
    41  			sk.currSeq = read.Seq
    42  		} else {
    43  			sk.currSeq = read.SeqRc
    44  		}
    45  		sk.seedScore = scoreSeedSeq(sk.currSeq, sk.currSeed.QueryStart, sk.tailSeed.QueryStart+sk.tailSeed.Length, scoreMatrix)
    46  		if int(sk.currSeed.TotalLength) == len(sk.currSeq) {
    47  			sk.targetStart = int(sk.currSeed.TargetStart)
    48  			sk.targetEnd = int(sk.tailSeed.TargetStart + sk.tailSeed.Length)
    49  			sk.queryStart = int(sk.currSeed.QueryStart)
    50  			sk.currScore = sk.seedScore
    51  		} else {
    52  			sk.leftAlignment, sk.leftScore, sk.targetStart, sk.queryStart, sk.leftPath = LeftAlignTraversal(&gg.Nodes[sk.currSeed.TargetId], sk.leftSeq, int(sk.currSeed.TargetStart), sk.leftPath, sk.extension-int(sk.currSeed.TotalLength), sk.currSeq[:sk.currSeed.QueryStart], scoreMatrix, matrix, sk, dynamicScore, dnaPool)
    53  			sk.rightAlignment, sk.rightScore, sk.targetEnd, sk.queryEnd, sk.rightPath = RightAlignTraversal(&gg.Nodes[sk.tailSeed.TargetId], sk.rightSeq, int(sk.tailSeed.TargetStart+sk.tailSeed.Length), sk.rightPath, sk.extension-int(sk.currSeed.TotalLength), sk.currSeq[sk.tailSeed.QueryStart+sk.tailSeed.Length:], scoreMatrix, matrix, sk, dynamicScore, dnaPool)
    54  			sk.currScore = sk.leftScore + sk.seedScore + sk.rightScore
    55  		}
    56  		if sk.currScore > int64(currBest.AlnScore) {
    57  			currBest.QStart = sk.queryStart
    58  			currBest.QEnd = int(sk.currSeed.QueryStart) + sk.queryStart + sk.queryEnd + int(sk.currSeed.TotalLength) - 1
    59  			currBest.PosStrand = sk.currSeed.PosStrand
    60  			currBest.Path = setPath(currBest.Path, sk.targetStart, CatPaths(CatPaths(sk.leftPath, getSeedPath(&sk.currSeed)), sk.rightPath), sk.targetEnd)
    61  			currBest.Cigar = SoftClipBases(sk.queryStart, len(sk.currSeq), cigar.CatByteCigar(cigar.AddCigarByte(sk.leftAlignment, cigar.ByteCigar{RunLen: uint16(sk.currSeed.TotalLength), Op: 'M'}), sk.rightAlignment))
    62  			currBest.AlnScore = int(sk.currScore)
    63  			currBest.Seq = sk.currSeq
    64  		}
    65  	}
    66  	seedPool.Put(seeds)
    67  	if !currBest.PosStrand {
    68  		fastq.ReverseQualUint8Record(currBest.Qual)
    69  	}
    70  	return &currBest
    71  }
    72  
    73  func readFastqGsw(fileOne string, fileTwo string, answer chan<- fastq.PairedEndBig) {
    74  	readOne, readTwo := fileio.NewByteReader(fileOne), fileio.NewByteReader(fileTwo)
    75  	for fq, done := fastq.ReadFqBigPair(readOne, readTwo); !done; fq, done = fastq.ReadFqBigPair(readOne, readTwo) {
    76  		answer <- fq
    77  	}
    78  	close(answer)
    79  }
    80  
    81  type ScoreMatrixHelper struct {
    82  	Matrix                         [][]int64
    83  	MaxMatch                       int64
    84  	MinMatch                       int64
    85  	LeastSevereMismatch            int64
    86  	LeastSevereMatchMismatchChange int64
    87  }
    88  
    89  func getScoreMatrixHelp(scoreMatrix [][]int64) *ScoreMatrixHelper {
    90  	help := ScoreMatrixHelper{Matrix: scoreMatrix}
    91  	help.MaxMatch, help.MinMatch, help.LeastSevereMismatch, help.LeastSevereMatchMismatchChange = MismatchStats(scoreMatrix)
    92  	return &help
    93  }
    94  
    95  func MismatchStats(scoreMatrix [][]int64) (int64, int64, int64, int64) {
    96  	var maxMatch int64 = 0
    97  	var minMatch int64
    98  	var leastSevereMismatch int64 = scoreMatrix[0][1]
    99  	var i, j int
   100  	for i = 0; i < len(scoreMatrix); i++ {
   101  		for j = 0; j < len(scoreMatrix[i]); j++ {
   102  			if scoreMatrix[i][j] > maxMatch {
   103  				minMatch = maxMatch
   104  				maxMatch = scoreMatrix[i][j]
   105  			} else {
   106  				if scoreMatrix[i][j] < 0 && leastSevereMismatch < scoreMatrix[i][j] {
   107  					leastSevereMismatch = scoreMatrix[i][j]
   108  				}
   109  			}
   110  		}
   111  	}
   112  	var leastSevereMatchMismatchChange int64 = leastSevereMismatch - maxMatch
   113  	return maxMatch, minMatch, leastSevereMismatch, leastSevereMatchMismatchChange
   114  }
   115  
   116  func WrapPairGiraf(gg *GenomeGraph, fq fastq.PairedEndBig, seedHash map[uint64][]uint64, seedLen int, stepSize int, matrix *MatrixAln, scoreMatrix [][]int64, seedPool *sync.Pool, dnaPool *sync.Pool, sk scoreKeeper, dynamicScore dynamicScoreKeeper, seedBuildHelper *seedHelper) giraf.GirafPair {
   117  	var mappedPair giraf.GirafPair = giraf.GirafPair{
   118  		Fwd: *GraphSmithWatermanToGiraf(gg, fq.Fwd, seedHash, seedLen, stepSize, matrix, scoreMatrix, seedPool, dnaPool, sk, dynamicScore, seedBuildHelper),
   119  		Rev: *GraphSmithWatermanToGiraf(gg, fq.Rev, seedHash, seedLen, stepSize, matrix, scoreMatrix, seedPool, dnaPool, sk, dynamicScore, seedBuildHelper),
   120  	}
   121  	setGirafFlags(&mappedPair)
   122  	return mappedPair
   123  }
   124  
   125  // setGirafFlags generates the appropriate flags for each giraf in a pair.
   126  func setGirafFlags(pair *giraf.GirafPair) {
   127  	pair.Fwd.Flag = getGirafFlags(&pair.Fwd)
   128  	pair.Rev.Flag = getGirafFlags(&pair.Rev)
   129  	pair.Fwd.Flag += 8  // Forward
   130  	pair.Fwd.Flag += 16 // Paired Reads
   131  	pair.Fwd.Flag += 16 // Paired Reads
   132  	if isProperPairAlign(pair) {
   133  		pair.Fwd.Flag += 1 // Properly Aligned
   134  		pair.Rev.Flag += 1 // Properly Aligned
   135  	}
   136  }
   137  
   138  func GirafToSam(ag *giraf.Giraf) sam.Sam {
   139  	curr := sam.Sam{QName: ag.QName, Flag: 4, RName: "*", Pos: 0, MapQ: 255, Cigar: []cigar.Cigar{{Op: '*'}}, RNext: "*", PNext: 0, TLen: 0, Seq: ag.Seq, Qual: fastq.QualString(ag.Qual), Extra: "BZ:i:0\tGP:Z:-1\tXO:Z:~"}
   140  	//read is unMapped
   141  	if strings.Compare(ag.Notes[0].Value, "~") == 0 {
   142  		return curr
   143  	} else {
   144  		target := strings.Split(ag.Notes[0].Value, "=")
   145  		curr.RName = target[0]
   146  		curr.Pos = uint32(ag.Path.TStart + parse.StringToInt(target[1]))
   147  		curr.Flag = getSamFlags(ag)
   148  		if len(ag.Notes) == 2 {
   149  			curr.Extra = fmt.Sprintf("BZ:i:%d\tGP:Z:%s\tXO:i:%d\t%s", ag.AlnScore, PathToString(ag.Path.Nodes), ag.Path.TStart, giraf.NoteToString(ag.Notes[1]))
   150  		} else {
   151  			curr.Extra = fmt.Sprintf("BZ:i:%d\tGP:Z:%s\tXO:i:%d", ag.AlnScore, PathToString(ag.Path.Nodes), ag.Path.TStart)
   152  		}
   153  	}
   154  	return curr
   155  }
   156  
   157  func GirafPairToSam(ag giraf.GirafPair) sam.MatePair {
   158  	var mappedPair sam.MatePair = sam.MatePair{Fwd: sam.Sam{}, Rev: sam.Sam{}}
   159  	mappedPair.Fwd = GirafToSam(&ag.Fwd)
   160  	mappedPair.Rev = GirafToSam(&ag.Rev)
   161  	mappedPair.Fwd.Flag += 64 + 2
   162  	mappedPair.Rev.Flag += 128 + 2
   163  	if isProperPairAlign(&ag) {
   164  		mappedPair.Fwd.Flag += 1
   165  		mappedPair.Rev.Flag += 1
   166  	}
   167  	return mappedPair
   168  }
   169  
   170  func isProperPairAlign(mappedPair *giraf.GirafPair) bool {
   171  	if math.Abs(float64(mappedPair.Fwd.Path.TStart-mappedPair.Rev.Path.TStart)) < 10000 {
   172  		if mappedPair.Fwd.Path.TStart < mappedPair.Rev.Path.TStart && mappedPair.Fwd.PosStrand && !mappedPair.Rev.PosStrand {
   173  			return true
   174  		}
   175  		if mappedPair.Fwd.Path.TStart > mappedPair.Rev.Path.TStart && !mappedPair.Fwd.PosStrand && mappedPair.Rev.PosStrand {
   176  			return true
   177  		}
   178  	}
   179  	return false
   180  }
   181  
   182  func getGirafFlags(ag *giraf.Giraf) uint8 {
   183  	var answer uint8
   184  	if ag.PosStrand {
   185  		answer += 4 // Positive Strand
   186  	}
   187  	if ag.AlnScore < 1200 {
   188  		answer += 2 // Unmapped
   189  	}
   190  	return answer
   191  }
   192  
   193  func getSamFlags(ag *giraf.Giraf) uint16 {
   194  	var answer uint16
   195  	if !ag.PosStrand {
   196  		answer += 16
   197  	}
   198  	if ag.AlnScore < 1200 {
   199  		answer += 4
   200  	}
   201  	return answer
   202  }
   203  
   204  func setPath(p giraf.Path, targetStart int, nodes []uint32, targetEnd int) giraf.Path {
   205  	p.TStart = targetStart
   206  	p.Nodes = nodes
   207  	p.TEnd = targetEnd
   208  	return p
   209  }
   210  
   211  /*func vInfoToValue(n *Node) string {
   212  	var answer string
   213  	switch {
   214  	case n.Info.Variant == 1:
   215  		answer = fmt.Sprintf("%d=%s", n.Id, "snp")
   216  	case n.Info.Variant == 2:
   217  		answer = fmt.Sprintf("%d=%s", n.Id, "ins")
   218  	case n.Info.Variant == 3:
   219  		answer = fmt.Sprintf("%d=%s", n.Id, "del")
   220  	}
   221  	return answer
   222  }*/
   223  
   224  /*func infoToNotes(nodes []*Node, path []uint32) giraf.Note {
   225  	var vInfo giraf.Note = giraf.Note{Tag: []byte{'X', 'V'}, Type: 'Z'}
   226  	vInfo.Value = fmt.Sprintf("%d_%d", nodes[0].Info.Allele, nodes[0].Info.Variant)
   227  	if len(path) > 0 {
   228  		for i := 1; i < len(path); i++ {
   229  			if nodes[i].Info.Variant > 0 {
   230  				vInfo.Value += fmt.Sprintf(",%s", vInfoToValue(nodes[path[i]]))
   231  			} else {
   232  				vInfo.Value += fmt.Sprintf(",%d_%d", nodes[i].Info.Allele, nodes[path[i]].Info.Variant)
   233  			}
   234  
   235  		}
   236  	}
   237  	return vInfo
   238  }*/