github.com/vertgenlab/gonomics@v1.0.0/genomeGraph/tournamentTree.go (about)

     1  package genomeGraph
     2  
     3  import (
     4  	"log"
     5  
     6  	"github.com/vertgenlab/gonomics/dna"
     7  	"github.com/vertgenlab/gonomics/dna/dnaTwoBit"
     8  	"github.com/vertgenlab/gonomics/fastq"
     9  	"github.com/vertgenlab/gonomics/numbers"
    10  )
    11  
    12  func extendToTheRight(node *Node, read fastq.FastqBig, readStart int, nodeStart int, posStrand bool) []*SeedDev {
    13  	const basesPerInt int = 32
    14  	var nodeOffset int = nodeStart % basesPerInt
    15  	var readOffset int = 31 - ((readStart - nodeOffset + 31) % 32)
    16  	var rightMatches int = 0
    17  	var currNode *SeedDev = nil
    18  	var answer, nextParts []*SeedDev
    19  	var i, j int = 0, 0
    20  
    21  	if posStrand {
    22  		rightMatches = dnaTwoBit.CountRightMatches(node.SeqTwoBit, nodeStart, &read.Rainbow[readOffset], readStart+readOffset)
    23  	} else {
    24  		rightMatches = dnaTwoBit.CountRightMatches(node.SeqTwoBit, nodeStart, &read.RainbowRc[readOffset], readStart+readOffset)
    25  	}
    26  
    27  	// nothing aligned here
    28  	if rightMatches == 0 {
    29  		return nil
    30  	}
    31  	// we went all the way to end and there might be more
    32  	if readStart+rightMatches < len(read.Seq) && nodeStart+rightMatches == node.SeqTwoBit.Len && len(node.Next) != 0 {
    33  		for i = 0; i < len(node.Next); i++ {
    34  			nextParts = extendToTheRight(node.Next[i].Dest, read, readStart+rightMatches, 0, posStrand)
    35  			// if we aligned into the next node, make a seed for this node and point it to the next one
    36  			for j = 0; j < len(nextParts); j++ {
    37  				currNode = &SeedDev{TargetId: node.Id, TargetStart: uint32(nodeStart), QueryStart: uint32(readStart), Length: uint32(rightMatches), PosStrand: posStrand, TotalLength: uint32(rightMatches) + nextParts[j].TotalLength, NextPart: nextParts[j]}
    38  				answer = append(answer, currNode)
    39  			}
    40  		}
    41  	}
    42  
    43  	// if the alignment did not go to another node, return the match for this node
    44  	if len(answer) == 0 {
    45  		currNode = &SeedDev{TargetId: node.Id, TargetStart: uint32(nodeStart), QueryStart: uint32(readStart), Length: uint32(rightMatches), PosStrand: posStrand, TotalLength: uint32(rightMatches), NextPart: nil}
    46  		answer = []*SeedDev{currNode}
    47  	}
    48  
    49  	return answer
    50  }
    51  
    52  func extendToTheLeft(node *Node, read fastq.FastqBig, currPart *SeedDev) []*SeedDev {
    53  	var answer, prevParts []*SeedDev
    54  	var i int
    55  	var readBase dna.Base
    56  
    57  	if currPart.QueryStart > 0 && currPart.TargetStart == 0 {
    58  		for i = 0; i < len(node.Prev); i++ {
    59  			if currPart.PosStrand {
    60  				readBase = dnaTwoBit.GetBase(&read.Rainbow[0], uint(currPart.QueryStart)-1)
    61  			} else {
    62  				readBase = dnaTwoBit.GetBase(&read.RainbowRc[0], uint(currPart.QueryStart)-1)
    63  			}
    64  			if readBase == dnaTwoBit.GetBase(node.Prev[i].Dest.SeqTwoBit, uint(node.Prev[i].Dest.SeqTwoBit.Len)-1) {
    65  				prevParts = extendToTheLeftHelper(node.Prev[i].Dest, read, currPart)
    66  				answer = append(answer, prevParts...)
    67  			}
    68  		}
    69  	}
    70  
    71  	if len(answer) == 0 {
    72  		return []*SeedDev{currPart}
    73  	} else {
    74  		return answer
    75  	}
    76  }
    77  
    78  func extendToTheLeftHelper(node *Node, read fastq.FastqBig, nextPart *SeedDev) []*SeedDev {
    79  	const basesPerInt int = 32
    80  	var nodePos int = node.SeqTwoBit.Len - 1
    81  	var readPos int = int(nextPart.QueryStart) - 1
    82  	var nodeOffset int = nodePos % basesPerInt
    83  	var readOffset int = 31 - ((readPos - nodeOffset + 31) % 32)
    84  	var leftMatches int = 0
    85  	var currPart *SeedDev = nil
    86  	var prevParts, answer []*SeedDev
    87  	var i int
    88  	var readBase dna.Base
    89  
    90  	if nextPart.PosStrand {
    91  		leftMatches = numbers.Min(readPos+1, dnaTwoBit.CountLeftMatches(node.SeqTwoBit, nodePos, &read.Rainbow[readOffset], readPos+readOffset))
    92  	} else {
    93  		leftMatches = numbers.Min(readPos+1, dnaTwoBit.CountLeftMatches(node.SeqTwoBit, nodePos, &read.RainbowRc[readOffset], readPos+readOffset))
    94  	}
    95  
    96  	if leftMatches == 0 {
    97  		log.Fatal("Error: should not have zero matches to the left\n")
    98  	}
    99  
   100  	currPart = &SeedDev{TargetId: node.Id, TargetStart: uint32(nodePos - (leftMatches - 1)), QueryStart: uint32(readPos - (leftMatches - 1)), Length: uint32(leftMatches), PosStrand: nextPart.PosStrand, TotalLength: uint32(leftMatches) + nextPart.TotalLength, NextPart: nextPart}
   101  
   102  	// we went all the way to end and there might be more
   103  	if currPart.QueryStart > 0 && currPart.TargetStart == 0 {
   104  		for i = 0; i < len(node.Prev); i++ {
   105  			if nextPart.PosStrand {
   106  				readBase = dnaTwoBit.GetBase(&read.Rainbow[0], uint(currPart.QueryStart)-1)
   107  			} else {
   108  				readBase = dnaTwoBit.GetBase(&read.RainbowRc[0], uint(currPart.QueryStart)-1)
   109  			}
   110  			if readBase == dnaTwoBit.GetBase(node.Prev[i].Dest.SeqTwoBit, uint(node.Prev[i].Dest.SeqTwoBit.Len)-1) {
   111  				prevParts = extendToTheLeftHelper(node.Prev[i].Dest, read, currPart)
   112  				answer = append(answer, prevParts...)
   113  			}
   114  		}
   115  	}
   116  
   117  	// if the alignment did not go to another node, return the match for this node
   118  	if len(answer) == 0 {
   119  		answer = []*SeedDev{currPart}
   120  	}
   121  
   122  	return answer
   123  }
   124  
   125  func findSeedsInSmallMapWithMemPool(seedHash map[uint64][]uint64, nodes []Node, read fastq.FastqBig, seedLen int, perfectScore int64, scoreMatrix [][]int64) []*SeedDev {
   126  	const basesPerInt int64 = 32
   127  	var currHits []uint64
   128  	var codedNodeCoord uint64
   129  	var leftMatches int = 0
   130  	var keyIdx, keyOffset, readOffset, nodeOffset int = 0, 0, 0, 0
   131  	var nodeIdx, nodePos int64 = 0, 0
   132  	//var poolHead *SeedDev = *memoryPool
   133  	var seqKey uint64
   134  	var keyShift uint = 64 - (uint(seedLen) * 2)
   135  	var tempSeeds, finalSeeds []*SeedDev
   136  	var tempSeed *SeedDev
   137  
   138  	for readStart := 0; readStart < len(read.Seq)-seedLen+1; readStart++ {
   139  		keyIdx = (readStart + 31) / 32
   140  		keyOffset = 31 - ((readStart + 31) % 32)
   141  
   142  		// do fwd strand
   143  		seqKey = read.Rainbow[keyOffset].Seq[keyIdx] >> keyShift
   144  		currHits = seedHash[seqKey]
   145  		/*if len(currHits) > 0 {
   146  			log.Printf(" %d hits\n", len(currHits))
   147  		}*/
   148  		for _, codedNodeCoord = range currHits {
   149  			nodeIdx, nodePos = numberToChromAndPos(codedNodeCoord)
   150  			nodeOffset = int(nodePos % basesPerInt)
   151  			readOffset = 31 - ((readStart - nodeOffset + 31) % 32)
   152  
   153  			leftMatches = numbers.Min(readStart+1, dnaTwoBit.CountLeftMatches(nodes[nodeIdx].SeqTwoBit, int(nodePos), &read.Rainbow[readOffset], readStart+readOffset))
   154  			//rightMatches = dnaTwoBit.CountRightMatches(nodes[nodeIdx].SeqTwoBit, int(nodePos), read.Rainbow[readOffset], readStart+readOffset)
   155  			tempSeeds = extendToTheRight(&nodes[nodeIdx], read, readStart-(leftMatches-1), int(nodePos)-(leftMatches-1), true)
   156  			//log.Printf("After extendToTheRight fwd:\n")
   157  			//printSeedDev(tempSeeds)
   158  			for _, tempSeed = range tempSeeds {
   159  				finalSeeds = append(finalSeeds, extendToTheLeft(&nodes[nodeIdx], read, tempSeed)...)
   160  			}
   161  			//log.Printf("After extendToTheLeft fwd:\n")
   162  			//printSeedDev(finalSeeds)
   163  			// TODO: Bring back speed optimizations once we are sure of correctness
   164  			/*if leftMatches < 1 || rightMatches < seedLen {
   165  				log.Fatalf("No matches found at seed location: %s %d, %d %d", dna.BasesToString(read.Seq), readStart, nodeIdx, nodePos)
   166  			}*/
   167  
   168  			/*if seedCouldBeBetter(int64(leftMatches+rightMatches-1), bestScore, perfectScore, int64(len(read.Seq)), 100, 90, -196, -296) {
   169  				if poolHead != nil {
   170  					currSeed = poolHead
   171  					poolHead = poolHead.Next
   172  				} else {
   173  					currSeed = &SeedDev{}
   174  				}
   175  				currSeed.TargetId = uint32(nodeIdx)
   176  				currSeed.TargetStart = uint32(int(nodePos) - leftMatches + 1)
   177  				currSeed.QueryStart = uint32(readStart - leftMatches + 1)
   178  				currSeed.Length = uint32(leftMatches + rightMatches - 1)
   179  				currSeed.PosStrand = true
   180  				currSeed.TotalLength = uint32(leftMatches + rightMatches - 1)
   181  				currSeed.Next = hits
   182  				hits = currSeed
   183  				seedScore = scoreSeedFastqBig(currSeed, read, scoreMatrix)
   184  				if seedScore > bestScore {
   185  					bestScore = seedScore
   186  				}
   187  			}*/
   188  		}
   189  
   190  		// do rev strand
   191  		seqKey = read.RainbowRc[keyOffset].Seq[keyIdx] >> keyShift
   192  		currHits = seedHash[seqKey]
   193  		/*if len(currHits) > 0 {
   194  			log.Printf(" %d hits\n", len(currHits))
   195  		}*/
   196  		for _, codedNodeCoord = range currHits {
   197  			nodeIdx, nodePos = numberToChromAndPos(codedNodeCoord)
   198  			nodeOffset = int(nodePos % basesPerInt)
   199  			readOffset = 31 - ((readStart - nodeOffset + 31) % 32)
   200  
   201  			leftMatches = numbers.Min(readStart+1, dnaTwoBit.CountLeftMatches(nodes[nodeIdx].SeqTwoBit, int(nodePos), &read.RainbowRc[readOffset], readStart+readOffset))
   202  			//rightMatches = dnaTwoBit.CountRightMatches(nodes[nodeIdx].SeqTwoBit, int(nodePos), read.RainbowRc[readOffset], readStart+readOffset)
   203  			tempSeeds = extendToTheRight(&nodes[nodeIdx], read, readStart-(leftMatches-1), int(nodePos)-(leftMatches-1), false)
   204  			//log.Printf("After extendToTheRight rev:\n")
   205  			//printSeedDev(tempSeeds)
   206  			for _, tempSeed = range tempSeeds {
   207  				//log.Printf("tempSeed.QueryStart = %d\n", tempSeed.QueryStart)
   208  				finalSeeds = append(finalSeeds, extendToTheLeft(&nodes[nodeIdx], read, tempSeed)...)
   209  			}
   210  			//log.Printf("After extendToTheLeft rev:\n")
   211  			//printSeedDev(finalSeeds)
   212  			// TODO: bring back speed optimizations
   213  			/*if leftMatches < 1 || rightMatches < seedLen {
   214  				log.Fatalf("No matches found at seed location: %s %d, %d %d", dna.BasesToString(read.SeqRc), readStart, nodeIdx, nodePos)
   215  			}*/
   216  			/*if seedCouldBeBetter(int64(leftMatches+rightMatches-1), bestScore, perfectScore, int64(len(read.SeqRc)), 100, 90, -196, -296) {
   217  				if poolHead != nil {
   218  					currSeed = poolHead
   219  					poolHead = poolHead.Next
   220  				} else {
   221  					currSeed = &SeedDev{}
   222  				}
   223  				currSeed.TargetId = uint32(nodeIdx)
   224  				currSeed.TargetStart = uint32(int(nodePos) - leftMatches + 1)
   225  				currSeed.QueryStart = uint32(readStart - leftMatches + 1)
   226  				currSeed.Length = uint32(leftMatches + rightMatches - 1)
   227  				currSeed.PosStrand = false
   228  				currSeed.TotalLength = uint32(leftMatches + rightMatches - 1)
   229  				currSeed.Next = hits
   230  				hits = currSeed
   231  				seedScore = scoreSeedFastqBig(currSeed, read, scoreMatrix)
   232  				if seedScore > bestScore {
   233  					bestScore = seedScore
   234  				}
   235  			}*/
   236  		}
   237  	}
   238  
   239  	/*var finalHits, nextSeed *SeedDev
   240  	for currSeed = hits; currSeed != nil; currSeed = nextSeed {
   241  		nextSeed = currSeed.Next
   242  		if seedCouldBeBetter(int64(currSeed.TotalLength), bestScore, perfectScore, int64(len(read.SeqRc)), 100, 90, -196, -296) {
   243  			currSeed.Next = finalHits
   244  			finalHits = currSeed
   245  		} else {
   246  			currSeed.Next = poolHead
   247  			poolHead = currSeed
   248  		}
   249  	}
   250  
   251  	*memoryPool = poolHead
   252  	return hits*/
   253  	//printSeedDev(finalSeeds)
   254  	return finalSeeds
   255  }
   256  
   257  // does not support extending along edges, so commented out
   258  /*func findSeedsInSmallMap(seedHash map[uint64][]uint64, nodes []*Node, read *fastq.FastqBig, seedLen int, perfectScore int64, scoreMatrix [][]int64) []*SeedDev {
   259  	var currHits []uint64
   260  	var leftMatches int = 0
   261  	var nodeIdx int64 = 0
   262  	var readStart, keyIdx, keyOffset int
   263  	var seqKey, codedNodeCoord uint64
   264  	var keyShift uint
   265  	var nodePos int64
   266  	var nodeOffset, basesPerInt, readOffset int
   267  	var tempSeeds []*SeedDev
   268  	var tempSeed *SeedDev
   269  	var finalSeeds []*SeedDev
   270  
   271  	log.Printf("About to find seeds:\n")
   272  	for readStart = 0; readStart < len(read.Seq)-seedLen+1; readStart++ {
   273  		keyIdx = (readStart + 31) / 32
   274  		keyOffset = 31 - ((readStart + 31) % 32)
   275  
   276  		// do fwd strand
   277  		seqKey = read.Rainbow[keyOffset].Seq[keyIdx] >> keyShift
   278  		currHits = seedHash[seqKey]
   279  		log.Printf("Found %d forward hits\n", len(currHits))
   280  		for _, codedNodeCoord = range currHits {
   281  			nodeIdx, nodePos = numberToChromAndPos(codedNodeCoord)
   282  			nodeOffset = int(nodePos) % basesPerInt
   283  			readOffset = 31 - ((readStart - nodeOffset + 31) % 32)
   284  
   285  			leftMatches = numbers.Min(readStart+1, dnaTwoBit.CountLeftMatches(nodes[nodeIdx].SeqTwoBit, int(nodePos), read.Rainbow[readOffset], readStart+readOffset))
   286  			//rightMatches = dnaTwoBit.CountRightMatches(nodes[nodeIdx].SeqTwoBit, int(nodePos), read.Rainbow[readOffset], readStart+readOffset)
   287  			tempSeeds = extendToTheRight(nodes[nodeIdx], read, readStart-leftMatches, int(nodePos)-leftMatches, true)
   288  			log.Printf("After extendToTheRight fwd:\n")
   289  			printSeedDev(tempSeeds)
   290  			for _, tempSeed = range tempSeeds {
   291  				finalSeeds = append(finalSeeds, extendToTheLeft(nodes[nodeIdx], read, tempSeed)...)
   292  			}
   293  			log.Printf("After extendToTheLeft fwd:\n")
   294  			printSeedDev(finalSeeds)
   295  		}
   296  
   297  		// do rev strand
   298  		seqKey = read.RainbowRc[keyOffset].Seq[keyIdx] >> keyShift
   299  		currHits = seedHash[seqKey]
   300  		log.Printf("Found %d reverse hits\n", len(currHits))
   301  		for _, codedNodeCoord = range currHits {
   302  			nodeIdx, nodePos = numberToChromAndPos(codedNodeCoord)
   303  			nodeOffset = int(nodePos) % basesPerInt
   304  			readOffset = 31 - ((readStart - nodeOffset + 31) % 32)
   305  
   306  			leftMatches = numbers.Min(readStart+1, dnaTwoBit.CountLeftMatches(nodes[nodeIdx].SeqTwoBit, int(nodePos), read.RainbowRc[readOffset], readStart+readOffset))
   307  			//rightMatches = dnaTwoBit.CountRightMatches(nodes[nodeIdx].SeqTwoBit, int(nodePos), read.RainbowRc[readOffset], readStart+readOffset)
   308  			tempSeeds = extendToTheRight(nodes[nodeIdx], read, readStart-leftMatches, int(nodePos)-leftMatches, false)
   309  			log.Printf("After extendToTheRight rev:\n")
   310  			printSeedDev(tempSeeds)
   311  			for _, tempSeed = range tempSeeds {
   312  				finalSeeds = append(finalSeeds, extendToTheLeft(nodes[nodeIdx], read, tempSeed)...)
   313  			}
   314  			log.Printf("After extendToTheLeft Rev:\n")
   315  			printSeedDev(finalSeeds)
   316  		}
   317  	}
   318  	return finalSeeds
   319  	// TODO: Bring back speed optimizations
   320  	var badIdx = len(hits) - 1
   321  	var badCount = 0
   322  	for i := 0; i <= badIdx; {
   323  		if !seedCouldBeBetter(int64(hits[i].TotalLength), bestScore, perfectScore, int64(len(read.SeqRc)), 100, 90, -196, -296) {
   324  			hits[i], hits[badIdx] = hits[badIdx], hits[i]
   325  			badIdx--
   326  			badCount++
   327  		} else {
   328  			i++
   329  		}
   330  	}
   331  	return hits[0:(badIdx + 1)]
   332  }*/
   333  
   334  /*func findSeedsInSlice(seedHash [][]uint64, read *fastq.Fastq, seedLen int, posStrand bool) []*SeedDev {
   335  	var codedSeq uint64 = 0
   336  	var allHits []*SeedDev = make([]*SeedDev, 0)
   337  	for subSeqStart := 0; subSeqStart < len(read.Seq)-seedLen+1; subSeqStart++ {
   338  		if dna.CountBaseInterval(read.Seq, dna.N, subSeqStart, subSeqStart+seedLen) == 0 {
   339  			codedSeq = dnaToNumber(read.Seq, subSeqStart, subSeqStart+seedLen)
   340  			//fmt.Printf("Coded seq is:%d, seedLength:%d\n", codedSeq, seedLen)
   341  			currHits := seedHash[codedSeq]
   342  			noMerge, merged := mergeSeedLists(prevHits, currHits, uint32(subSeqStart), posStrand)
   343  			allHits = append(allHits, noMerge...)
   344  			prevHits = merged
   345  		}
   346  
   347  	}
   348  	allHits = append(allHits, prevHits...)
   349  
   350  	return allHits
   351  }*/