github.com/vertgenlab/gonomics@v1.0.0/genomeGraph/tournamentTree.go (about) 1 package genomeGraph 2 3 import ( 4 "log" 5 6 "github.com/vertgenlab/gonomics/dna" 7 "github.com/vertgenlab/gonomics/dna/dnaTwoBit" 8 "github.com/vertgenlab/gonomics/fastq" 9 "github.com/vertgenlab/gonomics/numbers" 10 ) 11 12 func extendToTheRight(node *Node, read fastq.FastqBig, readStart int, nodeStart int, posStrand bool) []*SeedDev { 13 const basesPerInt int = 32 14 var nodeOffset int = nodeStart % basesPerInt 15 var readOffset int = 31 - ((readStart - nodeOffset + 31) % 32) 16 var rightMatches int = 0 17 var currNode *SeedDev = nil 18 var answer, nextParts []*SeedDev 19 var i, j int = 0, 0 20 21 if posStrand { 22 rightMatches = dnaTwoBit.CountRightMatches(node.SeqTwoBit, nodeStart, &read.Rainbow[readOffset], readStart+readOffset) 23 } else { 24 rightMatches = dnaTwoBit.CountRightMatches(node.SeqTwoBit, nodeStart, &read.RainbowRc[readOffset], readStart+readOffset) 25 } 26 27 // nothing aligned here 28 if rightMatches == 0 { 29 return nil 30 } 31 // we went all the way to end and there might be more 32 if readStart+rightMatches < len(read.Seq) && nodeStart+rightMatches == node.SeqTwoBit.Len && len(node.Next) != 0 { 33 for i = 0; i < len(node.Next); i++ { 34 nextParts = extendToTheRight(node.Next[i].Dest, read, readStart+rightMatches, 0, posStrand) 35 // if we aligned into the next node, make a seed for this node and point it to the next one 36 for j = 0; j < len(nextParts); j++ { 37 currNode = &SeedDev{TargetId: node.Id, TargetStart: uint32(nodeStart), QueryStart: uint32(readStart), Length: uint32(rightMatches), PosStrand: posStrand, TotalLength: uint32(rightMatches) + nextParts[j].TotalLength, NextPart: nextParts[j]} 38 answer = append(answer, currNode) 39 } 40 } 41 } 42 43 // if the alignment did not go to another node, return the match for this node 44 if len(answer) == 0 { 45 currNode = &SeedDev{TargetId: node.Id, TargetStart: uint32(nodeStart), QueryStart: uint32(readStart), Length: uint32(rightMatches), PosStrand: posStrand, TotalLength: uint32(rightMatches), NextPart: nil} 46 answer = []*SeedDev{currNode} 47 } 48 49 return answer 50 } 51 52 func extendToTheLeft(node *Node, read fastq.FastqBig, currPart *SeedDev) []*SeedDev { 53 var answer, prevParts []*SeedDev 54 var i int 55 var readBase dna.Base 56 57 if currPart.QueryStart > 0 && currPart.TargetStart == 0 { 58 for i = 0; i < len(node.Prev); i++ { 59 if currPart.PosStrand { 60 readBase = dnaTwoBit.GetBase(&read.Rainbow[0], uint(currPart.QueryStart)-1) 61 } else { 62 readBase = dnaTwoBit.GetBase(&read.RainbowRc[0], uint(currPart.QueryStart)-1) 63 } 64 if readBase == dnaTwoBit.GetBase(node.Prev[i].Dest.SeqTwoBit, uint(node.Prev[i].Dest.SeqTwoBit.Len)-1) { 65 prevParts = extendToTheLeftHelper(node.Prev[i].Dest, read, currPart) 66 answer = append(answer, prevParts...) 67 } 68 } 69 } 70 71 if len(answer) == 0 { 72 return []*SeedDev{currPart} 73 } else { 74 return answer 75 } 76 } 77 78 func extendToTheLeftHelper(node *Node, read fastq.FastqBig, nextPart *SeedDev) []*SeedDev { 79 const basesPerInt int = 32 80 var nodePos int = node.SeqTwoBit.Len - 1 81 var readPos int = int(nextPart.QueryStart) - 1 82 var nodeOffset int = nodePos % basesPerInt 83 var readOffset int = 31 - ((readPos - nodeOffset + 31) % 32) 84 var leftMatches int = 0 85 var currPart *SeedDev = nil 86 var prevParts, answer []*SeedDev 87 var i int 88 var readBase dna.Base 89 90 if nextPart.PosStrand { 91 leftMatches = numbers.Min(readPos+1, dnaTwoBit.CountLeftMatches(node.SeqTwoBit, nodePos, &read.Rainbow[readOffset], readPos+readOffset)) 92 } else { 93 leftMatches = numbers.Min(readPos+1, dnaTwoBit.CountLeftMatches(node.SeqTwoBit, nodePos, &read.RainbowRc[readOffset], readPos+readOffset)) 94 } 95 96 if leftMatches == 0 { 97 log.Fatal("Error: should not have zero matches to the left\n") 98 } 99 100 currPart = &SeedDev{TargetId: node.Id, TargetStart: uint32(nodePos - (leftMatches - 1)), QueryStart: uint32(readPos - (leftMatches - 1)), Length: uint32(leftMatches), PosStrand: nextPart.PosStrand, TotalLength: uint32(leftMatches) + nextPart.TotalLength, NextPart: nextPart} 101 102 // we went all the way to end and there might be more 103 if currPart.QueryStart > 0 && currPart.TargetStart == 0 { 104 for i = 0; i < len(node.Prev); i++ { 105 if nextPart.PosStrand { 106 readBase = dnaTwoBit.GetBase(&read.Rainbow[0], uint(currPart.QueryStart)-1) 107 } else { 108 readBase = dnaTwoBit.GetBase(&read.RainbowRc[0], uint(currPart.QueryStart)-1) 109 } 110 if readBase == dnaTwoBit.GetBase(node.Prev[i].Dest.SeqTwoBit, uint(node.Prev[i].Dest.SeqTwoBit.Len)-1) { 111 prevParts = extendToTheLeftHelper(node.Prev[i].Dest, read, currPart) 112 answer = append(answer, prevParts...) 113 } 114 } 115 } 116 117 // if the alignment did not go to another node, return the match for this node 118 if len(answer) == 0 { 119 answer = []*SeedDev{currPart} 120 } 121 122 return answer 123 } 124 125 func findSeedsInSmallMapWithMemPool(seedHash map[uint64][]uint64, nodes []Node, read fastq.FastqBig, seedLen int, perfectScore int64, scoreMatrix [][]int64) []*SeedDev { 126 const basesPerInt int64 = 32 127 var currHits []uint64 128 var codedNodeCoord uint64 129 var leftMatches int = 0 130 var keyIdx, keyOffset, readOffset, nodeOffset int = 0, 0, 0, 0 131 var nodeIdx, nodePos int64 = 0, 0 132 //var poolHead *SeedDev = *memoryPool 133 var seqKey uint64 134 var keyShift uint = 64 - (uint(seedLen) * 2) 135 var tempSeeds, finalSeeds []*SeedDev 136 var tempSeed *SeedDev 137 138 for readStart := 0; readStart < len(read.Seq)-seedLen+1; readStart++ { 139 keyIdx = (readStart + 31) / 32 140 keyOffset = 31 - ((readStart + 31) % 32) 141 142 // do fwd strand 143 seqKey = read.Rainbow[keyOffset].Seq[keyIdx] >> keyShift 144 currHits = seedHash[seqKey] 145 /*if len(currHits) > 0 { 146 log.Printf(" %d hits\n", len(currHits)) 147 }*/ 148 for _, codedNodeCoord = range currHits { 149 nodeIdx, nodePos = numberToChromAndPos(codedNodeCoord) 150 nodeOffset = int(nodePos % basesPerInt) 151 readOffset = 31 - ((readStart - nodeOffset + 31) % 32) 152 153 leftMatches = numbers.Min(readStart+1, dnaTwoBit.CountLeftMatches(nodes[nodeIdx].SeqTwoBit, int(nodePos), &read.Rainbow[readOffset], readStart+readOffset)) 154 //rightMatches = dnaTwoBit.CountRightMatches(nodes[nodeIdx].SeqTwoBit, int(nodePos), read.Rainbow[readOffset], readStart+readOffset) 155 tempSeeds = extendToTheRight(&nodes[nodeIdx], read, readStart-(leftMatches-1), int(nodePos)-(leftMatches-1), true) 156 //log.Printf("After extendToTheRight fwd:\n") 157 //printSeedDev(tempSeeds) 158 for _, tempSeed = range tempSeeds { 159 finalSeeds = append(finalSeeds, extendToTheLeft(&nodes[nodeIdx], read, tempSeed)...) 160 } 161 //log.Printf("After extendToTheLeft fwd:\n") 162 //printSeedDev(finalSeeds) 163 // TODO: Bring back speed optimizations once we are sure of correctness 164 /*if leftMatches < 1 || rightMatches < seedLen { 165 log.Fatalf("No matches found at seed location: %s %d, %d %d", dna.BasesToString(read.Seq), readStart, nodeIdx, nodePos) 166 }*/ 167 168 /*if seedCouldBeBetter(int64(leftMatches+rightMatches-1), bestScore, perfectScore, int64(len(read.Seq)), 100, 90, -196, -296) { 169 if poolHead != nil { 170 currSeed = poolHead 171 poolHead = poolHead.Next 172 } else { 173 currSeed = &SeedDev{} 174 } 175 currSeed.TargetId = uint32(nodeIdx) 176 currSeed.TargetStart = uint32(int(nodePos) - leftMatches + 1) 177 currSeed.QueryStart = uint32(readStart - leftMatches + 1) 178 currSeed.Length = uint32(leftMatches + rightMatches - 1) 179 currSeed.PosStrand = true 180 currSeed.TotalLength = uint32(leftMatches + rightMatches - 1) 181 currSeed.Next = hits 182 hits = currSeed 183 seedScore = scoreSeedFastqBig(currSeed, read, scoreMatrix) 184 if seedScore > bestScore { 185 bestScore = seedScore 186 } 187 }*/ 188 } 189 190 // do rev strand 191 seqKey = read.RainbowRc[keyOffset].Seq[keyIdx] >> keyShift 192 currHits = seedHash[seqKey] 193 /*if len(currHits) > 0 { 194 log.Printf(" %d hits\n", len(currHits)) 195 }*/ 196 for _, codedNodeCoord = range currHits { 197 nodeIdx, nodePos = numberToChromAndPos(codedNodeCoord) 198 nodeOffset = int(nodePos % basesPerInt) 199 readOffset = 31 - ((readStart - nodeOffset + 31) % 32) 200 201 leftMatches = numbers.Min(readStart+1, dnaTwoBit.CountLeftMatches(nodes[nodeIdx].SeqTwoBit, int(nodePos), &read.RainbowRc[readOffset], readStart+readOffset)) 202 //rightMatches = dnaTwoBit.CountRightMatches(nodes[nodeIdx].SeqTwoBit, int(nodePos), read.RainbowRc[readOffset], readStart+readOffset) 203 tempSeeds = extendToTheRight(&nodes[nodeIdx], read, readStart-(leftMatches-1), int(nodePos)-(leftMatches-1), false) 204 //log.Printf("After extendToTheRight rev:\n") 205 //printSeedDev(tempSeeds) 206 for _, tempSeed = range tempSeeds { 207 //log.Printf("tempSeed.QueryStart = %d\n", tempSeed.QueryStart) 208 finalSeeds = append(finalSeeds, extendToTheLeft(&nodes[nodeIdx], read, tempSeed)...) 209 } 210 //log.Printf("After extendToTheLeft rev:\n") 211 //printSeedDev(finalSeeds) 212 // TODO: bring back speed optimizations 213 /*if leftMatches < 1 || rightMatches < seedLen { 214 log.Fatalf("No matches found at seed location: %s %d, %d %d", dna.BasesToString(read.SeqRc), readStart, nodeIdx, nodePos) 215 }*/ 216 /*if seedCouldBeBetter(int64(leftMatches+rightMatches-1), bestScore, perfectScore, int64(len(read.SeqRc)), 100, 90, -196, -296) { 217 if poolHead != nil { 218 currSeed = poolHead 219 poolHead = poolHead.Next 220 } else { 221 currSeed = &SeedDev{} 222 } 223 currSeed.TargetId = uint32(nodeIdx) 224 currSeed.TargetStart = uint32(int(nodePos) - leftMatches + 1) 225 currSeed.QueryStart = uint32(readStart - leftMatches + 1) 226 currSeed.Length = uint32(leftMatches + rightMatches - 1) 227 currSeed.PosStrand = false 228 currSeed.TotalLength = uint32(leftMatches + rightMatches - 1) 229 currSeed.Next = hits 230 hits = currSeed 231 seedScore = scoreSeedFastqBig(currSeed, read, scoreMatrix) 232 if seedScore > bestScore { 233 bestScore = seedScore 234 } 235 }*/ 236 } 237 } 238 239 /*var finalHits, nextSeed *SeedDev 240 for currSeed = hits; currSeed != nil; currSeed = nextSeed { 241 nextSeed = currSeed.Next 242 if seedCouldBeBetter(int64(currSeed.TotalLength), bestScore, perfectScore, int64(len(read.SeqRc)), 100, 90, -196, -296) { 243 currSeed.Next = finalHits 244 finalHits = currSeed 245 } else { 246 currSeed.Next = poolHead 247 poolHead = currSeed 248 } 249 } 250 251 *memoryPool = poolHead 252 return hits*/ 253 //printSeedDev(finalSeeds) 254 return finalSeeds 255 } 256 257 // does not support extending along edges, so commented out 258 /*func findSeedsInSmallMap(seedHash map[uint64][]uint64, nodes []*Node, read *fastq.FastqBig, seedLen int, perfectScore int64, scoreMatrix [][]int64) []*SeedDev { 259 var currHits []uint64 260 var leftMatches int = 0 261 var nodeIdx int64 = 0 262 var readStart, keyIdx, keyOffset int 263 var seqKey, codedNodeCoord uint64 264 var keyShift uint 265 var nodePos int64 266 var nodeOffset, basesPerInt, readOffset int 267 var tempSeeds []*SeedDev 268 var tempSeed *SeedDev 269 var finalSeeds []*SeedDev 270 271 log.Printf("About to find seeds:\n") 272 for readStart = 0; readStart < len(read.Seq)-seedLen+1; readStart++ { 273 keyIdx = (readStart + 31) / 32 274 keyOffset = 31 - ((readStart + 31) % 32) 275 276 // do fwd strand 277 seqKey = read.Rainbow[keyOffset].Seq[keyIdx] >> keyShift 278 currHits = seedHash[seqKey] 279 log.Printf("Found %d forward hits\n", len(currHits)) 280 for _, codedNodeCoord = range currHits { 281 nodeIdx, nodePos = numberToChromAndPos(codedNodeCoord) 282 nodeOffset = int(nodePos) % basesPerInt 283 readOffset = 31 - ((readStart - nodeOffset + 31) % 32) 284 285 leftMatches = numbers.Min(readStart+1, dnaTwoBit.CountLeftMatches(nodes[nodeIdx].SeqTwoBit, int(nodePos), read.Rainbow[readOffset], readStart+readOffset)) 286 //rightMatches = dnaTwoBit.CountRightMatches(nodes[nodeIdx].SeqTwoBit, int(nodePos), read.Rainbow[readOffset], readStart+readOffset) 287 tempSeeds = extendToTheRight(nodes[nodeIdx], read, readStart-leftMatches, int(nodePos)-leftMatches, true) 288 log.Printf("After extendToTheRight fwd:\n") 289 printSeedDev(tempSeeds) 290 for _, tempSeed = range tempSeeds { 291 finalSeeds = append(finalSeeds, extendToTheLeft(nodes[nodeIdx], read, tempSeed)...) 292 } 293 log.Printf("After extendToTheLeft fwd:\n") 294 printSeedDev(finalSeeds) 295 } 296 297 // do rev strand 298 seqKey = read.RainbowRc[keyOffset].Seq[keyIdx] >> keyShift 299 currHits = seedHash[seqKey] 300 log.Printf("Found %d reverse hits\n", len(currHits)) 301 for _, codedNodeCoord = range currHits { 302 nodeIdx, nodePos = numberToChromAndPos(codedNodeCoord) 303 nodeOffset = int(nodePos) % basesPerInt 304 readOffset = 31 - ((readStart - nodeOffset + 31) % 32) 305 306 leftMatches = numbers.Min(readStart+1, dnaTwoBit.CountLeftMatches(nodes[nodeIdx].SeqTwoBit, int(nodePos), read.RainbowRc[readOffset], readStart+readOffset)) 307 //rightMatches = dnaTwoBit.CountRightMatches(nodes[nodeIdx].SeqTwoBit, int(nodePos), read.RainbowRc[readOffset], readStart+readOffset) 308 tempSeeds = extendToTheRight(nodes[nodeIdx], read, readStart-leftMatches, int(nodePos)-leftMatches, false) 309 log.Printf("After extendToTheRight rev:\n") 310 printSeedDev(tempSeeds) 311 for _, tempSeed = range tempSeeds { 312 finalSeeds = append(finalSeeds, extendToTheLeft(nodes[nodeIdx], read, tempSeed)...) 313 } 314 log.Printf("After extendToTheLeft Rev:\n") 315 printSeedDev(finalSeeds) 316 } 317 } 318 return finalSeeds 319 // TODO: Bring back speed optimizations 320 var badIdx = len(hits) - 1 321 var badCount = 0 322 for i := 0; i <= badIdx; { 323 if !seedCouldBeBetter(int64(hits[i].TotalLength), bestScore, perfectScore, int64(len(read.SeqRc)), 100, 90, -196, -296) { 324 hits[i], hits[badIdx] = hits[badIdx], hits[i] 325 badIdx-- 326 badCount++ 327 } else { 328 i++ 329 } 330 } 331 return hits[0:(badIdx + 1)] 332 }*/ 333 334 /*func findSeedsInSlice(seedHash [][]uint64, read *fastq.Fastq, seedLen int, posStrand bool) []*SeedDev { 335 var codedSeq uint64 = 0 336 var allHits []*SeedDev = make([]*SeedDev, 0) 337 for subSeqStart := 0; subSeqStart < len(read.Seq)-seedLen+1; subSeqStart++ { 338 if dna.CountBaseInterval(read.Seq, dna.N, subSeqStart, subSeqStart+seedLen) == 0 { 339 codedSeq = dnaToNumber(read.Seq, subSeqStart, subSeqStart+seedLen) 340 //fmt.Printf("Coded seq is:%d, seedLength:%d\n", codedSeq, seedLen) 341 currHits := seedHash[codedSeq] 342 noMerge, merged := mergeSeedLists(prevHits, currHits, uint32(subSeqStart), posStrand) 343 allHits = append(allHits, noMerge...) 344 prevHits = merged 345 } 346 347 } 348 allHits = append(allHits, prevHits...) 349 350 return allHits 351 }*/