github.com/vertgenlab/gonomics@v1.0.0/genomeGraph/graphTools.go (about) 1 package genomeGraph 2 3 import ( 4 "github.com/vertgenlab/gonomics/dna" 5 "github.com/vertgenlab/gonomics/fasta" 6 "github.com/vertgenlab/gonomics/numbers" 7 "github.com/vertgenlab/gonomics/numbers/parse" 8 "github.com/vertgenlab/gonomics/vcf" 9 "log" 10 "strings" 11 ) 12 13 func VariantGraph(ref <-chan fasta.Fasta, vcfMap map[string][]vcf.Vcf) *GenomeGraph { 14 gg := EmptyGraph() 15 var filterVcf []vcf.Vcf = make([]vcf.Vcf, 0) 16 for val := range ref { 17 chr := val // not sure this is necessary, but I want to make sure the function does not break 18 // if the fasta being a pointer was important. 19 filterVcf = vcfMap[chr.Name] 20 if len(filterVcf) != 0 { 21 vcf.Sort(filterVcf) 22 gg = vChrGraph(gg, chr, filterVcf) 23 } else { 24 // Given the input is a vcf containing large structural variance 25 // It is possible for a chromosome to contain no call variants (ex. chrM). 26 // In such a case, we add the entire chromosome as one node. 27 chrNode := &Node{Id: uint32(len(gg.Nodes)), Seq: chr.Seq, Prev: nil, Next: nil} 28 AddNode(gg, chrNode) 29 } 30 } 31 gg = SortGraph(gg) 32 return gg 33 } 34 35 /* 36 func SplitGraphChr(reference []fasta.Fasta, vcfs []*vcf.Vcf) map[string]*GenomeGraph { 37 gg := make(map[string]*GenomeGraph) 38 vcfSplit := vcf.VcfSplit(vcfs, reference) 39 if len(vcfSplit) != len(reference) { 40 log.Fatal("Slice of vcfs do not equal reference length") 41 } else { 42 for i := 0; i < len(reference); i++ { 43 gg[reference[i].Name] = vChrGraph(EmptyGraph(), reference[i], vcfSplit[i]) 44 } 45 } 46 return gg 47 } 48 */ 49 50 func vChrGraph(genome *GenomeGraph, chr fasta.Fasta, vcfsChr []vcf.Vcf) *GenomeGraph { 51 vcfsChr = append(vcfsChr, vcf.Vcf{Chr: chr.Name, Pos: len(chr.Seq)}) 52 //log.Printf("Found %d variants on %s", len(vcfsChr), chr.Name) 53 fasta.ToUpper(chr) 54 var currMatch *Node = &Node{} 55 var lastMatch *Node = &Node{} 56 var refAllele, altAllele *Node = &Node{}, &Node{} 57 var prev []*Node = nil 58 var i, j, edge int 59 var index int = 0 60 for i = 0; i < len(vcfsChr)-1; i++ { 61 if strings.Compare(chr.Name, vcfsChr[i].Chr) != 0 { 62 log.Fatalf("Error: chromosome names do not match...\n") 63 } 64 if vcfsChr[i].Pos-index > 0 { 65 currMatch = &Node{Id: uint32(len(genome.Nodes)), Seq: chr.Seq[index : vcfsChr[i].Pos-1], Prev: nil, Next: make([]Edge, 0, 2)} 66 if len(currMatch.Seq) == 0 { 67 currMatch = lastMatch 68 //assuming we already created the ref allele, we only need to create 69 //the alt alleles this iteration 70 if vcf.Snp(vcfsChr[i]) { 71 altAllele = &Node{Id: uint32(len(genome.Nodes)), Seq: dna.StringToBases(vcfsChr[i].Alt[0]), Prev: nil, Next: nil} 72 altAllele = AddNode(genome, altAllele) 73 AddEdge(currMatch, altAllele, 0.5) 74 } else if vcf.Ins(vcfsChr[i]) { 75 insertion := &Node{Id: uint32(len(genome.Nodes)), Seq: dna.StringToBases(vcfsChr[i].Alt[0])[1:], Prev: nil, Next: nil} 76 insertion = AddNode(genome, insertion) 77 AddEdge(currMatch, insertion, 1) 78 index = vcfsChr[i].Pos - 1 79 } else if vcf.Del(vcfsChr[i]) { 80 deletion := &Node{Id: uint32(len(genome.Nodes)), Seq: dna.StringToBases(vcfsChr[i].Ref)[1:], Prev: nil, Next: nil} 81 deletion = AddNode(genome, deletion) 82 AddEdge(currMatch, deletion, 1) 83 if strings.Contains(vcfsChr[i].Id, "pbsv") { 84 index = numbers.Min(vcfsChr[i].Pos+len(deletion.Seq)-1, vcfsChr[i+1].Pos-1) 85 } else { 86 index = vcfsChr[i].Pos + len(deletion.Seq) 87 } 88 } else if strings.Contains(vcfsChr[i].Info, "SVTYPE=SNP;INS") || strings.Contains(vcfsChr[i].Info, "SVTYPE=SNP;DEL") || strings.Contains(vcfsChr[i].Info, "SVTYPE=HAP") { 89 altAllele := &Node{Id: uint32(len(genome.Nodes)), Seq: dna.StringToBases(vcfsChr[i].Alt[0]), Prev: nil, Next: nil} 90 altAllele = AddNode(genome, altAllele) 91 AddEdge(currMatch, altAllele, 1) 92 index = vcfsChr[i].Pos + len(refAllele.Seq) - 1 93 } 94 lastMatch = currMatch 95 } else if len(currMatch.Seq) > 0 { 96 currMatch = AddNode(genome, currMatch) 97 if lastMatch != nil { 98 if len(lastMatch.Next) > 0 { 99 for edge = 0; edge < len(lastMatch.Next); edge++ { 100 AddEdge(lastMatch.Next[edge].Dest, currMatch, 1) 101 } 102 } 103 if i > 0 { 104 if vcf.Snp(vcfsChr[i-1]) || isHaplotypeBlock(vcfsChr[i-1]) { 105 AddEdge(altAllele, currMatch, 1) 106 } 107 } 108 AddEdge(lastMatch, currMatch, 1) 109 SetEvenWeights(lastMatch) 110 } 111 if vcf.Snp(vcfsChr[i]) { 112 refAllele = &Node{Id: uint32(len(genome.Nodes)), Seq: dna.StringToBases(vcfsChr[i].Ref), Prev: nil, Next: nil} 113 refAllele = AddNode(genome, refAllele) 114 AddEdge(currMatch, refAllele, 0.5) 115 116 altAllele = &Node{Id: uint32(len(genome.Nodes)), Seq: dna.StringToBases(vcfsChr[i].Alt[0]), Prev: nil, Next: nil} 117 altAllele = AddNode(genome, altAllele) 118 AddEdge(currMatch, altAllele, 0.5) 119 120 currMatch = refAllele 121 index = vcfsChr[i].Pos 122 for j = i + 1; j < len(vcfsChr)-1; j++ { 123 if vcf.Snp(vcfsChr[j-1]) && vcf.Snp(vcfsChr[j]) && vcfsChr[j].Pos-1 == vcfsChr[j-1].Pos { 124 refAllele.Seq = append(refAllele.Seq, dna.StringToBases(vcfsChr[j].Ref)...) 125 altAllele.Seq = append(altAllele.Seq, dna.StringToBases(vcfsChr[j].Alt[0])...) 126 index = vcfsChr[j].Pos 127 } else { 128 lastMatch = currMatch 129 i = j - 1 130 break 131 } 132 } 133 } else if vcf.Ins(vcfsChr[i]) { 134 //currMatch.Seq = append(currMatch.Seq, dna.StringToBases(vcfsChr[i].Ref)...) 135 insertion := &Node{Id: uint32(len(genome.Nodes)), Seq: dna.StringToBases(vcfsChr[i].Alt[0]), Prev: nil, Next: nil} 136 insertion = AddNode(genome, insertion) 137 AddEdge(currMatch, insertion, 1) 138 index = vcfsChr[i].Pos - 1 139 } else if vcf.Del(vcfsChr[i]) { 140 //TODO: does not deal with ends of large deletions well, might contain extra sequence towards the end. 141 //currMatch.Seq = append(currMatch.Seq, dna.StringToBases(vcfsChr[i].Alt)...) 142 deletion := &Node{Id: uint32(len(genome.Nodes)), Seq: dna.StringToBases(vcfsChr[i].Ref), Prev: nil, Next: nil} 143 deletion = AddNode(genome, deletion) 144 AddEdge(currMatch, deletion, 1) 145 if strings.Contains(vcfsChr[i].Id, "pbsv") { 146 index = numbers.Min(vcfsChr[i].Pos+len(deletion.Seq)-1, vcfsChr[i+1].Pos-1) 147 } else { 148 index = vcfsChr[i].Pos + len(deletion.Seq) 149 } 150 } else if isINV(vcfsChr[i]) { 151 currMatch.Seq = append(currMatch.Seq, dna.StringToBases(vcfsChr[i].Ref)...) 152 inversion := mkInversionNode(genome, vcfsChr[i], chr) 153 inversion = AddNode(genome, inversion) 154 AddEdge(currMatch, inversion, 1) 155 index = getSvEnd(vcfsChr[i]) 156 } else if isCNV(vcfsChr[i]) || isDup(vcfsChr[i]) { 157 currMatch.Seq = append(currMatch.Seq, dna.StringToBases(vcfsChr[i].Ref)...) 158 copyVariance := &Node{Id: uint32(len(genome.Nodes)), Seq: chr.Seq[vcfsChr[i].Pos:getSvEnd(vcfsChr[i])], Prev: nil, Next: nil} 159 copyVariance = AddNode(genome, copyVariance) 160 AddEdge(currMatch, copyVariance, 1) 161 index = getSvEnd(vcfsChr[i]) 162 } else if isHaplotypeBlock(vcfsChr[i]) { 163 refAllele = &Node{Id: uint32(len(genome.Nodes)), Seq: dna.StringToBases(vcfsChr[i].Ref), Prev: nil, Next: nil} 164 refAllele = AddNode(genome, refAllele) 165 AddEdge(currMatch, refAllele, 1) 166 altAllele = &Node{Id: uint32(len(genome.Nodes)), Seq: dna.StringToBases(vcfsChr[i].Alt[0]), Prev: nil, Next: nil} 167 altAllele = AddNode(genome, altAllele) 168 AddEdge(currMatch, altAllele, 1) 169 index = numbers.Min(vcfsChr[i].Pos+len(refAllele.Seq)-1, vcfsChr[i+1].Pos-1) 170 currMatch = refAllele 171 } 172 lastMatch = currMatch 173 } 174 } 175 } 176 //Case: last node 177 lastNode := &Node{Id: uint32(len(genome.Nodes)), Seq: chr.Seq[index:], Prev: make([]Edge, 0, len(prev)), Next: nil} 178 lastNode = AddNode(genome, lastNode) 179 for edge = 0; edge < len(lastMatch.Next); edge++ { 180 AddEdge(lastMatch.Next[edge].Dest, lastNode, 1) 181 } 182 if vcf.Snp(vcfsChr[len(vcfsChr)-2]) || isHaplotypeBlock(vcfsChr[len(vcfsChr)-2]) { 183 AddEdge(altAllele, lastNode, 1) 184 } 185 AddEdge(lastMatch, lastNode, 1) 186 SetEvenWeights(lastMatch) 187 return genome 188 } 189 190 /* 191 func chrSplitByNs(chr fasta.Fasta) []fasta.Fasta { 192 unGapped := bed.UngappedRegionsFromFa(chr) 193 var answer []fasta.Fasta = make([]fasta.Fasta, len(unGapped)) 194 for i := 0; i < len(unGapped); i++ { 195 answer[i] = fasta.Fasta{Name: fmt.Sprintf("%s_%d_%d", unGapped[i].Chrom, unGapped[i].ChromStart, unGapped[i].ChromEnd), Seq: chr.Seq[unGapped[i].ChromStart:unGapped[i].ChromEnd]} 196 } 197 return answer 198 } 199 */ 200 201 /* 202 func FaSplitByNs(fa []fasta.Fasta) []fasta.Fasta { 203 var answer []fasta.Fasta 204 for i := 0; i < len(fa); i++ { 205 answer = append(answer, chrSplitByNs(fa[i])...) 206 } 207 return answer 208 } 209 */ 210 211 // TODO move these vcf helper functions to vcf 212 // new nodes are treated as insertion. 213 func isINV(v vcf.Vcf) bool { 214 var truth bool = false 215 data := strings.Split(v.Info, ";") 216 if strings.Compare(v.Alt[0], "<INV>") == 0 || strings.Compare(data[0], "SVTYPE=INV") == 0 { 217 truth = true 218 } 219 return truth 220 } 221 222 func isDup(v vcf.Vcf) bool { 223 var truth bool = false 224 if strings.Contains(v.Info, "SVTYPE=DUP") { 225 truth = true 226 } 227 return truth 228 } 229 230 func isCNV(v vcf.Vcf) bool { 231 var truth bool = false 232 if strings.Contains(v.Info, "SVTYPE=CNV") { 233 truth = true 234 } 235 return truth 236 } 237 238 func getSvEnd(v vcf.Vcf) int { 239 if !strings.Contains(v.Info, "END=") { 240 log.Fatalf("Error: Vcf might not be from PBSV...") 241 } else { 242 words := strings.Split(v.Info, ";") 243 for i := 0; i < len(words); i++ { 244 if strings.Contains(words[i], "END=") { 245 text := strings.Split(words[i], "END=") 246 return parse.StringToInt(text[1]) 247 } 248 } 249 } 250 return 0 251 } 252 253 func mkInversionNode(genome *GenomeGraph, v vcf.Vcf, chr fasta.Fasta) *Node { 254 invSeq := make([]dna.Base, 0) 255 invSeq = append(invSeq, chr.Seq[v.Pos:getSvEnd(v)]...) 256 dna.ReverseComplement(invSeq) 257 inversion := &Node{Id: uint32(len(genome.Nodes)), Seq: invSeq, Prev: nil, Next: nil} 258 return inversion 259 } 260 261 /* 262 func createSNP(sg *GenomeGraph, snp *vcf.Vcf, chr string) (*Node, *Node) { 263 refAllele := &Node{Id: uint32(len(sg.Nodes)), Seq: dna.StringToBases(snp.Ref), Next: nil, Prev: nil} 264 altAllele := &Node{Id: uint32(len(sg.Nodes) + 1), Seq: dna.StringToBases(snp.Alt[0]), Next: nil, Prev: nil} 265 AddNode(sg, refAllele) 266 AddNode(sg, altAllele) 267 return refAllele, altAllele 268 } 269 */ 270 271 /* 272 func NodeSplitByNs(sg *GenomeGraph, currMatch *Node, chr *fasta.Fasta, index int64, end int64) *Node { 273 var inRegion bool = false 274 var start int64 = 0 275 for ; index < end; index++ { 276 if dna.DefineBase(chr.Seq[start]) && inRegion == false { 277 inRegion = false 278 currMatch.Seq = chr.Seq[start:index] 279 start = index 280 } else if dna.DefineBase(chr.Seq[start]) && inRegion == true { 281 newMatch := &Node{Id: uint32(len(sg.Nodes))} 282 AddNode(sg, newMatch) 283 AddEdge(currMatch, newMatch, 1) 284 inRegion = true 285 currMatch = newMatch 286 } 287 } 288 return currMatch 289 } 290 */ 291 292 /* 293 type noNsBed struct { 294 Start int32 295 End int32 296 } 297 */ 298 299 /* 300 func findUngap(fa *fasta.Fasta, index int64, end int64) []*noNsBed { 301 var answer []*noNsBed 302 var inRegion bool = false 303 var startIndex int32 = 0 304 for ; index < end; index++ { 305 if dna.DefineBase(fa.Seq[index]) && inRegion == false { 306 inRegion = true 307 startIndex = int32(index) 308 } else if !(dna.DefineBase(fa.Seq[index])) && inRegion == true { 309 answer = append(answer, &noNsBed{Start: startIndex, End: int32(index)}) 310 inRegion = false 311 } 312 313 } 314 if inRegion == true { 315 answer = append(answer, &noNsBed{Start: int32(startIndex), End: int32(index)}) 316 } 317 return answer 318 } 319 */ 320 321 /* 322 func createINS(sg *GenomeGraph, v *vcf.Vcf, chr string) *Node { 323 curr := Node{Id: uint32(len(sg.Nodes)), Seq: dna.StringToBases(v.Alt[0])} 324 AddNode(sg, &curr) 325 return &curr 326 } 327 */ 328 329 func isHaplotypeBlock(v vcf.Vcf) bool { 330 if strings.Contains(v.Info, "SVTYPE=SNP;INS") || strings.Contains(v.Info, "SVTYPE=SNP;DEL") || strings.Contains(v.Info, "SVTYPE=HAP") { 331 return true 332 } else { 333 return false 334 } 335 } 336 337 /* 338 func createDEL(sg *GenomeGraph, v *vcf.Vcf, chr string) *Node { 339 curr := Node{Id: uint32(len(sg.Nodes)), Seq: dna.StringToBases(v.Ref[1:len(v.Ref)])} 340 AddNode(sg, &curr) 341 return &curr 342 } 343 */