github.com/vertgenlab/gonomics@v1.0.0/genomeGraph/graphTools.go (about)

     1  package genomeGraph
     2  
     3  import (
     4  	"github.com/vertgenlab/gonomics/dna"
     5  	"github.com/vertgenlab/gonomics/fasta"
     6  	"github.com/vertgenlab/gonomics/numbers"
     7  	"github.com/vertgenlab/gonomics/numbers/parse"
     8  	"github.com/vertgenlab/gonomics/vcf"
     9  	"log"
    10  	"strings"
    11  )
    12  
    13  func VariantGraph(ref <-chan fasta.Fasta, vcfMap map[string][]vcf.Vcf) *GenomeGraph {
    14  	gg := EmptyGraph()
    15  	var filterVcf []vcf.Vcf = make([]vcf.Vcf, 0)
    16  	for val := range ref {
    17  		chr := val // not sure this is necessary, but I want to make sure the function does not break
    18  		// if the fasta being a pointer was important.
    19  		filterVcf = vcfMap[chr.Name]
    20  		if len(filterVcf) != 0 {
    21  			vcf.Sort(filterVcf)
    22  			gg = vChrGraph(gg, chr, filterVcf)
    23  		} else {
    24  			// Given the input is a vcf containing large structural variance
    25  			// It is possible for a chromosome to contain no call variants (ex. chrM).
    26  			// In such a case, we add the entire chromosome as one node.
    27  			chrNode := &Node{Id: uint32(len(gg.Nodes)), Seq: chr.Seq, Prev: nil, Next: nil}
    28  			AddNode(gg, chrNode)
    29  		}
    30  	}
    31  	gg = SortGraph(gg)
    32  	return gg
    33  }
    34  
    35  /*
    36  func SplitGraphChr(reference []fasta.Fasta, vcfs []*vcf.Vcf) map[string]*GenomeGraph {
    37  	gg := make(map[string]*GenomeGraph)
    38  	vcfSplit := vcf.VcfSplit(vcfs, reference)
    39  	if len(vcfSplit) != len(reference) {
    40  		log.Fatal("Slice of vcfs do not equal reference length")
    41  	} else {
    42  		for i := 0; i < len(reference); i++ {
    43  			gg[reference[i].Name] = vChrGraph(EmptyGraph(), reference[i], vcfSplit[i])
    44  		}
    45  	}
    46  	return gg
    47  }
    48  */
    49  
    50  func vChrGraph(genome *GenomeGraph, chr fasta.Fasta, vcfsChr []vcf.Vcf) *GenomeGraph {
    51  	vcfsChr = append(vcfsChr, vcf.Vcf{Chr: chr.Name, Pos: len(chr.Seq)})
    52  	//log.Printf("Found %d variants on %s", len(vcfsChr), chr.Name)
    53  	fasta.ToUpper(chr)
    54  	var currMatch *Node = &Node{}
    55  	var lastMatch *Node = &Node{}
    56  	var refAllele, altAllele *Node = &Node{}, &Node{}
    57  	var prev []*Node = nil
    58  	var i, j, edge int
    59  	var index int = 0
    60  	for i = 0; i < len(vcfsChr)-1; i++ {
    61  		if strings.Compare(chr.Name, vcfsChr[i].Chr) != 0 {
    62  			log.Fatalf("Error: chromosome names do not match...\n")
    63  		}
    64  		if vcfsChr[i].Pos-index > 0 {
    65  			currMatch = &Node{Id: uint32(len(genome.Nodes)), Seq: chr.Seq[index : vcfsChr[i].Pos-1], Prev: nil, Next: make([]Edge, 0, 2)}
    66  			if len(currMatch.Seq) == 0 {
    67  				currMatch = lastMatch
    68  				//assuming we already created the ref allele, we only need to create
    69  				//the alt alleles this iteration
    70  				if vcf.Snp(vcfsChr[i]) {
    71  					altAllele = &Node{Id: uint32(len(genome.Nodes)), Seq: dna.StringToBases(vcfsChr[i].Alt[0]), Prev: nil, Next: nil}
    72  					altAllele = AddNode(genome, altAllele)
    73  					AddEdge(currMatch, altAllele, 0.5)
    74  				} else if vcf.Ins(vcfsChr[i]) {
    75  					insertion := &Node{Id: uint32(len(genome.Nodes)), Seq: dna.StringToBases(vcfsChr[i].Alt[0])[1:], Prev: nil, Next: nil}
    76  					insertion = AddNode(genome, insertion)
    77  					AddEdge(currMatch, insertion, 1)
    78  					index = vcfsChr[i].Pos - 1
    79  				} else if vcf.Del(vcfsChr[i]) {
    80  					deletion := &Node{Id: uint32(len(genome.Nodes)), Seq: dna.StringToBases(vcfsChr[i].Ref)[1:], Prev: nil, Next: nil}
    81  					deletion = AddNode(genome, deletion)
    82  					AddEdge(currMatch, deletion, 1)
    83  					if strings.Contains(vcfsChr[i].Id, "pbsv") {
    84  						index = numbers.Min(vcfsChr[i].Pos+len(deletion.Seq)-1, vcfsChr[i+1].Pos-1)
    85  					} else {
    86  						index = vcfsChr[i].Pos + len(deletion.Seq)
    87  					}
    88  				} else if strings.Contains(vcfsChr[i].Info, "SVTYPE=SNP;INS") || strings.Contains(vcfsChr[i].Info, "SVTYPE=SNP;DEL") || strings.Contains(vcfsChr[i].Info, "SVTYPE=HAP") {
    89  					altAllele := &Node{Id: uint32(len(genome.Nodes)), Seq: dna.StringToBases(vcfsChr[i].Alt[0]), Prev: nil, Next: nil}
    90  					altAllele = AddNode(genome, altAllele)
    91  					AddEdge(currMatch, altAllele, 1)
    92  					index = vcfsChr[i].Pos + len(refAllele.Seq) - 1
    93  				}
    94  				lastMatch = currMatch
    95  			} else if len(currMatch.Seq) > 0 {
    96  				currMatch = AddNode(genome, currMatch)
    97  				if lastMatch != nil {
    98  					if len(lastMatch.Next) > 0 {
    99  						for edge = 0; edge < len(lastMatch.Next); edge++ {
   100  							AddEdge(lastMatch.Next[edge].Dest, currMatch, 1)
   101  						}
   102  					}
   103  					if i > 0 {
   104  						if vcf.Snp(vcfsChr[i-1]) || isHaplotypeBlock(vcfsChr[i-1]) {
   105  							AddEdge(altAllele, currMatch, 1)
   106  						}
   107  					}
   108  					AddEdge(lastMatch, currMatch, 1)
   109  					SetEvenWeights(lastMatch)
   110  				}
   111  				if vcf.Snp(vcfsChr[i]) {
   112  					refAllele = &Node{Id: uint32(len(genome.Nodes)), Seq: dna.StringToBases(vcfsChr[i].Ref), Prev: nil, Next: nil}
   113  					refAllele = AddNode(genome, refAllele)
   114  					AddEdge(currMatch, refAllele, 0.5)
   115  
   116  					altAllele = &Node{Id: uint32(len(genome.Nodes)), Seq: dna.StringToBases(vcfsChr[i].Alt[0]), Prev: nil, Next: nil}
   117  					altAllele = AddNode(genome, altAllele)
   118  					AddEdge(currMatch, altAllele, 0.5)
   119  
   120  					currMatch = refAllele
   121  					index = vcfsChr[i].Pos
   122  					for j = i + 1; j < len(vcfsChr)-1; j++ {
   123  						if vcf.Snp(vcfsChr[j-1]) && vcf.Snp(vcfsChr[j]) && vcfsChr[j].Pos-1 == vcfsChr[j-1].Pos {
   124  							refAllele.Seq = append(refAllele.Seq, dna.StringToBases(vcfsChr[j].Ref)...)
   125  							altAllele.Seq = append(altAllele.Seq, dna.StringToBases(vcfsChr[j].Alt[0])...)
   126  							index = vcfsChr[j].Pos
   127  						} else {
   128  							lastMatch = currMatch
   129  							i = j - 1
   130  							break
   131  						}
   132  					}
   133  				} else if vcf.Ins(vcfsChr[i]) {
   134  					//currMatch.Seq = append(currMatch.Seq, dna.StringToBases(vcfsChr[i].Ref)...)
   135  					insertion := &Node{Id: uint32(len(genome.Nodes)), Seq: dna.StringToBases(vcfsChr[i].Alt[0]), Prev: nil, Next: nil}
   136  					insertion = AddNode(genome, insertion)
   137  					AddEdge(currMatch, insertion, 1)
   138  					index = vcfsChr[i].Pos - 1
   139  				} else if vcf.Del(vcfsChr[i]) {
   140  					//TODO: does not deal with ends of large deletions well, might contain extra sequence towards the end.
   141  					//currMatch.Seq = append(currMatch.Seq, dna.StringToBases(vcfsChr[i].Alt)...)
   142  					deletion := &Node{Id: uint32(len(genome.Nodes)), Seq: dna.StringToBases(vcfsChr[i].Ref), Prev: nil, Next: nil}
   143  					deletion = AddNode(genome, deletion)
   144  					AddEdge(currMatch, deletion, 1)
   145  					if strings.Contains(vcfsChr[i].Id, "pbsv") {
   146  						index = numbers.Min(vcfsChr[i].Pos+len(deletion.Seq)-1, vcfsChr[i+1].Pos-1)
   147  					} else {
   148  						index = vcfsChr[i].Pos + len(deletion.Seq)
   149  					}
   150  				} else if isINV(vcfsChr[i]) {
   151  					currMatch.Seq = append(currMatch.Seq, dna.StringToBases(vcfsChr[i].Ref)...)
   152  					inversion := mkInversionNode(genome, vcfsChr[i], chr)
   153  					inversion = AddNode(genome, inversion)
   154  					AddEdge(currMatch, inversion, 1)
   155  					index = getSvEnd(vcfsChr[i])
   156  				} else if isCNV(vcfsChr[i]) || isDup(vcfsChr[i]) {
   157  					currMatch.Seq = append(currMatch.Seq, dna.StringToBases(vcfsChr[i].Ref)...)
   158  					copyVariance := &Node{Id: uint32(len(genome.Nodes)), Seq: chr.Seq[vcfsChr[i].Pos:getSvEnd(vcfsChr[i])], Prev: nil, Next: nil}
   159  					copyVariance = AddNode(genome, copyVariance)
   160  					AddEdge(currMatch, copyVariance, 1)
   161  					index = getSvEnd(vcfsChr[i])
   162  				} else if isHaplotypeBlock(vcfsChr[i]) {
   163  					refAllele = &Node{Id: uint32(len(genome.Nodes)), Seq: dna.StringToBases(vcfsChr[i].Ref), Prev: nil, Next: nil}
   164  					refAllele = AddNode(genome, refAllele)
   165  					AddEdge(currMatch, refAllele, 1)
   166  					altAllele = &Node{Id: uint32(len(genome.Nodes)), Seq: dna.StringToBases(vcfsChr[i].Alt[0]), Prev: nil, Next: nil}
   167  					altAllele = AddNode(genome, altAllele)
   168  					AddEdge(currMatch, altAllele, 1)
   169  					index = numbers.Min(vcfsChr[i].Pos+len(refAllele.Seq)-1, vcfsChr[i+1].Pos-1)
   170  					currMatch = refAllele
   171  				}
   172  				lastMatch = currMatch
   173  			}
   174  		}
   175  	}
   176  	//Case: last node
   177  	lastNode := &Node{Id: uint32(len(genome.Nodes)), Seq: chr.Seq[index:], Prev: make([]Edge, 0, len(prev)), Next: nil}
   178  	lastNode = AddNode(genome, lastNode)
   179  	for edge = 0; edge < len(lastMatch.Next); edge++ {
   180  		AddEdge(lastMatch.Next[edge].Dest, lastNode, 1)
   181  	}
   182  	if vcf.Snp(vcfsChr[len(vcfsChr)-2]) || isHaplotypeBlock(vcfsChr[len(vcfsChr)-2]) {
   183  		AddEdge(altAllele, lastNode, 1)
   184  	}
   185  	AddEdge(lastMatch, lastNode, 1)
   186  	SetEvenWeights(lastMatch)
   187  	return genome
   188  }
   189  
   190  /*
   191  func chrSplitByNs(chr fasta.Fasta) []fasta.Fasta {
   192  	unGapped := bed.UngappedRegionsFromFa(chr)
   193  	var answer []fasta.Fasta = make([]fasta.Fasta, len(unGapped))
   194  	for i := 0; i < len(unGapped); i++ {
   195  		answer[i] = fasta.Fasta{Name: fmt.Sprintf("%s_%d_%d", unGapped[i].Chrom, unGapped[i].ChromStart, unGapped[i].ChromEnd), Seq: chr.Seq[unGapped[i].ChromStart:unGapped[i].ChromEnd]}
   196  	}
   197  	return answer
   198  }
   199  */
   200  
   201  /*
   202  func FaSplitByNs(fa []fasta.Fasta) []fasta.Fasta {
   203  	var answer []fasta.Fasta
   204  	for i := 0; i < len(fa); i++ {
   205  		answer = append(answer, chrSplitByNs(fa[i])...)
   206  	}
   207  	return answer
   208  }
   209  */
   210  
   211  // TODO move these vcf helper functions to vcf
   212  // new nodes are treated as insertion.
   213  func isINV(v vcf.Vcf) bool {
   214  	var truth bool = false
   215  	data := strings.Split(v.Info, ";")
   216  	if strings.Compare(v.Alt[0], "<INV>") == 0 || strings.Compare(data[0], "SVTYPE=INV") == 0 {
   217  		truth = true
   218  	}
   219  	return truth
   220  }
   221  
   222  func isDup(v vcf.Vcf) bool {
   223  	var truth bool = false
   224  	if strings.Contains(v.Info, "SVTYPE=DUP") {
   225  		truth = true
   226  	}
   227  	return truth
   228  }
   229  
   230  func isCNV(v vcf.Vcf) bool {
   231  	var truth bool = false
   232  	if strings.Contains(v.Info, "SVTYPE=CNV") {
   233  		truth = true
   234  	}
   235  	return truth
   236  }
   237  
   238  func getSvEnd(v vcf.Vcf) int {
   239  	if !strings.Contains(v.Info, "END=") {
   240  		log.Fatalf("Error: Vcf might not be from PBSV...")
   241  	} else {
   242  		words := strings.Split(v.Info, ";")
   243  		for i := 0; i < len(words); i++ {
   244  			if strings.Contains(words[i], "END=") {
   245  				text := strings.Split(words[i], "END=")
   246  				return parse.StringToInt(text[1])
   247  			}
   248  		}
   249  	}
   250  	return 0
   251  }
   252  
   253  func mkInversionNode(genome *GenomeGraph, v vcf.Vcf, chr fasta.Fasta) *Node {
   254  	invSeq := make([]dna.Base, 0)
   255  	invSeq = append(invSeq, chr.Seq[v.Pos:getSvEnd(v)]...)
   256  	dna.ReverseComplement(invSeq)
   257  	inversion := &Node{Id: uint32(len(genome.Nodes)), Seq: invSeq, Prev: nil, Next: nil}
   258  	return inversion
   259  }
   260  
   261  /*
   262  func createSNP(sg *GenomeGraph, snp *vcf.Vcf, chr string) (*Node, *Node) {
   263  	refAllele := &Node{Id: uint32(len(sg.Nodes)), Seq: dna.StringToBases(snp.Ref), Next: nil, Prev: nil}
   264  	altAllele := &Node{Id: uint32(len(sg.Nodes) + 1), Seq: dna.StringToBases(snp.Alt[0]), Next: nil, Prev: nil}
   265  	AddNode(sg, refAllele)
   266  	AddNode(sg, altAllele)
   267  	return refAllele, altAllele
   268  }
   269  */
   270  
   271  /*
   272  func NodeSplitByNs(sg *GenomeGraph, currMatch *Node, chr *fasta.Fasta, index int64, end int64) *Node {
   273  	var inRegion bool = false
   274  	var start int64 = 0
   275  	for ; index < end; index++ {
   276  		if dna.DefineBase(chr.Seq[start]) && inRegion == false {
   277  			inRegion = false
   278  			currMatch.Seq = chr.Seq[start:index]
   279  			start = index
   280  		} else if dna.DefineBase(chr.Seq[start]) && inRegion == true {
   281  			newMatch := &Node{Id: uint32(len(sg.Nodes))}
   282  			AddNode(sg, newMatch)
   283  			AddEdge(currMatch, newMatch, 1)
   284  			inRegion = true
   285  			currMatch = newMatch
   286  		}
   287  	}
   288  	return currMatch
   289  }
   290  */
   291  
   292  /*
   293  type noNsBed struct {
   294  	Start int32
   295  	End   int32
   296  }
   297  */
   298  
   299  /*
   300  func findUngap(fa *fasta.Fasta, index int64, end int64) []*noNsBed {
   301  	var answer []*noNsBed
   302  	var inRegion bool = false
   303  	var startIndex int32 = 0
   304  	for ; index < end; index++ {
   305  		if dna.DefineBase(fa.Seq[index]) && inRegion == false {
   306  			inRegion = true
   307  			startIndex = int32(index)
   308  		} else if !(dna.DefineBase(fa.Seq[index])) && inRegion == true {
   309  			answer = append(answer, &noNsBed{Start: startIndex, End: int32(index)})
   310  			inRegion = false
   311  		}
   312  
   313  	}
   314  	if inRegion == true {
   315  		answer = append(answer, &noNsBed{Start: int32(startIndex), End: int32(index)})
   316  	}
   317  	return answer
   318  }
   319  */
   320  
   321  /*
   322  func createINS(sg *GenomeGraph, v *vcf.Vcf, chr string) *Node {
   323  	curr := Node{Id: uint32(len(sg.Nodes)), Seq: dna.StringToBases(v.Alt[0])}
   324  	AddNode(sg, &curr)
   325  	return &curr
   326  }
   327  */
   328  
   329  func isHaplotypeBlock(v vcf.Vcf) bool {
   330  	if strings.Contains(v.Info, "SVTYPE=SNP;INS") || strings.Contains(v.Info, "SVTYPE=SNP;DEL") || strings.Contains(v.Info, "SVTYPE=HAP") {
   331  		return true
   332  	} else {
   333  		return false
   334  	}
   335  }
   336  
   337  /*
   338  func createDEL(sg *GenomeGraph, v *vcf.Vcf, chr string) *Node {
   339  	curr := Node{Id: uint32(len(sg.Nodes)), Seq: dna.StringToBases(v.Ref[1:len(v.Ref)])}
   340  	AddNode(sg, &curr)
   341  	return &curr
   342  }
   343  */