github.com/vertgenlab/gonomics@v1.0.0/cmd/gsw/chainHelper.go (about)

     1  package main
     2  
     3  import (
     4  	"strings"
     5  
     6  	"github.com/vertgenlab/gonomics/axt"
     7  	"github.com/vertgenlab/gonomics/chain"
     8  	"github.com/vertgenlab/gonomics/dna"
     9  	"github.com/vertgenlab/gonomics/fasta"
    10  	"github.com/vertgenlab/gonomics/fileio"
    11  	"github.com/vertgenlab/gonomics/genomeGraph"
    12  	"github.com/vertgenlab/gonomics/vcf"
    13  )
    14  
    15  func convertChains(chainFile, targetFa, queryFa, format, output string) {
    16  	switch format {
    17  	case "axt":
    18  		axtChannel := goChainToAxt(chainFile, targetFa, queryFa)
    19  		file := fileio.EasyCreate(output)
    20  		var idx int = 0
    21  		for i := range axtChannel {
    22  			axt.WriteToFileHandle(file, i, idx)
    23  		}
    24  	case "vcf":
    25  		vcfChannel := goChainToVcf(chainFile, targetFa, queryFa)
    26  		file := fileio.EasyCreate(output)
    27  		vcf.NewWriteHeader(file, vcf.NewHeader())
    28  		for i := range vcfChannel {
    29  			vcf.WriteVcf(file, i)
    30  		}
    31  		file.Close()
    32  	case "gg":
    33  		genomeGraph.Write(output, chainToGenomeGraph(chainFile, targetFa, queryFa))
    34  	default:
    35  		ggToolsUsage()
    36  		errorMessage()
    37  	}
    38  }
    39  
    40  func goChainToAxt(chainFile, targetFa, queryFa string) <-chan axt.Axt {
    41  	target, query := fasta.Read(targetFa), fasta.Read(queryFa)
    42  	chainFa := chain.GoReadSeqChain(chainFile, target, query)
    43  	ans := make(chan axt.Axt, 2408)
    44  	go workThreadChainAxt(chainFa, ans)
    45  	return ans
    46  }
    47  
    48  func goChainToVcf(chainFile, targetFa, queryFa string) <-chan vcf.Vcf {
    49  	ans := make(chan vcf.Vcf, 2408)
    50  	axtChannel := goChainToAxt(chainFile, targetFa, queryFa)
    51  	go workThreadAxtVcf(axtChannel, ans)
    52  	return ans
    53  }
    54  
    55  func chainToGenomeGraph(chainFile, targetFa, queryFa string) *genomeGraph.GenomeGraph {
    56  	target, query := fasta.Read(targetFa), fasta.Read(queryFa)
    57  	chainFa := chain.GoReadSeqChain(chainFile, target, query)
    58  	axtChannel := make(chan axt.Axt, 2408)
    59  	go workThreadChainAxt(chainFa, axtChannel)
    60  
    61  	vcfChannel := make(chan vcf.Vcf, 2408)
    62  	go workThreadAxtVcf(axtChannel, vcfChannel)
    63  
    64  	chrVcfMap := make(map[string][]vcf.Vcf)
    65  	for i := range vcfChannel {
    66  		chrVcfMap[i.Chr] = append(chrVcfMap[i.Chr], i)
    67  	}
    68  	//set up fa channel
    69  	ref := goFaChannel(target)
    70  	//return the genome graph
    71  	return genomeGraph.VariantGraph(ref, chrVcfMap)
    72  }
    73  
    74  func workThreadChainAxt(chFa chain.SeqChain, ans chan<- axt.Axt) {
    75  	for i := range chFa.Chains {
    76  		ans <- chain.ToAxt(i, chFa.TSeq[i.TName], chFa.QSeq[i.QName])
    77  	}
    78  	close(ans)
    79  }
    80  
    81  func workThreadAxtVcf(axtChannel <-chan axt.Axt, ans chan<- vcf.Vcf) {
    82  	var j int = 0
    83  	var curr []vcf.Vcf
    84  	for i := range axtChannel {
    85  		//filter for uniq
    86  		curr = filterVcfPos(axt.ToVcf(i))
    87  		for j = 0; j < len(curr); j++ {
    88  			if !strings.Contains(curr[j].Ref, "N") && !strings.Contains(curr[j].Alt[0], "N") {
    89  				ans <- curr[j]
    90  			}
    91  		}
    92  	}
    93  	close(ans)
    94  }
    95  
    96  // FilterVcfPos will filter out records that appear as the same position more than once, keeping the first one it encounters. In addition, if records contains Ns, those records will also be filtered out.
    97  func filterVcfPos(vcfs []vcf.Vcf) []vcf.Vcf {
    98  	vcf.Sort(vcfs)
    99  	var answer []vcf.Vcf
   100  	chrVcfMap := make(map[string][]vcf.Vcf)
   101  
   102  	var ref []dna.Base
   103  	var alt [][]dna.Base
   104  	var containsN bool
   105  	for _, v := range vcfs {
   106  		chrVcfMap[v.Chr] = append(chrVcfMap[v.Chr], v)
   107  	}
   108  	var i int
   109  	var curr []vcf.Vcf
   110  	for key := range chrVcfMap {
   111  		curr = chrVcfMap[key]
   112  		encountered := make(map[int]bool)
   113  		for i = 0; i < len(curr); i++ {
   114  			if encountered[curr[i].Pos] {
   115  				//do not add
   116  			} else {
   117  				encountered[curr[i].Pos] = true
   118  				containsN = false
   119  				ref = dna.StringToBases(curr[i].Ref)
   120  				alt = vcf.GetAltBases(curr[i].Alt)
   121  				if dna.CountBaseInterval(ref, dna.N, 0, len(ref)) != 0 {
   122  					containsN = true
   123  				}
   124  				for j := 0; j < len(alt); j++ {
   125  					if dna.CountBaseInterval(alt[j], dna.N, 0, len(alt[j])) != 0 {
   126  						containsN = true
   127  					}
   128  				}
   129  				if !containsN {
   130  					answer = append(answer, curr[i])
   131  				}
   132  			}
   133  		}
   134  	}
   135  	return answer
   136  }
   137  
   138  func goFaChannel(ref []fasta.Fasta) <-chan fasta.Fasta {
   139  	//set up faChan
   140  	ans := make(chan fasta.Fasta, 100)
   141  	go faWorker(ref, ans)
   142  	return ans
   143  }
   144  
   145  func faWorker(ref []fasta.Fasta, faChan chan<- fasta.Fasta) {
   146  	for _, chr := range ref {
   147  		faChan <- chr
   148  	}
   149  	close(faChan)
   150  }