github.com/vertgenlab/gonomics@v1.0.0/cmd/gsw/ggTools.go (about)

     1  package main
     2  
     3  import (
     4  	"flag"
     5  	"fmt"
     6  	"log"
     7  	"os"
     8  	"strings"
     9  
    10  	"github.com/vertgenlab/gonomics/axt"
    11  	"github.com/vertgenlab/gonomics/bed"
    12  	"github.com/vertgenlab/gonomics/chain"
    13  	"github.com/vertgenlab/gonomics/fasta"
    14  	"github.com/vertgenlab/gonomics/fileio"
    15  	"github.com/vertgenlab/gonomics/genomeGraph"
    16  	"github.com/vertgenlab/gonomics/vcf"
    17  )
    18  
    19  type GgToolsSettings struct {
    20  	Cmd       *flag.FlagSet
    21  	TargetFa  string
    22  	QueryFa   string
    23  	FmtOutput string
    24  	Out       string
    25  }
    26  
    27  func ggToolsUsage() {
    28  	fmt.Printf(
    29  		"  ggtools\tGenomic utilities to create, manipulate and operate on genome graphs\n")
    30  }
    31  
    32  func ggtoolsExtend() {
    33  	fmt.Print(
    34  		"Usage:\n" +
    35  			"  gsw ggtools [options] input.file\n\n" +
    36  			"Required:\n" +
    37  			"  -t, --target\tTarget reference fasta file\n" +
    38  			"  -q, --query\tQuery fasta file required for chain file inputs\n\n" +
    39  			"Options:\n" +
    40  			"  -f, --format\t  Pick file format for output, only applies to file conversions utils\n" +
    41  			"  -o, --out\t  [.chain/.gg/.sam/.vcf/.gz]  (default: /dev/stdout)\n\n")
    42  }
    43  
    44  func initGgtoolsArgs() *GgToolsSettings {
    45  	ggT := &GgToolsSettings{Cmd: flag.NewFlagSet("ggtools", flag.ExitOnError)}
    46  	ggT.Cmd.StringVar(&ggT.FmtOutput, "format", "", "Pick file format for output, only applies to file conversions utils")
    47  	ggT.Cmd.StringVar(&ggT.TargetFa, "target", "", "Specify target reference fasta file")
    48  	ggT.Cmd.StringVar(&ggT.QueryFa, "query", "", "Specify query fasta file")
    49  	ggT.Cmd.StringVar(&ggT.Out, "out", "/dev/stdout", "Output filename, [.gg/.vcf]")
    50  
    51  	ggT.Cmd.StringVar(&ggT.TargetFa, "t", "", "Specify target reference fasta file")
    52  	ggT.Cmd.StringVar(&ggT.QueryFa, "q", "", "Specify query fasta file")
    53  	ggT.Cmd.StringVar(&ggT.FmtOutput, "f", "", "Pick file format for output, only applies to file conversions utils")
    54  	ggT.Cmd.StringVar(&ggT.Out, "o", "/dev/stdout", "Output filename, [.gg/.vcf/.sam]")
    55  	ggT.Cmd.Usage = ggtoolsExtend
    56  	return ggT
    57  }
    58  func RunGgTools() {
    59  	ggT := initGgtoolsArgs()
    60  	ggT.Cmd.Parse(os.Args[2:])
    61  	if len(ggT.Cmd.Args()) != 1 {
    62  		ggT.Cmd.Usage()
    63  	} else {
    64  		inFile := ggT.Cmd.Arg(0)
    65  		log.Printf("Reading: %s\n", inFile)
    66  		switch true {
    67  		case chain.IsChainFile(inFile):
    68  			log.Printf("Input chain detected...\n")
    69  			if strings.Compare(ggT.TargetFa, "") != 0 && strings.Compare(ggT.QueryFa, "") != 0 {
    70  				log.Printf("Converting chain to %s...\n", ggT.FmtOutput)
    71  				convertChains(inFile, ggT.TargetFa, ggT.QueryFa, ggT.FmtOutput, ggT.Out)
    72  			} else {
    73  				ggT.Cmd.Usage()
    74  				log.Fatalf("Error: Must specify both target and query fasta files...\n")
    75  			}
    76  		case vcf.IsVcfFile(inFile):
    77  			log.Printf("Input vcf detected...\n")
    78  			if strings.Compare(ggT.TargetFa, "") != 0 {
    79  				log.Printf("Converting vcf to %s...\n", ggT.FmtOutput)
    80  				genomeGraph.Write(ggT.Out, vcfToGenomeGraph(inFile, ggT.TargetFa))
    81  			} else {
    82  				ggT.Cmd.Usage()
    83  				log.Fatalf("Error: Must specify target reference fasta file...\n")
    84  			}
    85  		case axt.IsAxtFile(inFile):
    86  			log.Printf("Input axt detected...\n")
    87  			if strings.Compare(ggT.TargetFa, "") != 0 {
    88  				log.Printf("Converting axt alignment to %s...\n", ggT.FmtOutput)
    89  				convertAxt(inFile, ggT.FmtOutput, ggT.TargetFa, ggT.Out)
    90  			}
    91  		default:
    92  			// /ggT.Cmd.Usage()
    93  			errorMessage()
    94  		}
    95  		log.Printf("Finished writing %s file\n-xoxo gg\n", ggT.FmtOutput)
    96  	}
    97  }
    98  
    99  func isFasta(filename string) bool {
   100  	if strings.HasSuffix(filename, ".fasta") || strings.HasSuffix(filename, ".fa") || strings.HasSuffix(filename, ".fasta.gz") || strings.HasSuffix(filename, ".fa.gz") {
   101  		return true
   102  	} else {
   103  		//log.Fatalf("Error a fasta reference must be provided this is specific file conversion\n")
   104  		return false
   105  	}
   106  }
   107  
   108  func chainToBed(chainFmt, bedFmt string, target bool, out string) {
   109  	reader, _ := chain.GoReadToChan(chainFmt)
   110  
   111  	outFile := fileio.MustCreate(out)
   112  	defer outFile.Close()
   113  	for each := range reader {
   114  		bed.WriteBed(outFile, chain.ChainToBed(each, target))
   115  	}
   116  }
   117  
   118  /*
   119  func chainToAxt(chainFmt string, axtFmt string) {
   120  	input := fileio.EasyOpen(axtFmt)
   121  	defer input.Close()
   122  	reader, writer := make(chan *chain.Chain), make(chan *axt.Axt)
   123  	go chain.ReadToChan(input, reader)
   124  	var i int = 0
   125  	var wg sync.WaitGroup
   126  	wg.Add(1)
   127  	for each := range reader {
   128  		writer <- chain.ChainToAxt(each, i)
   129  		i++
   130  	}
   131  
   132  }*/
   133  
   134  func vcfSplitChrNoN(vcfInput string) map[string][]vcf.Vcf {
   135  	reader := fileio.EasyOpen(vcfInput)
   136  	defer reader.Close()
   137  	vcf.ReadHeader(reader)
   138  	var data vcf.Vcf
   139  	var done bool
   140  	chrMap := make(map[string][]vcf.Vcf)
   141  	for data, done = vcf.NextVcf(reader); !done; data, done = vcf.NextVcf(reader) {
   142  		if !strings.Contains(data.Ref, "N") && !strings.Contains(data.Alt[0], "N") {
   143  			chrMap[data.Chr] = append(chrMap[data.Chr], data)
   144  		}
   145  	}
   146  	return chrMap
   147  }
   148  
   149  func FaVcfChannels(ref string, vcfInput string) *genomeGraph.GenomeGraph {
   150  	vcfFilteredMap := vcfSplitChrNoN(vcfInput)
   151  	faReader := fasta.GoReadToChan(ref)
   152  	gg := genomeGraph.VariantGraph(faReader, vcfFilteredMap)
   153  	return gg
   154  }
   155  
   156  /*
   157  //axt is the select/target regions we are looking for overlaps from
   158  func findAxtBedOverlap(axtFmt string, bedFmt string, output string, invert bool) {
   159  	query := make(chan *bed.Bed)
   160  	go bed.ReadToChan(bedFmt, query)
   161  
   162  	target := mkAxtMap(axtFmt)
   163  
   164  	answer := fileio.MustCreate(output)
   165  	defer answer.Close()
   166  	for eachRegion := range query {
   167  		if overlapAxtCompareBeds(target[eachRegion.Chrom], eachRegion) {
   168  			//TODO: consider changing this so people can input n fields
   169  			bed.WriteBed(answer, eachRegion, 5)
   170  		} else {
   171  			if invert {
   172  				bed.WriteBed(answer, eachRegion, 5)
   173  			}
   174  		}
   175  	}
   176  }
   177  /*
   178  func mkAxtMap(axtFmt string) map[string][]*axt.Axt {
   179  	input := fileio.EasyOpen(axtFmt)
   180  	reader := make(chan *axt.Axt)
   181  
   182  	target := make(map[string][]*axt.Axt)
   183  
   184  	go axt.ReadToChan(input, reader)
   185  	for each := range reader {
   186  		target[each.RName] = append(target[each.RName], each)
   187  	}
   188  	return target
   189  }
   190  
   191  //check if a single bed record has overlap
   192  func overlapAxtCompareBeds(target []*axt.Axt, query *bed.Bed) bool {
   193  	for _, each := range target {
   194  		if axt.OverlapAxtBed(each, query) {
   195  			return true
   196  		}
   197  	}
   198  	return false
   199  }
   200  
   201  //check if a single bed record has overlap
   202  func overlapChainCompareBeds(target []*chain.Chain, query *bed.Bed) bool {
   203  	for _, each := range target {
   204  		//TODO: can only select regions from target at the moment, will add query option
   205  		if chain.OverlapChainBed(each, query, true) {
   206  			return true
   207  		}
   208  	}
   209  	return false
   210  }*/