github.com/vertgenlab/gonomics@v1.0.0/cmd/gsw/gsw.go (about)

     1  // Command Group: "Genome Graph Tools"
     2  // Command Usage: "Genome Graph Creation, Alignment, and Manipulation"
     3  
     4  // Graph-Smith-Waterman: align single or paired end fastqs
     5  package main
     6  
     7  import (
     8  	"flag"
     9  	"fmt"
    10  	"os"
    11  	"strings"
    12  
    13  	"github.com/vertgenlab/gonomics/align"
    14  	"github.com/vertgenlab/gonomics/chromInfo"
    15  	"github.com/vertgenlab/gonomics/genomeGraph"
    16  	"github.com/vertgenlab/gonomics/sam"
    17  )
    18  
    19  type GswSettings struct {
    20  	Cmd      *flag.FlagSet
    21  	Index    int
    22  	StepSize int
    23  	Cpus     int
    24  	Matrix   string
    25  	Liftover string
    26  	Help     string
    27  	Out      string
    28  }
    29  
    30  func alignUsage() {
    31  	fmt.Printf(
    32  		"  align\t\tGraph-Smith-Waterman: align single or paired end fastqs\n")
    33  }
    34  
    35  func extendedAlignUsage() {
    36  	fmt.Print(
    37  		"Usage:\n" +
    38  			"  gsw align [options] ref.gg R1.fastq.gz R2.fastq.gz\n\n" +
    39  			"Required:\n" +
    40  			"  ref[.gg/.fa]\t\tReference genome: accepts both fasta and graph formats\n\n" +
    41  			"Options:\n" +
    42  			"  -i, --index\t\tKmer length for index hash look up  (default: 32)\n" +
    43  			"  -s, --step\t\tOffset sliding window step size for hash set up (default 32)\n" +
    44  			"  -t, --threads\t\tNumber of CPUs for Goroutine concurrency (default: 4)\n" +
    45  			"  -p, --project\t\tConvert alignment coordinate to linear reference in sam format\n" +
    46  			"  -m, --matrix\t\tScores used to align matches and mismatches (default: humanChimp)\n\n")
    47  }
    48  
    49  func initArgsGsw() *GswSettings {
    50  	gsw := &GswSettings{Cmd: flag.NewFlagSet("align", flag.ExitOnError)}
    51  	gsw.Cmd.Usage = extendedAlignUsage
    52  
    53  	gsw.Cmd.IntVar(&gsw.Index, "index", 32, "``hash look up length")
    54  	gsw.Cmd.IntVar(&gsw.Index, "i", 32, "``hash look up length")
    55  
    56  	gsw.Cmd.IntVar(&gsw.StepSize, "window", 32, "offset position of sliding window of hash")
    57  	gsw.Cmd.IntVar(&gsw.StepSize, "w", 32, "offset position of sliding window of hash")
    58  
    59  	gsw.Cmd.IntVar(&gsw.Cpus, "threads", 4, "Number of CPUs for goroutines")
    60  	gsw.Cmd.IntVar(&gsw.Cpus, "t", 4, "Number of CPUs for goroutines")
    61  
    62  	gsw.Cmd.StringVar(&gsw.Matrix, "matrix", "humanChimp", "Scoring matrix for alignment")
    63  	gsw.Cmd.StringVar(&gsw.Matrix, "m", "humanChimp", "Scoring matrix for alignment")
    64  
    65  	gsw.Cmd.StringVar(&gsw.Liftover, "liftover", "", "liftover to linear reference sam file")
    66  	gsw.Cmd.StringVar(&gsw.Liftover, "l", "", "liftover to linear reference sam file")
    67  
    68  	gsw.Cmd.StringVar(&gsw.Help, "help", "", "display usage")
    69  	gsw.Cmd.StringVar(&gsw.Help, "h", "", "display usage")
    70  
    71  	gsw.Cmd.StringVar(&gsw.Out, "out", "/dev/stdout", "Output filename, [.gg/.vcf/.sam]")
    72  	gsw.Cmd.StringVar(&gsw.Out, "o", "/dev/stdout", "Output filename, [.gg/.vcf/.sam]")
    73  
    74  	return gsw
    75  }
    76  
    77  func RunAlignExe() {
    78  	gsw := initArgsGsw()
    79  	gsw.Cmd.Parse(os.Args[2:])
    80  	if len(gsw.Cmd.Args()) == 0 {
    81  		gsw.Cmd.Usage()
    82  	} else {
    83  		graphSmithWaterman(gsw.Index, gsw.StepSize, gsw.Cpus, gsw.Matrix, gsw.Out, gsw.Liftover, gsw.Cmd.Args())
    84  	}
    85  }
    86  
    87  // TODO: add check for too many args or an implementation for multiple fastq pairs.
    88  func graphSmithWaterman(seedNum int, stepSize int, cpus int, score string, out string, liftover string, args []string) {
    89  	//should be at most 3 args to add to the input, reference, readOne and/or readTwo
    90  	var genomeGraph *genomeGraph.GenomeGraph = genomeGraph.Read(args[0])
    91  	var readOne string
    92  	var readTwo string
    93  
    94  	if len(args) > 1 {
    95  		readOne = args[1]
    96  	}
    97  	if len(args) == 3 {
    98  		readTwo = args[2]
    99  	}
   100  	switch true {
   101  	case len(args) == 2 && !strings.HasSuffix(liftover, ".sizes"):
   102  		GswToGiraf(genomeGraph, readOne, out, cpus, seedNum, stepSize, selectScoreMatrix(score))
   103  	case len(args) == 3 && !strings.HasSuffix(liftover, ".sizes"):
   104  		GswToGirafPair(genomeGraph, readOne, readTwo, out, cpus, seedNum, stepSize, selectScoreMatrix(score))
   105  	case strings.HasSuffix(liftover, ".sizes"):
   106  		chrSize := chromInfo.ReadToSlice(liftover)
   107  		header := sam.GenerateHeader(chrSize, nil, sam.Unsorted, sam.None)
   108  		if len(args) == 2 {
   109  			GswToSam(genomeGraph, readOne, out, cpus, seedNum, stepSize, selectScoreMatrix(score), header)
   110  		}
   111  		if len(args) == 3 {
   112  			GswToSamPair(genomeGraph, readOne, readTwo, out, cpus, seedNum, stepSize, selectScoreMatrix(score), header)
   113  		}
   114  	default:
   115  		extendedAlignUsage()
   116  		errorMessage()
   117  	}
   118  }
   119  
   120  func selectScoreMatrix(score string) [][]int64 {
   121  	var scoreMatrix [][]int64
   122  	switch {
   123  	case strings.Contains(score, "humanChimp"):
   124  		scoreMatrix = align.HumanChimpTwoScoreMatrix
   125  	case strings.Contains(score, "hoxD55"):
   126  		scoreMatrix = align.HoxD55ScoreMatrix
   127  	case strings.Contains(score, "mouseRat"):
   128  		scoreMatrix = align.MouseRatScoreMatrix
   129  	case strings.Contains(score, "general"):
   130  		scoreMatrix = align.DefaultScoreMatrix
   131  	default:
   132  		errorMessage()
   133  	}
   134  	return scoreMatrix
   135  }