github.com/vertgenlab/gonomics@v1.0.0/cmd/vcfFilter/vcfFilter.go (about)

     1  // Command Group: "VCF Tools"
     2  
     3  // Filter vcf records
     4  package main
     5  
     6  import (
     7  	"flag"
     8  	"fmt"
     9  	"log"
    10  	"math"
    11  	"math/rand"
    12  	"strings"
    13  
    14  	"github.com/vertgenlab/gonomics/exception"
    15  	"github.com/vertgenlab/gonomics/fileio"
    16  	"github.com/vertgenlab/gonomics/popgen"
    17  	"github.com/vertgenlab/gonomics/vcf"
    18  )
    19  
    20  // coordinate is a chrom/pos pair, referring to a specific place in the genome.
    21  type coordinate struct {
    22  	Chr string
    23  	Pos int
    24  }
    25  
    26  func getSitesSeen(filename string) map[coordinate]uint8 {
    27  	var sitesSeen map[coordinate]uint8 = make(map[coordinate]uint8, 0)
    28  	var records <-chan vcf.Vcf
    29  	records, _ = vcf.GoReadToChan(filename)
    30  	var currentCoord coordinate = coordinate{"", 0}
    31  	for v := range records {
    32  		currentCoord.Chr = v.Chr
    33  		currentCoord.Pos = v.Pos
    34  		sitesSeen[currentCoord]++
    35  	}
    36  	return sitesSeen
    37  }
    38  
    39  func rmClusteredRecords(input <-chan vcf.Vcf, output chan<- vcf.Vcf, minDist int, totalChan, removedChan chan<- int) {
    40  	var prev vcf.Vcf
    41  	var canSend bool
    42  	var total, removed int
    43  	for v := range input {
    44  		total++
    45  		if prev.Chr == "" {
    46  			prev = v
    47  			canSend = true
    48  			continue
    49  		}
    50  
    51  		if v.Pos < prev.Pos && v.Chr == prev.Chr {
    52  			log.Fatalf("ERROR: input vcf is not sorted. Offending records:\n%s\n%s", prev, v)
    53  		}
    54  
    55  		// if onto a new chrom, send previous if applicable, set v as prev, and move on to next record
    56  		if v.Chr != prev.Chr {
    57  			if canSend {
    58  				output <- prev
    59  			} else {
    60  				removed++
    61  			}
    62  			canSend = true
    63  			prev = v
    64  			continue
    65  		}
    66  
    67  		if v.Pos-prev.Pos < minDist { // too close, don't send
    68  			canSend = false
    69  			prev = v
    70  			removed++
    71  			continue
    72  		}
    73  
    74  		// if we reach this point prev is clear to send and v is ok to send in the next loop if the subsequent variant is not too close
    75  		if canSend {
    76  			output <- prev
    77  		} else {
    78  			removed++
    79  		}
    80  		prev = v
    81  		canSend = true
    82  	}
    83  
    84  	// send final record if passing
    85  	if canSend {
    86  		output <- prev
    87  	} else {
    88  		removed++
    89  	}
    90  
    91  	totalChan <- total
    92  	removedChan <- removed
    93  
    94  	close(totalChan)
    95  	close(removedChan)
    96  	close(output)
    97  }
    98  
    99  func vcfFilter(infile string, outfile string, c criteria, groupFile string, parseFormat bool, parseInfo bool, setSeed int64) (total, removed int) {
   100  	rand.Seed(setSeed)
   101  	var records <-chan vcf.Vcf
   102  	var totalChan, removedChan chan int
   103  	var header vcf.Header
   104  	var err error
   105  	var currentCoord coordinate
   106  	var sitesSeen map[coordinate]uint8 = make(map[coordinate]uint8, 0) //uint8 is the number of times this site is seen in the vcf file.
   107  
   108  	if c.biAllelicOnly {
   109  		sitesSeen = getSitesSeen(infile)
   110  	}
   111  
   112  	records, header = vcf.GoReadToChan(infile)
   113  	out := fileio.EasyCreate(outfile)
   114  	tests := getTests(c, header)
   115  
   116  	if c.minDist > 0 {
   117  		// hijack input channel so only sites passing the minDist filter are passed to the rest of the function
   118  		passedRecords := make(chan vcf.Vcf, 100)
   119  		totalChan = make(chan int, 1)   // to get accurate total count
   120  		removedChan = make(chan int, 1) // to get accurate removed count
   121  		go rmClusteredRecords(records, passedRecords, c.minDist, totalChan, removedChan)
   122  		records = passedRecords
   123  	}
   124  
   125  	var samplesToKeep []int = make([]int, 0) //this var holds all of the indices from samples (defined below as the sample list in the header) that we want to keep in the output file.
   126  
   127  	if groupFile != "" {
   128  		groups := popgen.ReadGroups(groupFile)
   129  		samples := vcf.HeaderGetSampleList(header)
   130  
   131  		for i := 0; i < len(samples); i++ {
   132  			if popgen.GroupsContains(groups, samples[i]) {
   133  				samplesToKeep = append(samplesToKeep, i)
   134  			}
   135  		}
   136  		outSamples := filterHeaderSamplesToKeep(samples, samplesToKeep)
   137  		vcf.HeaderUpdateSampleList(header, outSamples)
   138  	}
   139  	vcf.NewWriteHeader(out, header)
   140  
   141  	for v := range records {
   142  		total++
   143  		if groupFile != "" {
   144  			v.Samples = filterRecordsSamplesToKeep(v.Samples, samplesToKeep)
   145  		}
   146  
   147  		if parseFormat {
   148  			v = vcf.ParseFormat(v, header)
   149  		}
   150  
   151  		if c.biAllelicOnly {
   152  			currentCoord = coordinate{v.Chr, v.Pos}
   153  			if sitesSeen[currentCoord] < 1 {
   154  				log.Panicf("Current variant not found in sitesSeen map. Something went horribly wrong. %v.\n", v)
   155  			} else if sitesSeen[currentCoord] > 1 {
   156  				removed++
   157  				continue
   158  			}
   159  		}
   160  
   161  		if parseInfo {
   162  			v = vcf.ParseInfo(v, header)
   163  		}
   164  
   165  		if !passesTests(v, tests) {
   166  			removed++
   167  			continue
   168  		}
   169  
   170  		vcf.WriteVcf(out, v)
   171  	}
   172  
   173  	if c.minDist > 0 {
   174  		total = <-totalChan
   175  		removed += <-removedChan
   176  	}
   177  
   178  	err = out.Close()
   179  	exception.PanicOnErr(err)
   180  	return
   181  }
   182  
   183  func filterRecordsSamplesToKeep(recordSamples []vcf.Sample, samplesToKeep []int) []vcf.Sample {
   184  	var answer []vcf.Sample
   185  	for _, v := range samplesToKeep {
   186  		answer = append(answer, recordSamples[v])
   187  	}
   188  
   189  	return answer
   190  }
   191  
   192  func filterHeaderSamplesToKeep(samples []string, samplesToKeep []int) []string {
   193  	var answer []string
   194  	for _, v := range samplesToKeep {
   195  		answer = append(answer, samples[v])
   196  	}
   197  
   198  	return answer
   199  }
   200  
   201  func usage() {
   202  	fmt.Print(
   203  		"vcfFilter - Filter vcf records.\n\n" +
   204  			"Usage:\n" +
   205  			"  vcfFilter [options] input.vcf output.vcf\n\n" +
   206  			"Options:\n\n")
   207  	flag.PrintDefaults()
   208  }
   209  
   210  // criteria by which vcf records are filtered.
   211  type criteria struct {
   212  	chrom                          string
   213  	groupFile                      string
   214  	minPos                         int
   215  	maxPos                         int
   216  	minQual                        float64
   217  	ref                            string
   218  	alt                            []string
   219  	biAllelicOnly                  bool
   220  	substitutionsOnly              bool
   221  	segregatingSitesOnly           bool
   222  	removeNoAncestor               bool
   223  	onlyPolarizableAncestors       bool
   224  	weakToStrongOrStrongToWeakOnly bool
   225  	noWeakToStrongOrStrongToWeak   bool
   226  	refWeakAltStrongOnly           bool
   227  	refStrongAltWeakOnly           bool
   228  	notRefWeakAltStrong            bool
   229  	notRefStrongAltWeak            bool
   230  	id                             string
   231  	formatExp                      string
   232  	infoExp                        string
   233  	includeMissingInfo             bool
   234  	subSet                         float64
   235  	minDaf                         float64
   236  	maxDaf                         float64
   237  	minDist                        int
   238  }
   239  
   240  // testingFuncs are a set of functions that must all return true to escape filter.
   241  type testingFuncs []func(vcf.Vcf) bool
   242  
   243  // passesTests runs all testingFuncs on a vcf record and returns true if all tests pass.
   244  func passesTests(v vcf.Vcf, t testingFuncs) bool {
   245  	for i := range t {
   246  		if !t[i](v) {
   247  			return false
   248  		}
   249  	}
   250  	return true
   251  }
   252  
   253  // getTests parses the criteria struct to determine the testingFuncs.
   254  func getTests(c criteria, header vcf.Header) testingFuncs {
   255  	var answer testingFuncs
   256  
   257  	if c.formatExp != "" {
   258  		answer = append(answer, parseExpression(c.formatExp, header, true, c.includeMissingInfo)...) //raven's note: when tried to go run, got parseExpression undefined error. parseExpression is defined in cmd/vcfFilter/expression.go
   259  	}
   260  
   261  	if c.infoExp != "" {
   262  		answer = append(answer, parseExpression(c.infoExp, header, false, c.includeMissingInfo)...)
   263  	}
   264  
   265  	if c.chrom != "" {
   266  		answer = append(answer,
   267  			func(v vcf.Vcf) bool {
   268  				return v.Chr == c.chrom
   269  			})
   270  	}
   271  
   272  	if c.minPos != 0 {
   273  		answer = append(answer,
   274  			func(v vcf.Vcf) bool {
   275  				return v.Pos >= c.minPos
   276  			})
   277  	}
   278  
   279  	if c.maxPos != math.MaxInt64 {
   280  		answer = append(answer,
   281  			func(v vcf.Vcf) bool {
   282  				return v.Pos <= c.maxPos
   283  			})
   284  	}
   285  
   286  	if c.minDaf != 0 {
   287  		if c.minDaf < 0 || c.minDaf > 1 {
   288  			log.Fatalf("minDaf must be between 0 and 1.")
   289  		}
   290  		answer = append(answer,
   291  			func(v vcf.Vcf) bool {
   292  				return popgen.VcfSampleDerivedAlleleFrequency(v) > c.minDaf
   293  			})
   294  	}
   295  
   296  	if c.maxDaf != 1 {
   297  		if c.maxDaf < 0 || c.maxDaf > 1 {
   298  			log.Fatalf("maxDaf must be between 0 and 1.")
   299  		}
   300  		answer = append(answer,
   301  			func(v vcf.Vcf) bool {
   302  				return popgen.VcfSampleDerivedAlleleFrequency(v) < c.maxDaf
   303  			})
   304  	}
   305  
   306  	if c.maxDaf < c.minDaf {
   307  		log.Fatalf("maxDaf must be less than minDaf.")
   308  	}
   309  
   310  	if c.minQual != 0 {
   311  		answer = append(answer,
   312  			func(v vcf.Vcf) bool {
   313  				return v.Qual >= c.minQual
   314  			})
   315  	}
   316  
   317  	if c.ref != "" {
   318  		answer = append(answer,
   319  			func(v vcf.Vcf) bool {
   320  				return v.Ref == c.ref
   321  			})
   322  	}
   323  
   324  	if c.alt != nil {
   325  		answer = append(answer,
   326  			func(v vcf.Vcf) bool {
   327  				if len(v.Alt) != len(c.alt) {
   328  					return false
   329  				}
   330  				for i := range v.Alt {
   331  					if v.Alt[i] != c.alt[i] {
   332  						return false
   333  					}
   334  				}
   335  				return true
   336  			})
   337  	}
   338  
   339  	if c.biAllelicOnly {
   340  		answer = append(answer, vcf.IsBiallelic)
   341  	}
   342  
   343  	if c.substitutionsOnly {
   344  		answer = append(answer, vcf.IsSubstitution)
   345  	}
   346  
   347  	if c.segregatingSitesOnly {
   348  		answer = append(answer, vcf.IsSegregating)
   349  	}
   350  
   351  	if c.removeNoAncestor {
   352  		answer = append(answer, vcf.HasAncestor)
   353  	}
   354  
   355  	if c.onlyPolarizableAncestors {
   356  		answer = append(answer, vcf.IsPolarizable)
   357  	}
   358  	if c.noWeakToStrongOrStrongToWeak {
   359  		answer = append(answer, vcf.IsNotWeakToStrongOrStrongToWeak)
   360  	}
   361  	if c.weakToStrongOrStrongToWeakOnly {
   362  		answer = append(answer, vcf.IsWeakToStrongOrStrongToWeak)
   363  	}
   364  	if c.refWeakAltStrongOnly {
   365  		answer = append(answer, vcf.IsRefWeakAltStrong)
   366  	}
   367  	if c.refStrongAltWeakOnly {
   368  		answer = append(answer, vcf.IsRefStrongAltWeak)
   369  	}
   370  	if c.notRefWeakAltStrong {
   371  		answer = append(answer, vcf.IsNotRefWeakAltStrong)
   372  	}
   373  	if c.notRefStrongAltWeak {
   374  		answer = append(answer, vcf.IsNotRefStrongAltWeak)
   375  	}
   376  	if c.id != "" {
   377  		answer = append(answer,
   378  			func(v vcf.Vcf) bool {
   379  				return v.Id == c.id
   380  			})
   381  	}
   382  	if c.subSet < 1 {
   383  		answer = append(answer,
   384  			func(v vcf.Vcf) bool {
   385  				r := rand.Float64()
   386  				return r <= c.subSet
   387  			})
   388  	}
   389  	return answer
   390  }
   391  
   392  func main() {
   393  	var expectedNumArgs int = 2
   394  	var setSeed *int64 = flag.Int64("setSeed", -1, "Use a specific seed for the RNG.")
   395  	var chrom *string = flag.String("chrom", "", "Specifies the chromosome name.")
   396  	var groupFile *string = flag.String("groupFile", "", "Retains alleles from individuals in the input group file.")
   397  	var minPos *int = flag.Int("minPos", 0, "Specifies the minimum position of the variant.")
   398  	var maxPos *int = flag.Int("maxPos", math.MaxInt64, "Specifies the maximum position of the variant.")
   399  	var minQual *float64 = flag.Float64("minQual", 0, "Specifies the minimum quality score.")
   400  	var ref *string = flag.String("ref", "", "Specifies the reference field.")
   401  	var alt *string = flag.String("alt", "", "Specifies the alt field.")
   402  	var biAllelicOnly *bool = flag.Bool("biAllelicOnly", false, "Retains only biallelic variants in the output file. Not compatible with stdin.")
   403  	var substitutionsOnly *bool = flag.Bool("substitutionsOnly", false, "Retains only substitution variants in the output file (removes INDEL variants).")
   404  	var segregatingSitesOnly *bool = flag.Bool("segregatingSitesOnly", false, "Retains only variants that are segregating in at least one sample.")
   405  	var removeNoAncestor *bool = flag.Bool("removeNoAncestor", false, "Retains only variants with an ancestor allele annotated in the info column.")
   406  	var onlyPolarizableAncestors *bool = flag.Bool("onlyPolarizableAncestors", false, "Retains only variants that can be used to construct a derived allele frequency spectrum. Must have a subsitution where the ancestral allele matches either alt or ref.")
   407  	var weakToStrongOrStrongToWeakOnly *bool = flag.Bool("weakToStrongOrStrongToWeakOnly", false, "Retains only variants that are weak to strong or strong to weak mutations.")
   408  	var noWeakToStrongOrStrongToWeak *bool = flag.Bool("noWeakToStrongOrStrongToWeak", false, "Removes weak to strong variants and strong to weak variants.")
   409  	var refWeakAltStrongOnly *bool = flag.Bool("refWeakAltStrongOnly", false, "Retains only variants that have a weak Ref allele and a strong Alt allele.")
   410  	var refStrongAltWeakOnly *bool = flag.Bool("refStrongAltWeakOnly", false, "Retains only variants that have a strong Ref allele and a weak Alt allele.")
   411  	var NotRefStrongAltWeak *bool = flag.Bool("notRefStrongAltWeak", false, "Removes variants that have a strong Ref alleles AND weak Alt alleles.")
   412  	var NotRefWeakAltStrong *bool = flag.Bool("notRefWeakAltStrong", false, "Removes variants that have weak Ref allele AND a strong Alt allele.")
   413  	var id *string = flag.String("id", "", "Specifies the rsID") //raven's note: added id string
   414  	var formatExp *string = flag.String("format", "", "A logical expression (or a series of semicolon ';' delimited expressions) consisting of a tag and value present in the format field. Must be in double quotes (\"). "+
   415  		"Expression can use the operators '>' '<' '=' '!=' '<=' '>'. For example, you can filter for variants with read depth greater than 100 and mapping quality greater or equal to 20 with the expression: \"DP > 100 ; MQ > 20\". "+
   416  		"This tag is currently not supported for tags that have multiple values. When testing a vcf with multiple samples, the expression will only be tested on the first sample.")
   417  	var infoExp *string = flag.String("info", "", "Identical to the 'format' tag, but tests the info field. The values of type 'Flag' in the info field"+
   418  		"can be tested by including just the flag ID in the expression. E.g. To select all records with the flag 'GG' you would use the expression \"GG\".")
   419  	var includeMissingInfo *bool = flag.Bool("includeMissingInfo", false, "When querying the records using the \"-info\" tag, include records where the queried tags are not present.")
   420  	var subSet *float64 = flag.Float64("subSet", 1, "Proportion of variants to retain in output. Value must be between 0 and 1.")
   421  	var minDaf *float64 = flag.Float64("minDaf", 0, "Set the minimum derived allele frequency for retained variants. Ancestral allele must be defined in INFO.")
   422  	var maxDaf *float64 = flag.Float64("maxDaf", 1, "Set the maximum derived allele frequency for retained variants. Ancestral allele must be defined in INFO.")
   423  	var minDist *int = flag.Int("minDistance", 0, "Remove variants that are within minDistance of another variant. File must be sorted by position.")
   424  
   425  	flag.Usage = usage
   426  	log.SetFlags(log.Ldate | log.Ltime | log.Lshortfile)
   427  	flag.Parse()
   428  
   429  	var altSlice []string
   430  	if *alt != "" {
   431  		altSlice = strings.Split(*alt, ",")
   432  	}
   433  
   434  	c := criteria{
   435  		chrom:                          *chrom,
   436  		minPos:                         *minPos,
   437  		maxPos:                         *maxPos,
   438  		minQual:                        *minQual,
   439  		ref:                            *ref,
   440  		alt:                            altSlice,
   441  		biAllelicOnly:                  *biAllelicOnly,
   442  		substitutionsOnly:              *substitutionsOnly,
   443  		segregatingSitesOnly:           *segregatingSitesOnly,
   444  		removeNoAncestor:               *removeNoAncestor,
   445  		onlyPolarizableAncestors:       *onlyPolarizableAncestors,
   446  		weakToStrongOrStrongToWeakOnly: *weakToStrongOrStrongToWeakOnly,
   447  		noWeakToStrongOrStrongToWeak:   *noWeakToStrongOrStrongToWeak,
   448  		refWeakAltStrongOnly:           *refWeakAltStrongOnly,
   449  		refStrongAltWeakOnly:           *refStrongAltWeakOnly,
   450  		notRefStrongAltWeak:            *NotRefStrongAltWeak,
   451  		notRefWeakAltStrong:            *NotRefWeakAltStrong,
   452  		id:                             *id,
   453  		formatExp:                      *formatExp,
   454  		infoExp:                        *infoExp,
   455  		includeMissingInfo:             *includeMissingInfo,
   456  		subSet:                         *subSet,
   457  		minDaf:                         *minDaf,
   458  		maxDaf:                         *maxDaf,
   459  		minDist:                        *minDist,
   460  	}
   461  
   462  	var parseFormat, parseInfo bool
   463  	if *formatExp != "" {
   464  		parseFormat = true
   465  	}
   466  	if *infoExp != "" {
   467  		parseInfo = true
   468  	}
   469  
   470  	if len(flag.Args()) != expectedNumArgs {
   471  		flag.Usage()
   472  		log.Fatalf("Error: expecting %d arguments, but got %d\n",
   473  			expectedNumArgs, len(flag.Args()))
   474  	}
   475  
   476  	infile := flag.Arg(0)
   477  	outfile := flag.Arg(1)
   478  
   479  	total, removed := vcfFilter(infile, outfile, c, *groupFile, parseFormat, parseInfo, *setSeed)
   480  	log.Printf("Processed  %d variants\n", total)
   481  	log.Printf("Removed    %d variants\n", removed)
   482  }