github.com/vertgenlab/gonomics@v1.0.0/cmd/vcfFilter/vcfFilter.go (about) 1 // Command Group: "VCF Tools" 2 3 // Filter vcf records 4 package main 5 6 import ( 7 "flag" 8 "fmt" 9 "log" 10 "math" 11 "math/rand" 12 "strings" 13 14 "github.com/vertgenlab/gonomics/exception" 15 "github.com/vertgenlab/gonomics/fileio" 16 "github.com/vertgenlab/gonomics/popgen" 17 "github.com/vertgenlab/gonomics/vcf" 18 ) 19 20 // coordinate is a chrom/pos pair, referring to a specific place in the genome. 21 type coordinate struct { 22 Chr string 23 Pos int 24 } 25 26 func getSitesSeen(filename string) map[coordinate]uint8 { 27 var sitesSeen map[coordinate]uint8 = make(map[coordinate]uint8, 0) 28 var records <-chan vcf.Vcf 29 records, _ = vcf.GoReadToChan(filename) 30 var currentCoord coordinate = coordinate{"", 0} 31 for v := range records { 32 currentCoord.Chr = v.Chr 33 currentCoord.Pos = v.Pos 34 sitesSeen[currentCoord]++ 35 } 36 return sitesSeen 37 } 38 39 func rmClusteredRecords(input <-chan vcf.Vcf, output chan<- vcf.Vcf, minDist int, totalChan, removedChan chan<- int) { 40 var prev vcf.Vcf 41 var canSend bool 42 var total, removed int 43 for v := range input { 44 total++ 45 if prev.Chr == "" { 46 prev = v 47 canSend = true 48 continue 49 } 50 51 if v.Pos < prev.Pos && v.Chr == prev.Chr { 52 log.Fatalf("ERROR: input vcf is not sorted. Offending records:\n%s\n%s", prev, v) 53 } 54 55 // if onto a new chrom, send previous if applicable, set v as prev, and move on to next record 56 if v.Chr != prev.Chr { 57 if canSend { 58 output <- prev 59 } else { 60 removed++ 61 } 62 canSend = true 63 prev = v 64 continue 65 } 66 67 if v.Pos-prev.Pos < minDist { // too close, don't send 68 canSend = false 69 prev = v 70 removed++ 71 continue 72 } 73 74 // if we reach this point prev is clear to send and v is ok to send in the next loop if the subsequent variant is not too close 75 if canSend { 76 output <- prev 77 } else { 78 removed++ 79 } 80 prev = v 81 canSend = true 82 } 83 84 // send final record if passing 85 if canSend { 86 output <- prev 87 } else { 88 removed++ 89 } 90 91 totalChan <- total 92 removedChan <- removed 93 94 close(totalChan) 95 close(removedChan) 96 close(output) 97 } 98 99 func vcfFilter(infile string, outfile string, c criteria, groupFile string, parseFormat bool, parseInfo bool, setSeed int64) (total, removed int) { 100 rand.Seed(setSeed) 101 var records <-chan vcf.Vcf 102 var totalChan, removedChan chan int 103 var header vcf.Header 104 var err error 105 var currentCoord coordinate 106 var sitesSeen map[coordinate]uint8 = make(map[coordinate]uint8, 0) //uint8 is the number of times this site is seen in the vcf file. 107 108 if c.biAllelicOnly { 109 sitesSeen = getSitesSeen(infile) 110 } 111 112 records, header = vcf.GoReadToChan(infile) 113 out := fileio.EasyCreate(outfile) 114 tests := getTests(c, header) 115 116 if c.minDist > 0 { 117 // hijack input channel so only sites passing the minDist filter are passed to the rest of the function 118 passedRecords := make(chan vcf.Vcf, 100) 119 totalChan = make(chan int, 1) // to get accurate total count 120 removedChan = make(chan int, 1) // to get accurate removed count 121 go rmClusteredRecords(records, passedRecords, c.minDist, totalChan, removedChan) 122 records = passedRecords 123 } 124 125 var samplesToKeep []int = make([]int, 0) //this var holds all of the indices from samples (defined below as the sample list in the header) that we want to keep in the output file. 126 127 if groupFile != "" { 128 groups := popgen.ReadGroups(groupFile) 129 samples := vcf.HeaderGetSampleList(header) 130 131 for i := 0; i < len(samples); i++ { 132 if popgen.GroupsContains(groups, samples[i]) { 133 samplesToKeep = append(samplesToKeep, i) 134 } 135 } 136 outSamples := filterHeaderSamplesToKeep(samples, samplesToKeep) 137 vcf.HeaderUpdateSampleList(header, outSamples) 138 } 139 vcf.NewWriteHeader(out, header) 140 141 for v := range records { 142 total++ 143 if groupFile != "" { 144 v.Samples = filterRecordsSamplesToKeep(v.Samples, samplesToKeep) 145 } 146 147 if parseFormat { 148 v = vcf.ParseFormat(v, header) 149 } 150 151 if c.biAllelicOnly { 152 currentCoord = coordinate{v.Chr, v.Pos} 153 if sitesSeen[currentCoord] < 1 { 154 log.Panicf("Current variant not found in sitesSeen map. Something went horribly wrong. %v.\n", v) 155 } else if sitesSeen[currentCoord] > 1 { 156 removed++ 157 continue 158 } 159 } 160 161 if parseInfo { 162 v = vcf.ParseInfo(v, header) 163 } 164 165 if !passesTests(v, tests) { 166 removed++ 167 continue 168 } 169 170 vcf.WriteVcf(out, v) 171 } 172 173 if c.minDist > 0 { 174 total = <-totalChan 175 removed += <-removedChan 176 } 177 178 err = out.Close() 179 exception.PanicOnErr(err) 180 return 181 } 182 183 func filterRecordsSamplesToKeep(recordSamples []vcf.Sample, samplesToKeep []int) []vcf.Sample { 184 var answer []vcf.Sample 185 for _, v := range samplesToKeep { 186 answer = append(answer, recordSamples[v]) 187 } 188 189 return answer 190 } 191 192 func filterHeaderSamplesToKeep(samples []string, samplesToKeep []int) []string { 193 var answer []string 194 for _, v := range samplesToKeep { 195 answer = append(answer, samples[v]) 196 } 197 198 return answer 199 } 200 201 func usage() { 202 fmt.Print( 203 "vcfFilter - Filter vcf records.\n\n" + 204 "Usage:\n" + 205 " vcfFilter [options] input.vcf output.vcf\n\n" + 206 "Options:\n\n") 207 flag.PrintDefaults() 208 } 209 210 // criteria by which vcf records are filtered. 211 type criteria struct { 212 chrom string 213 groupFile string 214 minPos int 215 maxPos int 216 minQual float64 217 ref string 218 alt []string 219 biAllelicOnly bool 220 substitutionsOnly bool 221 segregatingSitesOnly bool 222 removeNoAncestor bool 223 onlyPolarizableAncestors bool 224 weakToStrongOrStrongToWeakOnly bool 225 noWeakToStrongOrStrongToWeak bool 226 refWeakAltStrongOnly bool 227 refStrongAltWeakOnly bool 228 notRefWeakAltStrong bool 229 notRefStrongAltWeak bool 230 id string 231 formatExp string 232 infoExp string 233 includeMissingInfo bool 234 subSet float64 235 minDaf float64 236 maxDaf float64 237 minDist int 238 } 239 240 // testingFuncs are a set of functions that must all return true to escape filter. 241 type testingFuncs []func(vcf.Vcf) bool 242 243 // passesTests runs all testingFuncs on a vcf record and returns true if all tests pass. 244 func passesTests(v vcf.Vcf, t testingFuncs) bool { 245 for i := range t { 246 if !t[i](v) { 247 return false 248 } 249 } 250 return true 251 } 252 253 // getTests parses the criteria struct to determine the testingFuncs. 254 func getTests(c criteria, header vcf.Header) testingFuncs { 255 var answer testingFuncs 256 257 if c.formatExp != "" { 258 answer = append(answer, parseExpression(c.formatExp, header, true, c.includeMissingInfo)...) //raven's note: when tried to go run, got parseExpression undefined error. parseExpression is defined in cmd/vcfFilter/expression.go 259 } 260 261 if c.infoExp != "" { 262 answer = append(answer, parseExpression(c.infoExp, header, false, c.includeMissingInfo)...) 263 } 264 265 if c.chrom != "" { 266 answer = append(answer, 267 func(v vcf.Vcf) bool { 268 return v.Chr == c.chrom 269 }) 270 } 271 272 if c.minPos != 0 { 273 answer = append(answer, 274 func(v vcf.Vcf) bool { 275 return v.Pos >= c.minPos 276 }) 277 } 278 279 if c.maxPos != math.MaxInt64 { 280 answer = append(answer, 281 func(v vcf.Vcf) bool { 282 return v.Pos <= c.maxPos 283 }) 284 } 285 286 if c.minDaf != 0 { 287 if c.minDaf < 0 || c.minDaf > 1 { 288 log.Fatalf("minDaf must be between 0 and 1.") 289 } 290 answer = append(answer, 291 func(v vcf.Vcf) bool { 292 return popgen.VcfSampleDerivedAlleleFrequency(v) > c.minDaf 293 }) 294 } 295 296 if c.maxDaf != 1 { 297 if c.maxDaf < 0 || c.maxDaf > 1 { 298 log.Fatalf("maxDaf must be between 0 and 1.") 299 } 300 answer = append(answer, 301 func(v vcf.Vcf) bool { 302 return popgen.VcfSampleDerivedAlleleFrequency(v) < c.maxDaf 303 }) 304 } 305 306 if c.maxDaf < c.minDaf { 307 log.Fatalf("maxDaf must be less than minDaf.") 308 } 309 310 if c.minQual != 0 { 311 answer = append(answer, 312 func(v vcf.Vcf) bool { 313 return v.Qual >= c.minQual 314 }) 315 } 316 317 if c.ref != "" { 318 answer = append(answer, 319 func(v vcf.Vcf) bool { 320 return v.Ref == c.ref 321 }) 322 } 323 324 if c.alt != nil { 325 answer = append(answer, 326 func(v vcf.Vcf) bool { 327 if len(v.Alt) != len(c.alt) { 328 return false 329 } 330 for i := range v.Alt { 331 if v.Alt[i] != c.alt[i] { 332 return false 333 } 334 } 335 return true 336 }) 337 } 338 339 if c.biAllelicOnly { 340 answer = append(answer, vcf.IsBiallelic) 341 } 342 343 if c.substitutionsOnly { 344 answer = append(answer, vcf.IsSubstitution) 345 } 346 347 if c.segregatingSitesOnly { 348 answer = append(answer, vcf.IsSegregating) 349 } 350 351 if c.removeNoAncestor { 352 answer = append(answer, vcf.HasAncestor) 353 } 354 355 if c.onlyPolarizableAncestors { 356 answer = append(answer, vcf.IsPolarizable) 357 } 358 if c.noWeakToStrongOrStrongToWeak { 359 answer = append(answer, vcf.IsNotWeakToStrongOrStrongToWeak) 360 } 361 if c.weakToStrongOrStrongToWeakOnly { 362 answer = append(answer, vcf.IsWeakToStrongOrStrongToWeak) 363 } 364 if c.refWeakAltStrongOnly { 365 answer = append(answer, vcf.IsRefWeakAltStrong) 366 } 367 if c.refStrongAltWeakOnly { 368 answer = append(answer, vcf.IsRefStrongAltWeak) 369 } 370 if c.notRefWeakAltStrong { 371 answer = append(answer, vcf.IsNotRefWeakAltStrong) 372 } 373 if c.notRefStrongAltWeak { 374 answer = append(answer, vcf.IsNotRefStrongAltWeak) 375 } 376 if c.id != "" { 377 answer = append(answer, 378 func(v vcf.Vcf) bool { 379 return v.Id == c.id 380 }) 381 } 382 if c.subSet < 1 { 383 answer = append(answer, 384 func(v vcf.Vcf) bool { 385 r := rand.Float64() 386 return r <= c.subSet 387 }) 388 } 389 return answer 390 } 391 392 func main() { 393 var expectedNumArgs int = 2 394 var setSeed *int64 = flag.Int64("setSeed", -1, "Use a specific seed for the RNG.") 395 var chrom *string = flag.String("chrom", "", "Specifies the chromosome name.") 396 var groupFile *string = flag.String("groupFile", "", "Retains alleles from individuals in the input group file.") 397 var minPos *int = flag.Int("minPos", 0, "Specifies the minimum position of the variant.") 398 var maxPos *int = flag.Int("maxPos", math.MaxInt64, "Specifies the maximum position of the variant.") 399 var minQual *float64 = flag.Float64("minQual", 0, "Specifies the minimum quality score.") 400 var ref *string = flag.String("ref", "", "Specifies the reference field.") 401 var alt *string = flag.String("alt", "", "Specifies the alt field.") 402 var biAllelicOnly *bool = flag.Bool("biAllelicOnly", false, "Retains only biallelic variants in the output file. Not compatible with stdin.") 403 var substitutionsOnly *bool = flag.Bool("substitutionsOnly", false, "Retains only substitution variants in the output file (removes INDEL variants).") 404 var segregatingSitesOnly *bool = flag.Bool("segregatingSitesOnly", false, "Retains only variants that are segregating in at least one sample.") 405 var removeNoAncestor *bool = flag.Bool("removeNoAncestor", false, "Retains only variants with an ancestor allele annotated in the info column.") 406 var onlyPolarizableAncestors *bool = flag.Bool("onlyPolarizableAncestors", false, "Retains only variants that can be used to construct a derived allele frequency spectrum. Must have a subsitution where the ancestral allele matches either alt or ref.") 407 var weakToStrongOrStrongToWeakOnly *bool = flag.Bool("weakToStrongOrStrongToWeakOnly", false, "Retains only variants that are weak to strong or strong to weak mutations.") 408 var noWeakToStrongOrStrongToWeak *bool = flag.Bool("noWeakToStrongOrStrongToWeak", false, "Removes weak to strong variants and strong to weak variants.") 409 var refWeakAltStrongOnly *bool = flag.Bool("refWeakAltStrongOnly", false, "Retains only variants that have a weak Ref allele and a strong Alt allele.") 410 var refStrongAltWeakOnly *bool = flag.Bool("refStrongAltWeakOnly", false, "Retains only variants that have a strong Ref allele and a weak Alt allele.") 411 var NotRefStrongAltWeak *bool = flag.Bool("notRefStrongAltWeak", false, "Removes variants that have a strong Ref alleles AND weak Alt alleles.") 412 var NotRefWeakAltStrong *bool = flag.Bool("notRefWeakAltStrong", false, "Removes variants that have weak Ref allele AND a strong Alt allele.") 413 var id *string = flag.String("id", "", "Specifies the rsID") //raven's note: added id string 414 var formatExp *string = flag.String("format", "", "A logical expression (or a series of semicolon ';' delimited expressions) consisting of a tag and value present in the format field. Must be in double quotes (\"). "+ 415 "Expression can use the operators '>' '<' '=' '!=' '<=' '>'. For example, you can filter for variants with read depth greater than 100 and mapping quality greater or equal to 20 with the expression: \"DP > 100 ; MQ > 20\". "+ 416 "This tag is currently not supported for tags that have multiple values. When testing a vcf with multiple samples, the expression will only be tested on the first sample.") 417 var infoExp *string = flag.String("info", "", "Identical to the 'format' tag, but tests the info field. The values of type 'Flag' in the info field"+ 418 "can be tested by including just the flag ID in the expression. E.g. To select all records with the flag 'GG' you would use the expression \"GG\".") 419 var includeMissingInfo *bool = flag.Bool("includeMissingInfo", false, "When querying the records using the \"-info\" tag, include records where the queried tags are not present.") 420 var subSet *float64 = flag.Float64("subSet", 1, "Proportion of variants to retain in output. Value must be between 0 and 1.") 421 var minDaf *float64 = flag.Float64("minDaf", 0, "Set the minimum derived allele frequency for retained variants. Ancestral allele must be defined in INFO.") 422 var maxDaf *float64 = flag.Float64("maxDaf", 1, "Set the maximum derived allele frequency for retained variants. Ancestral allele must be defined in INFO.") 423 var minDist *int = flag.Int("minDistance", 0, "Remove variants that are within minDistance of another variant. File must be sorted by position.") 424 425 flag.Usage = usage 426 log.SetFlags(log.Ldate | log.Ltime | log.Lshortfile) 427 flag.Parse() 428 429 var altSlice []string 430 if *alt != "" { 431 altSlice = strings.Split(*alt, ",") 432 } 433 434 c := criteria{ 435 chrom: *chrom, 436 minPos: *minPos, 437 maxPos: *maxPos, 438 minQual: *minQual, 439 ref: *ref, 440 alt: altSlice, 441 biAllelicOnly: *biAllelicOnly, 442 substitutionsOnly: *substitutionsOnly, 443 segregatingSitesOnly: *segregatingSitesOnly, 444 removeNoAncestor: *removeNoAncestor, 445 onlyPolarizableAncestors: *onlyPolarizableAncestors, 446 weakToStrongOrStrongToWeakOnly: *weakToStrongOrStrongToWeakOnly, 447 noWeakToStrongOrStrongToWeak: *noWeakToStrongOrStrongToWeak, 448 refWeakAltStrongOnly: *refWeakAltStrongOnly, 449 refStrongAltWeakOnly: *refStrongAltWeakOnly, 450 notRefStrongAltWeak: *NotRefStrongAltWeak, 451 notRefWeakAltStrong: *NotRefWeakAltStrong, 452 id: *id, 453 formatExp: *formatExp, 454 infoExp: *infoExp, 455 includeMissingInfo: *includeMissingInfo, 456 subSet: *subSet, 457 minDaf: *minDaf, 458 maxDaf: *maxDaf, 459 minDist: *minDist, 460 } 461 462 var parseFormat, parseInfo bool 463 if *formatExp != "" { 464 parseFormat = true 465 } 466 if *infoExp != "" { 467 parseInfo = true 468 } 469 470 if len(flag.Args()) != expectedNumArgs { 471 flag.Usage() 472 log.Fatalf("Error: expecting %d arguments, but got %d\n", 473 expectedNumArgs, len(flag.Args())) 474 } 475 476 infile := flag.Arg(0) 477 outfile := flag.Arg(1) 478 479 total, removed := vcfFilter(infile, outfile, c, *groupFile, parseFormat, parseInfo, *setSeed) 480 log.Printf("Processed %d variants\n", total) 481 log.Printf("Removed %d variants\n", removed) 482 }