github.com/vertgenlab/gonomics@v1.0.0/cmd/DEPRECATED/primateRecon/primateRecon.go (about) 1 // Command Group: "Sequence Evolution & Reconstruction" 2 3 // WARNING: this program is now deprecated. Please use 'reconstructSeq' as its replacement. 4 // Returns maximum likelihood sequence from an HBCGO primate alignment 5 package main 6 7 import ( 8 "flag" 9 "fmt" 10 "log" 11 12 "github.com/vertgenlab/gonomics/dna" 13 "github.com/vertgenlab/gonomics/exception" 14 "github.com/vertgenlab/gonomics/expandedTree" 15 "github.com/vertgenlab/gonomics/fasta" 16 "github.com/vertgenlab/gonomics/reconstruct" 17 ) 18 19 // likelihoodsToBaseUnbiased takes the un-normalized likelihoods for A, C, G, T as well 20 // and the probability 21 // threshold for when we will call it the mle base instead of N 22 // and gives back the reconstructed base for the hca. 23 func likelihoodsToBaseUnbiased(likes []float64, probThreshold float64) dna.Base { 24 var total, bestProb float64 25 var i int 26 var answer dna.Base = dna.N 27 28 for i = range likes { 29 total += likes[i] 30 } 31 32 for i = range likes { 33 if likes[i]/total >= probThreshold && likes[i] > bestProb { 34 bestProb = likes[i] 35 answer = dna.Base(i) 36 } 37 } 38 return answer 39 } 40 41 // likelihoodsToBaseBias takes the un-normalized likelihoods for A, C, G, T as well 42 // as the index of the biased base (0 for A, 1 for C, etc), and the probability 43 // threshold for when we will call it the mle base instead of the biased base 44 // and gives back the reconstructed base for the hca/hga. 45 func likelihoodsToBaseBias(likes []float64, biasBase dna.Base, probThreshold float64, nonBiasProbThreshold float64) dna.Base { 46 var total, nonHumanTotal, bestProb float64 47 var i int 48 var answer dna.Base 49 50 if biasBase == dna.Gap { 51 answer = dna.N 52 } else { 53 answer = biasBase 54 } 55 56 for i = range likes { 57 total += likes[i] 58 if i != int(biasBase) { 59 nonHumanTotal += likes[i] 60 } 61 } 62 63 for i = range likes { 64 if likes[i]/total >= probThreshold && likes[i] > bestProb && nonHumanTotal/total >= nonBiasProbThreshold { 65 bestProb = likes[i] 66 answer = dna.Base(i) 67 } 68 } 69 return answer 70 } 71 72 func baseIsPresent(b dna.Base) bool { 73 if dna.DefineBase(b) || b == dna.N { 74 return true 75 } 76 return false 77 } 78 79 func hcaIsPresent(human, bonobo, chimp, gorilla, orangutan dna.Base) bool { 80 if baseIsPresent(human) && (baseIsPresent(bonobo) || baseIsPresent(chimp)) { 81 return true 82 } 83 if (baseIsPresent(human) || baseIsPresent(bonobo) || baseIsPresent(chimp)) && (baseIsPresent(gorilla) || baseIsPresent(orangutan)) { 84 return true 85 } 86 return false 87 } 88 89 func hgaIsPresent(human, bonobo, chimp, gorilla, orangutan dna.Base) bool { 90 if baseIsPresent(gorilla) && (baseIsPresent(human) || baseIsPresent(chimp) || baseIsPresent(bonobo)) { 91 return true 92 } 93 if baseIsPresent(orangutan) && (baseIsPresent(gorilla) || baseIsPresent(human) || baseIsPresent(chimp) || baseIsPresent(bonobo)) { 94 return true 95 } 96 return false 97 } 98 99 func reconHcaBase(root, humanNode, chimpNode, nodeToRecon *expandedTree.ETree, position int, probThreshold float64, nonBiasProbThreshold float64, humanBias bool, chimpBias bool) { 100 var likelihoods []float64 101 var nextBase dna.Base 102 reconstruct.SetState(root, position) 103 likelihoods = reconstruct.FixFc(root, nodeToRecon) 104 if humanBias { 105 nextBase = likelihoodsToBaseBias(likelihoods, humanNode.Fasta.Seq[position], probThreshold, nonBiasProbThreshold) 106 } else if chimpBias { 107 nextBase = likelihoodsToBaseBias(likelihoods, chimpNode.Fasta.Seq[position], probThreshold, nonBiasProbThreshold) 108 } else { 109 nextBase = likelihoodsToBaseUnbiased(likelihoods, probThreshold) 110 } 111 nodeToRecon.Fasta.Seq = append(nodeToRecon.Fasta.Seq, nextBase) 112 } 113 114 func reconHgaBase(root, gorillaNode, nodeToRecon *expandedTree.ETree, position int, probThreshold float64, nonBiasProbThreshold float64) { 115 var likelihoods []float64 116 var nextBase dna.Base 117 reconstruct.SetState(root, position) 118 likelihoods = reconstruct.FixFc(root, nodeToRecon) 119 nextBase = likelihoodsToBaseBias(likelihoods, gorillaNode.Fasta.Seq[position], probThreshold, nonBiasProbThreshold) 120 nodeToRecon.Fasta.Seq = append(nodeToRecon.Fasta.Seq, nextBase) 121 } 122 123 func primateReconHcaMle(inFastaFilename string, inTreeFilename string, humanBias bool, chimpBias bool, probThreshold float64, nonHumanProbThreshold float64, useGenericNames bool, outputFastaFilename string) { 124 var tree, humanNode, humanAltNode, bonoboNode, chimpNode, gorillaNode, orangutanNode, hcaNode *expandedTree.ETree 125 var err error 126 var i int 127 128 tree, err = expandedTree.ReadTree(inTreeFilename, inFastaFilename) 129 exception.FatalOnErr(err) 130 if useGenericNames { 131 humanNode = expandedTree.FindNodeName(tree, "human") 132 if humanNode == nil { 133 log.Fatalf("Didn't find human in the tree\n") 134 } 135 bonoboNode = expandedTree.FindNodeName(tree, "bonobo") 136 if bonoboNode == nil { 137 log.Fatalf("Didn't find bonobo in the tree\n") 138 } 139 chimpNode = expandedTree.FindNodeName(tree, "chimp") 140 if chimpNode == nil { 141 log.Fatalf("Didn't find chimp in the tree\n") 142 } 143 gorillaNode = expandedTree.FindNodeName(tree, "gorilla") 144 if gorillaNode == nil { 145 log.Fatalf("Didn't find gorilla in the tree\n") 146 } 147 orangutanNode = expandedTree.FindNodeName(tree, "orangutan") 148 if orangutanNode == nil { 149 log.Fatalf("Didn't find orangutan in the tree\n") 150 } 151 } else { 152 humanNode = expandedTree.FindNodeName(tree, "hg38") 153 if humanNode == nil { 154 log.Fatalf("Didn't find hg38 in the tree\n") 155 } 156 bonoboNode = expandedTree.FindNodeName(tree, "panPan2") 157 if bonoboNode == nil { 158 log.Fatalf("Didn't find panPan2 in the tree\n") 159 } 160 chimpNode = expandedTree.FindNodeName(tree, "panTro6") 161 if chimpNode == nil { 162 log.Fatalf("Didn't find panTro6 in the tree\n") 163 } 164 gorillaNode = expandedTree.FindNodeName(tree, "gorGor5") 165 if gorillaNode == nil { 166 log.Fatalf("Didn't find gorGor5 in the tree\n") 167 } 168 orangutanNode = expandedTree.FindNodeName(tree, "ponAbe3") 169 if orangutanNode == nil { 170 log.Fatalf("Didn't find ponAbe3 in the tree\n") 171 } 172 } 173 hcaNode = expandedTree.FindNodeName(tree, "hca") 174 if hcaNode == nil { 175 log.Fatalf("Didn't find hca in the tree\n") 176 } 177 178 if !(humanBias || chimpBias) { 179 humanAltNode = expandedTree.FindNodeName(tree, "hg38alt") 180 if humanAltNode == nil { 181 log.Fatalf("Didn't find hg38alt in the tree\n") 182 } 183 } 184 185 for i = range humanNode.Fasta.Seq { 186 if hcaIsPresent(humanNode.Fasta.Seq[i], bonoboNode.Fasta.Seq[i], chimpNode.Fasta.Seq[i], gorillaNode.Fasta.Seq[i], orangutanNode.Fasta.Seq[i]) { 187 reconHcaBase(tree, humanNode, chimpNode, hcaNode, i, probThreshold, nonHumanProbThreshold, humanBias, chimpBias) 188 } else { 189 hcaNode.Fasta.Seq = append(hcaNode.Fasta.Seq, dna.Gap) 190 } 191 } 192 if humanBias || chimpBias { 193 fasta.Write(outputFastaFilename, []fasta.Fasta{*humanNode.Fasta, *chimpNode.Fasta, *bonoboNode.Fasta, *gorillaNode.Fasta, *orangutanNode.Fasta, *hcaNode.Fasta}) 194 } else { 195 fasta.Write(outputFastaFilename, []fasta.Fasta{*humanNode.Fasta, *humanAltNode.Fasta, *chimpNode.Fasta, *bonoboNode.Fasta, *gorillaNode.Fasta, *orangutanNode.Fasta, *hcaNode.Fasta}) 196 } 197 } 198 199 func primateReconHgaMle(inFile string, inTreeFileName string, probThreshold float64, nonBiasProbThreshold float64, useGenericNames bool, outFile string) { 200 var tree, humanNode, bonoboNode, chimpNode, gorillaNode, orangutanNode, hgaNode *expandedTree.ETree 201 var err error 202 var i int 203 204 tree, err = expandedTree.ReadTree(inTreeFileName, inFile) 205 exception.FatalOnErr(err) 206 if useGenericNames { 207 humanNode = expandedTree.FindNodeName(tree, "human") 208 if humanNode == nil { 209 log.Fatalf("Didn't find human in the tree\n") 210 } 211 bonoboNode = expandedTree.FindNodeName(tree, "bonobo") 212 if bonoboNode == nil { 213 log.Fatalf("Didn't find bonobo in the tree\n") 214 } 215 chimpNode = expandedTree.FindNodeName(tree, "chimp") 216 if chimpNode == nil { 217 log.Fatalf("Didn't find chimp in the tree\n") 218 } 219 gorillaNode = expandedTree.FindNodeName(tree, "gorilla") 220 if gorillaNode == nil { 221 log.Fatalf("Didn't find gorilla in the tree\n") 222 } 223 orangutanNode = expandedTree.FindNodeName(tree, "orangutan") 224 if orangutanNode == nil { 225 log.Fatalf("Didn't find orangutan in the tree\n") 226 } 227 } else { 228 humanNode = expandedTree.FindNodeName(tree, "hg38") 229 if humanNode == nil { 230 log.Fatalf("Didn't find hg38 in the tree\n") 231 } 232 bonoboNode = expandedTree.FindNodeName(tree, "panPan2") 233 if bonoboNode == nil { 234 log.Fatalf("Didn't find panPan2 in the tree\n") 235 } 236 chimpNode = expandedTree.FindNodeName(tree, "panTro6") 237 if chimpNode == nil { 238 log.Fatalf("Didn't find panTro6 in the tree\n") 239 } 240 gorillaNode = expandedTree.FindNodeName(tree, "gorGor5") 241 if gorillaNode == nil { 242 log.Fatalf("Didn't find gorGor5 in the tree\n") 243 } 244 orangutanNode = expandedTree.FindNodeName(tree, "ponAbe3") 245 if orangutanNode == nil { 246 log.Fatalf("Didn't find ponAbe3 in the tree\n") 247 } 248 } 249 hgaNode = expandedTree.FindNodeName(tree, "hga") 250 if hgaNode == nil { 251 log.Fatalf("Didn't find hca in the tree\n") 252 } 253 254 for i = range humanNode.Fasta.Seq { 255 if hgaIsPresent(humanNode.Fasta.Seq[i], bonoboNode.Fasta.Seq[i], chimpNode.Fasta.Seq[i], gorillaNode.Fasta.Seq[i], orangutanNode.Fasta.Seq[i]) { 256 reconHgaBase(tree, gorillaNode, hgaNode, i, probThreshold, nonBiasProbThreshold) 257 } else { 258 hgaNode.Fasta.Seq = append(hgaNode.Fasta.Seq, dna.Gap) 259 } 260 } 261 fasta.Write(outFile, []fasta.Fasta{*humanNode.Fasta, *chimpNode.Fasta, *bonoboNode.Fasta, *gorillaNode.Fasta, *orangutanNode.Fasta, *hgaNode.Fasta}) 262 } 263 264 func primateRecon(infile string, outfile string, messyToN bool) { 265 records := fasta.Read(infile) 266 output := append(records, ParsimonyPrimateRecon(records, messyToN)) 267 fasta.Write(outfile, output) 268 } 269 270 // ParsimonyPrimateRecon returns a human-biased conservative human-chimp ancestor estimate from an input multiFa alignment in the order Human-Chimp-Bonoboo-Orangutan-Gorilla. MessyToN, when true, turns ambiguous bases to Ns in the output. 271 func ParsimonyPrimateRecon(records []fasta.Fasta, messyToN bool) fasta.Fasta { 272 answer := fasta.Fasta{Name: "Human_Chimp_Ancestor"} 273 274 var outputBase dna.Base 275 276 if len(records) != 5 { 277 log.Fatalf("Wrong number of sequences, expecting five, found %d.\n", len(records)) 278 } 279 280 //confirm alignment lengths are all the same 281 firstLength := len(records[0].Seq) 282 for i := 1; i < len(records); i++ { 283 if len(records[i].Seq) != firstLength { 284 log.Fatalf("Sequence %d is the wrong length.\n", i+1) 285 } 286 } 287 288 var Ncount int = 0 289 var allMatch int = 0 290 var humanMatchChimporBonobo int = 0 291 var humanChange int = 0 292 var gorillaVote int = 0 293 var orangutanVote int = 0 294 var messyBase int = 0 295 296 human := records[0] 297 bonobo := records[1] 298 chimp := records[2] 299 orangutan := records[3] 300 gorilla := records[4] 301 302 for j := 0; j < len(human.Seq); j++ { 303 outputBase = human.Seq[j] 304 if human.Seq[j] == dna.N { //output seq is N if human is N 305 Ncount++ 306 outputBase = human.Seq[j] 307 } else if humanInsertion(records, j) { //output seq is gap if human is insertion(see helper function) 308 outputBase = dna.Gap 309 } else if human.Seq[j] != dna.Gap && (chimp.Seq[j] == dna.Gap && bonobo.Seq[j] == dna.Gap) { //if there is sequence in humans and either gorilla or orangutan, but no sequence in chimp and bonobo, we output N, because we don't want to estimate the longer human to gorilla branch. 310 if messyToN { 311 outputBase = dna.N 312 } else { 313 outputBase = human.Seq[j] 314 } 315 } else if gorilla.Seq[j] == dna.Gap && orangutan.Seq[j] == dna.Gap { //no sequence outside the HCA (no gorilla or orangutan, ancestral state can't be determined) 316 if messyToN { 317 outputBase = dna.N 318 } else { 319 outputBase = human.Seq[j] 320 } 321 } else if human.Seq[j] == chimp.Seq[j] && human.Seq[j] == bonobo.Seq[j] { //if human matches chimp and bonobo, output is human 322 outputBase = human.Seq[j] 323 allMatch++ 324 } else if (human.Seq[j] == chimp.Seq[j] || human.Seq[j] == bonobo.Seq[j]) && human.Seq[j] != dna.Gap { //if human is a base and matches chimp or bonobo, output is human 325 outputBase = human.Seq[j] 326 humanMatchChimporBonobo++ 327 } else if (chimp.Seq[j] == bonobo.Seq[j] && (chimp.Seq[j] == gorilla.Seq[j] || chimp.Seq[j] == orangutan.Seq[j])) && (chimp.Seq[j] != dna.N && chimp.Seq[j] != dna.Gap) { //human is different from ancestor, chimp and bonobo agree with ancestor. 328 outputBase = chimp.Seq[j] 329 humanChange++ 330 } else if (human.Seq[j] == gorilla.Seq[j] || chimp.Seq[j] == gorilla.Seq[j] || bonobo.Seq[j] == gorilla.Seq[j]) && (gorilla.Seq[j] != dna.N && gorilla.Seq[j] != dna.Gap) { //more disagreement, but gorilla defines the ancestor in this case 331 outputBase = gorilla.Seq[j] 332 gorillaVote++ 333 } else if (human.Seq[j] == orangutan.Seq[j] || chimp.Seq[j] == orangutan.Seq[j] || bonobo.Seq[j] == orangutan.Seq[j] || gorilla.Seq[j] == orangutan.Seq[j]) && (orangutan.Seq[j] != dna.N && orangutan.Seq[j] != dna.Gap) { 334 outputBase = orangutan.Seq[j] 335 orangutanVote++ 336 } else { 337 if human.Seq[j] != dna.Gap && !messyToN { 338 outputBase = human.Seq[j] 339 } else { 340 outputBase = dna.N 341 } 342 messyBase++ 343 } 344 answer.Seq = append(answer.Seq, outputBase) 345 } 346 return answer 347 } 348 349 func humanInsertion(records []fasta.Fasta, j int) bool { 350 //true if the human sequence is the only one present at a position 351 human := records[0] 352 bonobo := records[1] 353 chimp := records[2] 354 orangutan := records[3] 355 gorilla := records[4] 356 357 if (human.Seq[j] != dna.Gap) && (chimp.Seq[j] == dna.Gap && bonobo.Seq[j] == dna.Gap && gorilla.Seq[j] == dna.Gap && orangutan.Seq[j] == dna.Gap) { 358 return true 359 } 360 return false 361 } 362 363 func usage() { 364 fmt.Print( 365 "primateRecon - Returns maximum likelihood sequence from an HBCGO primate alignment\n" + 366 "Usage:\n" + 367 "primateRecon input.fa output.fa\n" + 368 "options:\n") 369 flag.PrintDefaults() 370 } 371 372 func main() { 373 var expectedNumArgs int = 2 374 var messyToN *bool = flag.Bool("messyToN", false, "Sets messy bases to Ns in the output file.") 375 var mleHcaUnbiased *bool = flag.Bool("mleHcaUnbiased", false, "Estimates the human-chimpanzee ancestor. Default is an N, unless a base passes the probThreshold threshold.") 376 var mleHcaHumanBiased *bool = flag.Bool("mleHcaHumanBiased", false, "Estimates the human-chimpanzee ancestor. Default is the human base, unless the non-human bases, collectively and individually, pass nonBiasProbThreshold and probThreshold.") 377 var mleHcaChimpBiased *bool = flag.Bool("mleHcaChimpBiased", false, "Estimates the human-chimpanzee ancestor. Default is the chimp base, unless the non-chimp bases, collectively and individually, pass nonBiasPropThreshold and probThreshold.") 378 var mleHgaGorillaBiased *bool = flag.Bool("mleHgaGorillaBiased", false, "Estimates the human-gorilla ancestor. Default is the gorilla base, unless the non-gorilla bases, collectively and individually, pass nonBiasPropThreshold and probThreshold.") 379 var tree *string = flag.String("mle", "", "Filename for newick tree with branch lengths. Must have the anticipated assembly names and the hca.") 380 var probThreshold *float64 = flag.Float64("probThreshold", 0.0, "The probability that a base other than human must pass to be considered a true change in the hca.") 381 var nonBiasProbThreshold *float64 = flag.Float64("nonBiasProbThreshold", 0.0, "The summation of all bases (other than the biased species) must pass this threshold for a non-biased species base to be considered in the hca.") 382 var useGenericNames *bool = flag.Bool("useGenericNames", false, "Use generic names (human, bonobo, chimp, gorilla, orangutan) instead of assembly names (hg38, panPan2, panTro6, gorGor5, ponAbe3)") 383 flag.Usage = usage 384 log.SetFlags(log.Ldate | log.Ltime | log.Lshortfile) 385 flag.Parse() 386 387 if len(flag.Args()) != expectedNumArgs { 388 flag.Usage() 389 log.Fatalf("Error: expecting %d arguments, but got %d\n", 390 expectedNumArgs, len(flag.Args())) 391 } 392 393 inFile := flag.Arg(0) 394 outFile := flag.Arg(1) 395 396 // some error check on flags provided 397 if *mleHcaHumanBiased && *mleHcaChimpBiased { 398 log.Fatalf("Error: cannot be biased for both the human and the chimp base\n") 399 } 400 if *messyToN && (*mleHcaUnbiased || *mleHcaHumanBiased || *mleHcaChimpBiased) { 401 log.Fatalf("Error: -messyToN can not be used with mle estimates\n") 402 } 403 if (*tree == "") && (*mleHcaUnbiased || *mleHcaHumanBiased || *mleHcaChimpBiased) { 404 log.Fatalf("Error: you need to provide a tree when using an mle estimate\n") 405 } 406 if *mleHcaUnbiased && (*mleHcaHumanBiased || *mleHcaChimpBiased) { 407 log.Fatalf("Error: Can not do both a biased and unbiased mle estimate\n") 408 } 409 if (*probThreshold != 0 || *nonBiasProbThreshold != 0) && !(*mleHcaUnbiased || *mleHcaHumanBiased || *mleHcaChimpBiased || *mleHgaGorillaBiased) { 410 log.Fatalf("Error: Can not use probability threshold flags without also using an mle estimate\n") 411 } 412 if *nonBiasProbThreshold != 0 && *mleHcaUnbiased { 413 log.Fatalf("Error: Can not do a nonBiasProbThreshold when also doing an unbiased estimate\n") 414 } 415 if *mleHgaGorillaBiased && (*mleHcaUnbiased || *mleHcaHumanBiased || *mleHcaChimpBiased) { 416 log.Fatalf("Error: cannot estimate both the HCA and the HGA at once\n") 417 } 418 419 if *mleHcaUnbiased || *mleHcaHumanBiased || *mleHcaChimpBiased { 420 // at this point we know that xor of the mleFlags is true, so we only pass mleHumanBiased and mleChimpBiased 421 primateReconHcaMle(inFile, *tree, *mleHcaHumanBiased, *mleHcaChimpBiased, *probThreshold, *nonBiasProbThreshold, *useGenericNames, outFile) 422 } else if *mleHgaGorillaBiased { 423 primateReconHgaMle(inFile, *tree, *probThreshold, *nonBiasProbThreshold, *useGenericNames, outFile) 424 } else { 425 primateRecon(inFile, outFile, *messyToN) 426 } 427 }