github.com/vertgenlab/gonomics@v1.0.0/cmd/DEPRECATED/primateRecon/primateRecon.go (about)

     1  // Command Group: "Sequence Evolution & Reconstruction"
     2  
     3  // WARNING: this program is now deprecated. Please use 'reconstructSeq' as its replacement.
     4  // Returns maximum likelihood sequence from an HBCGO primate alignment
     5  package main
     6  
     7  import (
     8  	"flag"
     9  	"fmt"
    10  	"log"
    11  
    12  	"github.com/vertgenlab/gonomics/dna"
    13  	"github.com/vertgenlab/gonomics/exception"
    14  	"github.com/vertgenlab/gonomics/expandedTree"
    15  	"github.com/vertgenlab/gonomics/fasta"
    16  	"github.com/vertgenlab/gonomics/reconstruct"
    17  )
    18  
    19  // likelihoodsToBaseUnbiased takes the un-normalized likelihoods for A, C, G, T as well
    20  // and the probability
    21  // threshold for when we will call it the mle base instead of N
    22  // and gives back the reconstructed base for the hca.
    23  func likelihoodsToBaseUnbiased(likes []float64, probThreshold float64) dna.Base {
    24  	var total, bestProb float64
    25  	var i int
    26  	var answer dna.Base = dna.N
    27  
    28  	for i = range likes {
    29  		total += likes[i]
    30  	}
    31  
    32  	for i = range likes {
    33  		if likes[i]/total >= probThreshold && likes[i] > bestProb {
    34  			bestProb = likes[i]
    35  			answer = dna.Base(i)
    36  		}
    37  	}
    38  	return answer
    39  }
    40  
    41  // likelihoodsToBaseBias takes the un-normalized likelihoods for A, C, G, T as well
    42  // as the index of the biased base (0 for A, 1 for C, etc), and the probability
    43  // threshold for when we will call it the mle base instead of the biased base
    44  // and gives back the reconstructed base for the hca/hga.
    45  func likelihoodsToBaseBias(likes []float64, biasBase dna.Base, probThreshold float64, nonBiasProbThreshold float64) dna.Base {
    46  	var total, nonHumanTotal, bestProb float64
    47  	var i int
    48  	var answer dna.Base
    49  
    50  	if biasBase == dna.Gap {
    51  		answer = dna.N
    52  	} else {
    53  		answer = biasBase
    54  	}
    55  
    56  	for i = range likes {
    57  		total += likes[i]
    58  		if i != int(biasBase) {
    59  			nonHumanTotal += likes[i]
    60  		}
    61  	}
    62  
    63  	for i = range likes {
    64  		if likes[i]/total >= probThreshold && likes[i] > bestProb && nonHumanTotal/total >= nonBiasProbThreshold {
    65  			bestProb = likes[i]
    66  			answer = dna.Base(i)
    67  		}
    68  	}
    69  	return answer
    70  }
    71  
    72  func baseIsPresent(b dna.Base) bool {
    73  	if dna.DefineBase(b) || b == dna.N {
    74  		return true
    75  	}
    76  	return false
    77  }
    78  
    79  func hcaIsPresent(human, bonobo, chimp, gorilla, orangutan dna.Base) bool {
    80  	if baseIsPresent(human) && (baseIsPresent(bonobo) || baseIsPresent(chimp)) {
    81  		return true
    82  	}
    83  	if (baseIsPresent(human) || baseIsPresent(bonobo) || baseIsPresent(chimp)) && (baseIsPresent(gorilla) || baseIsPresent(orangutan)) {
    84  		return true
    85  	}
    86  	return false
    87  }
    88  
    89  func hgaIsPresent(human, bonobo, chimp, gorilla, orangutan dna.Base) bool {
    90  	if baseIsPresent(gorilla) && (baseIsPresent(human) || baseIsPresent(chimp) || baseIsPresent(bonobo)) {
    91  		return true
    92  	}
    93  	if baseIsPresent(orangutan) && (baseIsPresent(gorilla) || baseIsPresent(human) || baseIsPresent(chimp) || baseIsPresent(bonobo)) {
    94  		return true
    95  	}
    96  	return false
    97  }
    98  
    99  func reconHcaBase(root, humanNode, chimpNode, nodeToRecon *expandedTree.ETree, position int, probThreshold float64, nonBiasProbThreshold float64, humanBias bool, chimpBias bool) {
   100  	var likelihoods []float64
   101  	var nextBase dna.Base
   102  	reconstruct.SetState(root, position)
   103  	likelihoods = reconstruct.FixFc(root, nodeToRecon)
   104  	if humanBias {
   105  		nextBase = likelihoodsToBaseBias(likelihoods, humanNode.Fasta.Seq[position], probThreshold, nonBiasProbThreshold)
   106  	} else if chimpBias {
   107  		nextBase = likelihoodsToBaseBias(likelihoods, chimpNode.Fasta.Seq[position], probThreshold, nonBiasProbThreshold)
   108  	} else {
   109  		nextBase = likelihoodsToBaseUnbiased(likelihoods, probThreshold)
   110  	}
   111  	nodeToRecon.Fasta.Seq = append(nodeToRecon.Fasta.Seq, nextBase)
   112  }
   113  
   114  func reconHgaBase(root, gorillaNode, nodeToRecon *expandedTree.ETree, position int, probThreshold float64, nonBiasProbThreshold float64) {
   115  	var likelihoods []float64
   116  	var nextBase dna.Base
   117  	reconstruct.SetState(root, position)
   118  	likelihoods = reconstruct.FixFc(root, nodeToRecon)
   119  	nextBase = likelihoodsToBaseBias(likelihoods, gorillaNode.Fasta.Seq[position], probThreshold, nonBiasProbThreshold)
   120  	nodeToRecon.Fasta.Seq = append(nodeToRecon.Fasta.Seq, nextBase)
   121  }
   122  
   123  func primateReconHcaMle(inFastaFilename string, inTreeFilename string, humanBias bool, chimpBias bool, probThreshold float64, nonHumanProbThreshold float64, useGenericNames bool, outputFastaFilename string) {
   124  	var tree, humanNode, humanAltNode, bonoboNode, chimpNode, gorillaNode, orangutanNode, hcaNode *expandedTree.ETree
   125  	var err error
   126  	var i int
   127  
   128  	tree, err = expandedTree.ReadTree(inTreeFilename, inFastaFilename)
   129  	exception.FatalOnErr(err)
   130  	if useGenericNames {
   131  		humanNode = expandedTree.FindNodeName(tree, "human")
   132  		if humanNode == nil {
   133  			log.Fatalf("Didn't find human in the tree\n")
   134  		}
   135  		bonoboNode = expandedTree.FindNodeName(tree, "bonobo")
   136  		if bonoboNode == nil {
   137  			log.Fatalf("Didn't find bonobo in the tree\n")
   138  		}
   139  		chimpNode = expandedTree.FindNodeName(tree, "chimp")
   140  		if chimpNode == nil {
   141  			log.Fatalf("Didn't find chimp in the tree\n")
   142  		}
   143  		gorillaNode = expandedTree.FindNodeName(tree, "gorilla")
   144  		if gorillaNode == nil {
   145  			log.Fatalf("Didn't find gorilla in the tree\n")
   146  		}
   147  		orangutanNode = expandedTree.FindNodeName(tree, "orangutan")
   148  		if orangutanNode == nil {
   149  			log.Fatalf("Didn't find orangutan in the tree\n")
   150  		}
   151  	} else {
   152  		humanNode = expandedTree.FindNodeName(tree, "hg38")
   153  		if humanNode == nil {
   154  			log.Fatalf("Didn't find hg38 in the tree\n")
   155  		}
   156  		bonoboNode = expandedTree.FindNodeName(tree, "panPan2")
   157  		if bonoboNode == nil {
   158  			log.Fatalf("Didn't find panPan2 in the tree\n")
   159  		}
   160  		chimpNode = expandedTree.FindNodeName(tree, "panTro6")
   161  		if chimpNode == nil {
   162  			log.Fatalf("Didn't find panTro6 in the tree\n")
   163  		}
   164  		gorillaNode = expandedTree.FindNodeName(tree, "gorGor5")
   165  		if gorillaNode == nil {
   166  			log.Fatalf("Didn't find gorGor5 in the tree\n")
   167  		}
   168  		orangutanNode = expandedTree.FindNodeName(tree, "ponAbe3")
   169  		if orangutanNode == nil {
   170  			log.Fatalf("Didn't find ponAbe3 in the tree\n")
   171  		}
   172  	}
   173  	hcaNode = expandedTree.FindNodeName(tree, "hca")
   174  	if hcaNode == nil {
   175  		log.Fatalf("Didn't find hca in the tree\n")
   176  	}
   177  
   178  	if !(humanBias || chimpBias) {
   179  		humanAltNode = expandedTree.FindNodeName(tree, "hg38alt")
   180  		if humanAltNode == nil {
   181  			log.Fatalf("Didn't find hg38alt in the tree\n")
   182  		}
   183  	}
   184  
   185  	for i = range humanNode.Fasta.Seq {
   186  		if hcaIsPresent(humanNode.Fasta.Seq[i], bonoboNode.Fasta.Seq[i], chimpNode.Fasta.Seq[i], gorillaNode.Fasta.Seq[i], orangutanNode.Fasta.Seq[i]) {
   187  			reconHcaBase(tree, humanNode, chimpNode, hcaNode, i, probThreshold, nonHumanProbThreshold, humanBias, chimpBias)
   188  		} else {
   189  			hcaNode.Fasta.Seq = append(hcaNode.Fasta.Seq, dna.Gap)
   190  		}
   191  	}
   192  	if humanBias || chimpBias {
   193  		fasta.Write(outputFastaFilename, []fasta.Fasta{*humanNode.Fasta, *chimpNode.Fasta, *bonoboNode.Fasta, *gorillaNode.Fasta, *orangutanNode.Fasta, *hcaNode.Fasta})
   194  	} else {
   195  		fasta.Write(outputFastaFilename, []fasta.Fasta{*humanNode.Fasta, *humanAltNode.Fasta, *chimpNode.Fasta, *bonoboNode.Fasta, *gorillaNode.Fasta, *orangutanNode.Fasta, *hcaNode.Fasta})
   196  	}
   197  }
   198  
   199  func primateReconHgaMle(inFile string, inTreeFileName string, probThreshold float64, nonBiasProbThreshold float64, useGenericNames bool, outFile string) {
   200  	var tree, humanNode, bonoboNode, chimpNode, gorillaNode, orangutanNode, hgaNode *expandedTree.ETree
   201  	var err error
   202  	var i int
   203  
   204  	tree, err = expandedTree.ReadTree(inTreeFileName, inFile)
   205  	exception.FatalOnErr(err)
   206  	if useGenericNames {
   207  		humanNode = expandedTree.FindNodeName(tree, "human")
   208  		if humanNode == nil {
   209  			log.Fatalf("Didn't find human in the tree\n")
   210  		}
   211  		bonoboNode = expandedTree.FindNodeName(tree, "bonobo")
   212  		if bonoboNode == nil {
   213  			log.Fatalf("Didn't find bonobo in the tree\n")
   214  		}
   215  		chimpNode = expandedTree.FindNodeName(tree, "chimp")
   216  		if chimpNode == nil {
   217  			log.Fatalf("Didn't find chimp in the tree\n")
   218  		}
   219  		gorillaNode = expandedTree.FindNodeName(tree, "gorilla")
   220  		if gorillaNode == nil {
   221  			log.Fatalf("Didn't find gorilla in the tree\n")
   222  		}
   223  		orangutanNode = expandedTree.FindNodeName(tree, "orangutan")
   224  		if orangutanNode == nil {
   225  			log.Fatalf("Didn't find orangutan in the tree\n")
   226  		}
   227  	} else {
   228  		humanNode = expandedTree.FindNodeName(tree, "hg38")
   229  		if humanNode == nil {
   230  			log.Fatalf("Didn't find hg38 in the tree\n")
   231  		}
   232  		bonoboNode = expandedTree.FindNodeName(tree, "panPan2")
   233  		if bonoboNode == nil {
   234  			log.Fatalf("Didn't find panPan2 in the tree\n")
   235  		}
   236  		chimpNode = expandedTree.FindNodeName(tree, "panTro6")
   237  		if chimpNode == nil {
   238  			log.Fatalf("Didn't find panTro6 in the tree\n")
   239  		}
   240  		gorillaNode = expandedTree.FindNodeName(tree, "gorGor5")
   241  		if gorillaNode == nil {
   242  			log.Fatalf("Didn't find gorGor5 in the tree\n")
   243  		}
   244  		orangutanNode = expandedTree.FindNodeName(tree, "ponAbe3")
   245  		if orangutanNode == nil {
   246  			log.Fatalf("Didn't find ponAbe3 in the tree\n")
   247  		}
   248  	}
   249  	hgaNode = expandedTree.FindNodeName(tree, "hga")
   250  	if hgaNode == nil {
   251  		log.Fatalf("Didn't find hca in the tree\n")
   252  	}
   253  
   254  	for i = range humanNode.Fasta.Seq {
   255  		if hgaIsPresent(humanNode.Fasta.Seq[i], bonoboNode.Fasta.Seq[i], chimpNode.Fasta.Seq[i], gorillaNode.Fasta.Seq[i], orangutanNode.Fasta.Seq[i]) {
   256  			reconHgaBase(tree, gorillaNode, hgaNode, i, probThreshold, nonBiasProbThreshold)
   257  		} else {
   258  			hgaNode.Fasta.Seq = append(hgaNode.Fasta.Seq, dna.Gap)
   259  		}
   260  	}
   261  	fasta.Write(outFile, []fasta.Fasta{*humanNode.Fasta, *chimpNode.Fasta, *bonoboNode.Fasta, *gorillaNode.Fasta, *orangutanNode.Fasta, *hgaNode.Fasta})
   262  }
   263  
   264  func primateRecon(infile string, outfile string, messyToN bool) {
   265  	records := fasta.Read(infile)
   266  	output := append(records, ParsimonyPrimateRecon(records, messyToN))
   267  	fasta.Write(outfile, output)
   268  }
   269  
   270  // ParsimonyPrimateRecon returns a human-biased conservative human-chimp ancestor estimate from an input multiFa alignment in the order Human-Chimp-Bonoboo-Orangutan-Gorilla. MessyToN, when true, turns ambiguous bases to Ns in the output.
   271  func ParsimonyPrimateRecon(records []fasta.Fasta, messyToN bool) fasta.Fasta {
   272  	answer := fasta.Fasta{Name: "Human_Chimp_Ancestor"}
   273  
   274  	var outputBase dna.Base
   275  
   276  	if len(records) != 5 {
   277  		log.Fatalf("Wrong number of sequences, expecting five, found %d.\n", len(records))
   278  	}
   279  
   280  	//confirm alignment lengths are all the same
   281  	firstLength := len(records[0].Seq)
   282  	for i := 1; i < len(records); i++ {
   283  		if len(records[i].Seq) != firstLength {
   284  			log.Fatalf("Sequence %d is the wrong length.\n", i+1)
   285  		}
   286  	}
   287  
   288  	var Ncount int = 0
   289  	var allMatch int = 0
   290  	var humanMatchChimporBonobo int = 0
   291  	var humanChange int = 0
   292  	var gorillaVote int = 0
   293  	var orangutanVote int = 0
   294  	var messyBase int = 0
   295  
   296  	human := records[0]
   297  	bonobo := records[1]
   298  	chimp := records[2]
   299  	orangutan := records[3]
   300  	gorilla := records[4]
   301  
   302  	for j := 0; j < len(human.Seq); j++ {
   303  		outputBase = human.Seq[j]
   304  		if human.Seq[j] == dna.N { //output seq is N if human is N
   305  			Ncount++
   306  			outputBase = human.Seq[j]
   307  		} else if humanInsertion(records, j) { //output seq is gap if human is insertion(see helper function)
   308  			outputBase = dna.Gap
   309  		} else if human.Seq[j] != dna.Gap && (chimp.Seq[j] == dna.Gap && bonobo.Seq[j] == dna.Gap) { //if there is sequence in humans and either gorilla or orangutan, but no sequence in chimp and bonobo, we output N, because we don't want to estimate the longer human to gorilla branch.
   310  			if messyToN {
   311  				outputBase = dna.N
   312  			} else {
   313  				outputBase = human.Seq[j]
   314  			}
   315  		} else if gorilla.Seq[j] == dna.Gap && orangutan.Seq[j] == dna.Gap { //no sequence outside the HCA (no gorilla or orangutan, ancestral state can't be determined)
   316  			if messyToN {
   317  				outputBase = dna.N
   318  			} else {
   319  				outputBase = human.Seq[j]
   320  			}
   321  		} else if human.Seq[j] == chimp.Seq[j] && human.Seq[j] == bonobo.Seq[j] { //if human matches chimp and bonobo, output is human
   322  			outputBase = human.Seq[j]
   323  			allMatch++
   324  		} else if (human.Seq[j] == chimp.Seq[j] || human.Seq[j] == bonobo.Seq[j]) && human.Seq[j] != dna.Gap { //if human is a base and matches chimp or bonobo, output is human
   325  			outputBase = human.Seq[j]
   326  			humanMatchChimporBonobo++
   327  		} else if (chimp.Seq[j] == bonobo.Seq[j] && (chimp.Seq[j] == gorilla.Seq[j] || chimp.Seq[j] == orangutan.Seq[j])) && (chimp.Seq[j] != dna.N && chimp.Seq[j] != dna.Gap) { //human is different from ancestor, chimp and bonobo agree with ancestor.
   328  			outputBase = chimp.Seq[j]
   329  			humanChange++
   330  		} else if (human.Seq[j] == gorilla.Seq[j] || chimp.Seq[j] == gorilla.Seq[j] || bonobo.Seq[j] == gorilla.Seq[j]) && (gorilla.Seq[j] != dna.N && gorilla.Seq[j] != dna.Gap) { //more disagreement, but gorilla defines the ancestor in this case
   331  			outputBase = gorilla.Seq[j]
   332  			gorillaVote++
   333  		} else if (human.Seq[j] == orangutan.Seq[j] || chimp.Seq[j] == orangutan.Seq[j] || bonobo.Seq[j] == orangutan.Seq[j] || gorilla.Seq[j] == orangutan.Seq[j]) && (orangutan.Seq[j] != dna.N && orangutan.Seq[j] != dna.Gap) {
   334  			outputBase = orangutan.Seq[j]
   335  			orangutanVote++
   336  		} else {
   337  			if human.Seq[j] != dna.Gap && !messyToN {
   338  				outputBase = human.Seq[j]
   339  			} else {
   340  				outputBase = dna.N
   341  			}
   342  			messyBase++
   343  		}
   344  		answer.Seq = append(answer.Seq, outputBase)
   345  	}
   346  	return answer
   347  }
   348  
   349  func humanInsertion(records []fasta.Fasta, j int) bool {
   350  	//true if the human sequence is the only one present at a position
   351  	human := records[0]
   352  	bonobo := records[1]
   353  	chimp := records[2]
   354  	orangutan := records[3]
   355  	gorilla := records[4]
   356  
   357  	if (human.Seq[j] != dna.Gap) && (chimp.Seq[j] == dna.Gap && bonobo.Seq[j] == dna.Gap && gorilla.Seq[j] == dna.Gap && orangutan.Seq[j] == dna.Gap) {
   358  		return true
   359  	}
   360  	return false
   361  }
   362  
   363  func usage() {
   364  	fmt.Print(
   365  		"primateRecon - Returns maximum likelihood sequence from an HBCGO primate alignment\n" +
   366  			"Usage:\n" +
   367  			"primateRecon input.fa output.fa\n" +
   368  			"options:\n")
   369  	flag.PrintDefaults()
   370  }
   371  
   372  func main() {
   373  	var expectedNumArgs int = 2
   374  	var messyToN *bool = flag.Bool("messyToN", false, "Sets messy bases to Ns in the output file.")
   375  	var mleHcaUnbiased *bool = flag.Bool("mleHcaUnbiased", false, "Estimates the human-chimpanzee ancestor. Default is an N, unless a base passes the probThreshold threshold.")
   376  	var mleHcaHumanBiased *bool = flag.Bool("mleHcaHumanBiased", false, "Estimates the human-chimpanzee ancestor. Default is the human base, unless the non-human bases, collectively and individually, pass nonBiasProbThreshold and probThreshold.")
   377  	var mleHcaChimpBiased *bool = flag.Bool("mleHcaChimpBiased", false, "Estimates the human-chimpanzee ancestor. Default is the chimp base, unless the non-chimp bases, collectively and individually, pass nonBiasPropThreshold and probThreshold.")
   378  	var mleHgaGorillaBiased *bool = flag.Bool("mleHgaGorillaBiased", false, "Estimates the human-gorilla ancestor. Default is the gorilla base, unless the non-gorilla bases, collectively and individually, pass nonBiasPropThreshold and probThreshold.")
   379  	var tree *string = flag.String("mle", "", "Filename for newick tree with branch lengths.  Must have the anticipated assembly names and the hca.")
   380  	var probThreshold *float64 = flag.Float64("probThreshold", 0.0, "The probability that a base other than human must pass to be considered a true change in the hca.")
   381  	var nonBiasProbThreshold *float64 = flag.Float64("nonBiasProbThreshold", 0.0, "The summation of all bases (other than the biased species) must pass this threshold for a non-biased species base to be considered in the hca.")
   382  	var useGenericNames *bool = flag.Bool("useGenericNames", false, "Use generic names (human, bonobo, chimp, gorilla, orangutan) instead of assembly names (hg38, panPan2, panTro6, gorGor5, ponAbe3)")
   383  	flag.Usage = usage
   384  	log.SetFlags(log.Ldate | log.Ltime | log.Lshortfile)
   385  	flag.Parse()
   386  
   387  	if len(flag.Args()) != expectedNumArgs {
   388  		flag.Usage()
   389  		log.Fatalf("Error: expecting %d arguments, but got %d\n",
   390  			expectedNumArgs, len(flag.Args()))
   391  	}
   392  
   393  	inFile := flag.Arg(0)
   394  	outFile := flag.Arg(1)
   395  
   396  	// some error check on flags provided
   397  	if *mleHcaHumanBiased && *mleHcaChimpBiased {
   398  		log.Fatalf("Error: cannot be biased for both the human and the chimp base\n")
   399  	}
   400  	if *messyToN && (*mleHcaUnbiased || *mleHcaHumanBiased || *mleHcaChimpBiased) {
   401  		log.Fatalf("Error: -messyToN can not be used with mle estimates\n")
   402  	}
   403  	if (*tree == "") && (*mleHcaUnbiased || *mleHcaHumanBiased || *mleHcaChimpBiased) {
   404  		log.Fatalf("Error: you need to provide a tree when using an mle estimate\n")
   405  	}
   406  	if *mleHcaUnbiased && (*mleHcaHumanBiased || *mleHcaChimpBiased) {
   407  		log.Fatalf("Error: Can not do both a biased and unbiased mle estimate\n")
   408  	}
   409  	if (*probThreshold != 0 || *nonBiasProbThreshold != 0) && !(*mleHcaUnbiased || *mleHcaHumanBiased || *mleHcaChimpBiased || *mleHgaGorillaBiased) {
   410  		log.Fatalf("Error: Can not use probability threshold flags without also using an mle estimate\n")
   411  	}
   412  	if *nonBiasProbThreshold != 0 && *mleHcaUnbiased {
   413  		log.Fatalf("Error: Can not do a nonBiasProbThreshold when also doing an unbiased estimate\n")
   414  	}
   415  	if *mleHgaGorillaBiased && (*mleHcaUnbiased || *mleHcaHumanBiased || *mleHcaChimpBiased) {
   416  		log.Fatalf("Error: cannot estimate both the HCA and the HGA at once\n")
   417  	}
   418  
   419  	if *mleHcaUnbiased || *mleHcaHumanBiased || *mleHcaChimpBiased {
   420  		// at this point we know that xor of the mleFlags is true, so we only pass mleHumanBiased and mleChimpBiased
   421  		primateReconHcaMle(inFile, *tree, *mleHcaHumanBiased, *mleHcaChimpBiased, *probThreshold, *nonBiasProbThreshold, *useGenericNames, outFile)
   422  	} else if *mleHgaGorillaBiased {
   423  		primateReconHgaMle(inFile, *tree, *probThreshold, *nonBiasProbThreshold, *useGenericNames, outFile)
   424  	} else {
   425  		primateRecon(inFile, outFile, *messyToN)
   426  	}
   427  }