github.com/vertgenlab/gonomics@v1.0.0/genomeGraph/randomReads.go (about)

     1  package genomeGraph
     2  
     3  import (
     4  	"fmt"
     5  	"github.com/vertgenlab/gonomics/dna"
     6  	"github.com/vertgenlab/gonomics/dna/dnaTwoBit"
     7  	"github.com/vertgenlab/gonomics/fastq"
     8  	"github.com/vertgenlab/gonomics/numbers"
     9  	"github.com/vertgenlab/gonomics/numbers/parse"
    10  	"log"
    11  )
    12  
    13  func RandomPairedReads(genome *GenomeGraph, readLength int, numReads int, numChanges int) []fastq.PairedEnd {
    14  	var answer []fastq.PairedEnd = make([]fastq.PairedEnd, numReads)
    15  	var seq []dna.Base
    16  	var path []uint32
    17  	var nodeIdx, start1, endPos uint32
    18  	var strand bool
    19  	var totalBases = BasesInGraph(genome)
    20  	var fragLen int
    21  	for i := 0; i < numReads; {
    22  		strand = numbers.RandIntInRange(0, 2) == 0
    23  		fragLen = numbers.RandIntInRange(300, 500)
    24  		nodeIdx, start1 = RandLocationFast(genome, totalBases)
    25  		path, endPos, seq = RandPathFwd(genome, nodeIdx, start1, fragLen)
    26  
    27  		if (len(seq) == fragLen) && (dna.CountBaseInterval(seq, dna.N, 0, readLength) == 0) {
    28  			curr := fastq.PairedEnd{Fwd: fastq.Fastq{}, Rev: fastq.Fastq{}}
    29  			curr.Fwd.Name = fmt.Sprintf("%d_%d_%d_%d_%c_R: 1", path[0], start1, path[len(path)-1], start1+uint32(readLength), parse.StrandToRune(strand))
    30  			curr.Fwd.Seq = make([]dna.Base, readLength)
    31  			copy(curr.Fwd.Seq, seq[:readLength])
    32  
    33  			curr.Fwd.Qual = fastq.ToQualUint8(generateDiverseFakeQual(readLength))
    34  			curr.Rev.Name = fmt.Sprintf("%d_%d_%d_%d_%c_R: 2", path[0], start1+uint32(fragLen-readLength), path[len(path)-1], endPos, parse.StrandToRune(strand))
    35  			curr.Rev.Seq = make([]dna.Base, readLength)
    36  			copy(curr.Rev.Seq, seq[uint32(fragLen-readLength):])
    37  			curr.Rev.Qual = fastq.ToQualUint8(generateDiverseFakeQual(readLength))
    38  			if !strand {
    39  				fastq.ReverseComplement(curr.Fwd)
    40  			} else {
    41  				fastq.ReverseComplement(curr.Rev)
    42  			}
    43  			mutate(curr.Fwd.Seq, numChanges)
    44  			mutate(curr.Rev.Seq, numChanges)
    45  			answer[i] = curr
    46  			i++
    47  		}
    48  	}
    49  	return answer
    50  }
    51  
    52  func RandLocation(genome *GenomeGraph) (uint32, uint32) {
    53  	var totalBases int = BasesInGraph(genome)
    54  	return RandLocationFast(genome, totalBases)
    55  }
    56  
    57  func RandLocationFast(genome *GenomeGraph, totalBases int) (uint32, uint32) {
    58  	var rand int = numbers.RandIntInRange(0, totalBases)
    59  	for i := 0; i < len(genome.Nodes); i++ {
    60  		if rand < genome.Nodes[i].SeqTwoBit.Len {
    61  			return uint32(i), uint32(rand)
    62  		} else {
    63  			rand -= genome.Nodes[i].SeqTwoBit.Len
    64  		}
    65  	}
    66  	log.Fatal("Error: trouble selecting a random location in the graph\n")
    67  	return 0, 0 //needed for compiler, should not get here
    68  }
    69  
    70  func RandPathFwd(genome *GenomeGraph, nodeIdx uint32, pos uint32, length int) ([]uint32, uint32, []dna.Base) {
    71  	var answer []dna.Base = make([]dna.Base, 0, length)
    72  	var i int = 0
    73  	for i = 0; i < length && int(pos) < genome.Nodes[nodeIdx].SeqTwoBit.Len; i, pos = i+1, pos+1 {
    74  		answer = append(answer, dnaTwoBit.GetBase(genome.Nodes[nodeIdx].SeqTwoBit, uint(pos)))
    75  	}
    76  	if i == length || len(genome.Nodes[nodeIdx].Next) == 0 {
    77  		return []uint32{nodeIdx}, pos, answer
    78  	} else {
    79  		edgeIdx := numbers.RandIntInRange(0, len(genome.Nodes[nodeIdx].Next))
    80  		return randPathFwdHelper(genome, genome.Nodes[nodeIdx].Next[edgeIdx].Dest.Id, length, answer, []uint32{nodeIdx})
    81  	}
    82  }
    83  
    84  func randPathFwdHelper(genome *GenomeGraph, nodeIdx uint32, length int, progress []dna.Base, path []uint32) ([]uint32, uint32, []dna.Base) {
    85  	var pos uint32 = 0
    86  	for pos = 0; len(progress) < length && int(pos) < genome.Nodes[nodeIdx].SeqTwoBit.Len; pos = pos + 1 {
    87  		progress = append(progress, dnaTwoBit.GetBase(genome.Nodes[nodeIdx].SeqTwoBit, uint(pos)))
    88  	}
    89  	if len(progress) == length || len(genome.Nodes[nodeIdx].Next) == 0 {
    90  		return append(path, nodeIdx), pos, progress
    91  	} else {
    92  		edgeIdx := numbers.RandIntInRange(0, len(genome.Nodes[nodeIdx].Next))
    93  		return randPathFwdHelper(genome, genome.Nodes[nodeIdx].Next[edgeIdx].Dest.Id, length, progress, append(path, nodeIdx))
    94  	}
    95  }
    96  
    97  func RandomReads(genome *GenomeGraph, readLength int, numReads int, numChanges int) []fastq.Fastq {
    98  	var answer []fastq.Fastq = make([]fastq.Fastq, numReads)
    99  	var seq []dna.Base
   100  	var path []uint32
   101  	var nodeIdx, pos, endPos uint32
   102  	var strand bool
   103  	var totalBases = BasesInGraph(genome)
   104  	for i := 0; i < numReads; {
   105  		nodeIdx, pos = RandLocationFast(genome, totalBases)
   106  		path, endPos, seq = RandPathFwd(genome, nodeIdx, pos, readLength)
   107  		strand = numbers.RandIntInRange(0, 2) == 0
   108  		if (len(seq) == readLength) && (dna.CountBaseInterval(seq, dna.N, 0, readLength) == 0) {
   109  			curr := fastq.Fastq{}
   110  			curr.Name = fmt.Sprintf("%d_%d_%d_%d_%c", path[0], pos+1, path[len(path)-1], endPos+1, parse.StrandToRune(strand))
   111  			curr.Seq = seq
   112  			curr.Qual = fastq.ToQualUint8(generateDiverseFakeQual(readLength))
   113  			if !strand {
   114  				dna.ReverseComplement(curr.Seq)
   115  			}
   116  			mutate(curr.Seq, numChanges)
   117  			answer[i] = curr
   118  			i++
   119  		}
   120  	}
   121  	return answer
   122  }
   123  
   124  func mutate(sequence []dna.Base, numChanges int) {
   125  	possibleBases := []dna.Base{0, 1, 2, 3}
   126  	for i := 0; i < numChanges; i++ {
   127  		sequence[numbers.RandIntInRange(0, len(sequence))] = possibleBases[numbers.RandIntInRange(0, len(possibleBases))]
   128  	}
   129  }
   130  
   131  func mutatePos(seq []dna.Base, pos int) {
   132  	possibleBases := []dna.Base{dna.A, dna.C, dna.G, dna.T}
   133  	newBase := possibleBases[numbers.RandIntInRange(0, len(possibleBases))]
   134  	if newBase == seq[pos] {
   135  		mutatePos(seq, pos)
   136  	} else {
   137  		seq[pos] = newBase
   138  	}
   139  }
   140  
   141  func generateDiverseFakeQual(length int) []rune {
   142  	var answer []rune = make([]rune, length)
   143  	//var asci = []rune{'!', '#', '$', '%', '&', '(', ')', '*', '+', '`', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '<', '=', '>', '?', '@', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J'}
   144  	var asci = []rune{'F', ',', 'F', ':'}
   145  	for i := 0; i < length; i++ {
   146  		answer[i] = asci[numbers.RandIntInRange(0, len(asci))]
   147  	}
   148  	return answer
   149  }
   150  
   151  func generateFakeQual(length int) []rune {
   152  	var answer []rune = make([]rune, length)
   153  	for i := 0; i < length; i++ {
   154  		answer[i] = 'J'
   155  	}
   156  	return answer
   157  }