github.com/vertgenlab/gonomics@v1.0.0/genomeGraph/randomReads.go (about) 1 package genomeGraph 2 3 import ( 4 "fmt" 5 "github.com/vertgenlab/gonomics/dna" 6 "github.com/vertgenlab/gonomics/dna/dnaTwoBit" 7 "github.com/vertgenlab/gonomics/fastq" 8 "github.com/vertgenlab/gonomics/numbers" 9 "github.com/vertgenlab/gonomics/numbers/parse" 10 "log" 11 ) 12 13 func RandomPairedReads(genome *GenomeGraph, readLength int, numReads int, numChanges int) []fastq.PairedEnd { 14 var answer []fastq.PairedEnd = make([]fastq.PairedEnd, numReads) 15 var seq []dna.Base 16 var path []uint32 17 var nodeIdx, start1, endPos uint32 18 var strand bool 19 var totalBases = BasesInGraph(genome) 20 var fragLen int 21 for i := 0; i < numReads; { 22 strand = numbers.RandIntInRange(0, 2) == 0 23 fragLen = numbers.RandIntInRange(300, 500) 24 nodeIdx, start1 = RandLocationFast(genome, totalBases) 25 path, endPos, seq = RandPathFwd(genome, nodeIdx, start1, fragLen) 26 27 if (len(seq) == fragLen) && (dna.CountBaseInterval(seq, dna.N, 0, readLength) == 0) { 28 curr := fastq.PairedEnd{Fwd: fastq.Fastq{}, Rev: fastq.Fastq{}} 29 curr.Fwd.Name = fmt.Sprintf("%d_%d_%d_%d_%c_R: 1", path[0], start1, path[len(path)-1], start1+uint32(readLength), parse.StrandToRune(strand)) 30 curr.Fwd.Seq = make([]dna.Base, readLength) 31 copy(curr.Fwd.Seq, seq[:readLength]) 32 33 curr.Fwd.Qual = fastq.ToQualUint8(generateDiverseFakeQual(readLength)) 34 curr.Rev.Name = fmt.Sprintf("%d_%d_%d_%d_%c_R: 2", path[0], start1+uint32(fragLen-readLength), path[len(path)-1], endPos, parse.StrandToRune(strand)) 35 curr.Rev.Seq = make([]dna.Base, readLength) 36 copy(curr.Rev.Seq, seq[uint32(fragLen-readLength):]) 37 curr.Rev.Qual = fastq.ToQualUint8(generateDiverseFakeQual(readLength)) 38 if !strand { 39 fastq.ReverseComplement(curr.Fwd) 40 } else { 41 fastq.ReverseComplement(curr.Rev) 42 } 43 mutate(curr.Fwd.Seq, numChanges) 44 mutate(curr.Rev.Seq, numChanges) 45 answer[i] = curr 46 i++ 47 } 48 } 49 return answer 50 } 51 52 func RandLocation(genome *GenomeGraph) (uint32, uint32) { 53 var totalBases int = BasesInGraph(genome) 54 return RandLocationFast(genome, totalBases) 55 } 56 57 func RandLocationFast(genome *GenomeGraph, totalBases int) (uint32, uint32) { 58 var rand int = numbers.RandIntInRange(0, totalBases) 59 for i := 0; i < len(genome.Nodes); i++ { 60 if rand < genome.Nodes[i].SeqTwoBit.Len { 61 return uint32(i), uint32(rand) 62 } else { 63 rand -= genome.Nodes[i].SeqTwoBit.Len 64 } 65 } 66 log.Fatal("Error: trouble selecting a random location in the graph\n") 67 return 0, 0 //needed for compiler, should not get here 68 } 69 70 func RandPathFwd(genome *GenomeGraph, nodeIdx uint32, pos uint32, length int) ([]uint32, uint32, []dna.Base) { 71 var answer []dna.Base = make([]dna.Base, 0, length) 72 var i int = 0 73 for i = 0; i < length && int(pos) < genome.Nodes[nodeIdx].SeqTwoBit.Len; i, pos = i+1, pos+1 { 74 answer = append(answer, dnaTwoBit.GetBase(genome.Nodes[nodeIdx].SeqTwoBit, uint(pos))) 75 } 76 if i == length || len(genome.Nodes[nodeIdx].Next) == 0 { 77 return []uint32{nodeIdx}, pos, answer 78 } else { 79 edgeIdx := numbers.RandIntInRange(0, len(genome.Nodes[nodeIdx].Next)) 80 return randPathFwdHelper(genome, genome.Nodes[nodeIdx].Next[edgeIdx].Dest.Id, length, answer, []uint32{nodeIdx}) 81 } 82 } 83 84 func randPathFwdHelper(genome *GenomeGraph, nodeIdx uint32, length int, progress []dna.Base, path []uint32) ([]uint32, uint32, []dna.Base) { 85 var pos uint32 = 0 86 for pos = 0; len(progress) < length && int(pos) < genome.Nodes[nodeIdx].SeqTwoBit.Len; pos = pos + 1 { 87 progress = append(progress, dnaTwoBit.GetBase(genome.Nodes[nodeIdx].SeqTwoBit, uint(pos))) 88 } 89 if len(progress) == length || len(genome.Nodes[nodeIdx].Next) == 0 { 90 return append(path, nodeIdx), pos, progress 91 } else { 92 edgeIdx := numbers.RandIntInRange(0, len(genome.Nodes[nodeIdx].Next)) 93 return randPathFwdHelper(genome, genome.Nodes[nodeIdx].Next[edgeIdx].Dest.Id, length, progress, append(path, nodeIdx)) 94 } 95 } 96 97 func RandomReads(genome *GenomeGraph, readLength int, numReads int, numChanges int) []fastq.Fastq { 98 var answer []fastq.Fastq = make([]fastq.Fastq, numReads) 99 var seq []dna.Base 100 var path []uint32 101 var nodeIdx, pos, endPos uint32 102 var strand bool 103 var totalBases = BasesInGraph(genome) 104 for i := 0; i < numReads; { 105 nodeIdx, pos = RandLocationFast(genome, totalBases) 106 path, endPos, seq = RandPathFwd(genome, nodeIdx, pos, readLength) 107 strand = numbers.RandIntInRange(0, 2) == 0 108 if (len(seq) == readLength) && (dna.CountBaseInterval(seq, dna.N, 0, readLength) == 0) { 109 curr := fastq.Fastq{} 110 curr.Name = fmt.Sprintf("%d_%d_%d_%d_%c", path[0], pos+1, path[len(path)-1], endPos+1, parse.StrandToRune(strand)) 111 curr.Seq = seq 112 curr.Qual = fastq.ToQualUint8(generateDiverseFakeQual(readLength)) 113 if !strand { 114 dna.ReverseComplement(curr.Seq) 115 } 116 mutate(curr.Seq, numChanges) 117 answer[i] = curr 118 i++ 119 } 120 } 121 return answer 122 } 123 124 func mutate(sequence []dna.Base, numChanges int) { 125 possibleBases := []dna.Base{0, 1, 2, 3} 126 for i := 0; i < numChanges; i++ { 127 sequence[numbers.RandIntInRange(0, len(sequence))] = possibleBases[numbers.RandIntInRange(0, len(possibleBases))] 128 } 129 } 130 131 func mutatePos(seq []dna.Base, pos int) { 132 possibleBases := []dna.Base{dna.A, dna.C, dna.G, dna.T} 133 newBase := possibleBases[numbers.RandIntInRange(0, len(possibleBases))] 134 if newBase == seq[pos] { 135 mutatePos(seq, pos) 136 } else { 137 seq[pos] = newBase 138 } 139 } 140 141 func generateDiverseFakeQual(length int) []rune { 142 var answer []rune = make([]rune, length) 143 //var asci = []rune{'!', '#', '$', '%', '&', '(', ')', '*', '+', '`', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '<', '=', '>', '?', '@', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J'} 144 var asci = []rune{'F', ',', 'F', ':'} 145 for i := 0; i < length; i++ { 146 answer[i] = asci[numbers.RandIntInRange(0, len(asci))] 147 } 148 return answer 149 } 150 151 func generateFakeQual(length int) []rune { 152 var answer []rune = make([]rune, length) 153 for i := 0; i < length; i++ { 154 answer[i] = 'J' 155 } 156 return answer 157 }