github.com/vertgenlab/gonomics@v1.0.0/genomeGraph/genomeGraph.go (about) 1 // Package genomeGraph has structs and tools for reading, writing, editing and aligning graph representations of genomes 2 package genomeGraph 3 4 import ( 5 "fmt" 6 "github.com/vertgenlab/gonomics/dna" 7 "github.com/vertgenlab/gonomics/dna/dnaTwoBit" 8 "github.com/vertgenlab/gonomics/exception" 9 "github.com/vertgenlab/gonomics/fileio" 10 "github.com/vertgenlab/gonomics/numbers" 11 "github.com/vertgenlab/gonomics/numbers/parse" 12 "io" 13 "log" 14 "strings" 15 ) 16 17 // GenomeGraph struct contains a slice of Nodes. 18 type GenomeGraph struct { 19 Nodes []Node 20 } 21 22 // Node is uniquely definded by Id and is encoded with information 23 // describing sequence order and orientation and annotated variance. 24 type Node struct { 25 Id uint32 26 ColumnId uint32 27 Seq []dna.Base // only this field or the SeqThreeBit will be kept 28 SeqTwoBit *dnaTwoBit.TwoBit // this will change to a ThreeBit or be removed 29 Prev []Edge 30 Next []Edge 31 } // used to have Name (string) and Info (Annotation) 32 33 // Edge describes the neighboring nodes and a weighted probability 34 // of the more likely path. 35 type Edge struct { 36 Dest *Node 37 Prob float32 38 } 39 40 // Annotation struct is an uint64 encoding of allele id, starting position on linear reference and variant on node 41 // a single byte will represent what allele the node came from, uint32 will be used for starting position of chromosome of the linear reference 42 // uint8 are variants are represented as follows: 0=match, 1=mismatch, 2=insert, 3=deletion, 4=hap 43 /*type Annotation struct { 44 Start uint32 45 Allele byte 46 Variant uint8 47 }*/ 48 49 // Read will process a simple graph formated text file and parse the data into graph fields. 50 func Read(filename string) *GenomeGraph { 51 simpleReader := fileio.NewByteReader(filename) 52 genome := EmptyGraph() 53 var weight float32 54 var line string 55 var words []string = make([]string, 0, 2) 56 var nodeId, homeNodeIdx, destNodeIdx uint32 57 var homeNode, destNode *Node 58 var i int 59 60 for reader, done := fileio.ReadLine(simpleReader); !done; reader, done = fileio.ReadLine(simpleReader) { 61 line = reader.String() 62 switch true { 63 case strings.HasPrefix(line, ">"): 64 nodeId = parse.StringToUint32(line[1:]) 65 AddNode(genome, &Node{Id: nodeId}) 66 case strings.Contains(line, "\t"): 67 words = strings.Split(line, "\t") 68 homeNodeIdx = parse.StringToUint32(words[0]) 69 homeNode = &genome.Nodes[homeNodeIdx] 70 if len(words) > 2 { 71 for i = 1; i < len(words); i += 2 { 72 weight = parse.StringToFloat32(words[i]) 73 destNodeIdx = parse.StringToUint32(words[i+1]) 74 destNode = &genome.Nodes[destNodeIdx] 75 AddEdge(homeNode, destNode, weight) 76 } 77 } 78 default: 79 genome.Nodes[nodeId].Seq = append(genome.Nodes[nodeId].Seq, dna.ByteSliceToDnaBases(reader.Bytes())...) 80 } 81 } 82 for i = range genome.Nodes { 83 if len(genome.Nodes[i].Seq) != 0 { 84 genome.Nodes[i].SeqTwoBit = dnaTwoBit.NewTwoBit(genome.Nodes[i].Seq) 85 } 86 } 87 return genome 88 } 89 90 // AddNode will add the values in n to the graph at the index of n.Id 91 // A pointer to the new location of the node (inside the graph) is returned. 92 func AddNode(g *GenomeGraph, n *Node) *Node { 93 const positionsToExtend = 1000 // when we need to increase the slice, do this many nodes at a time 94 if int(n.Id) < len(g.Nodes) { // if the memory for this node has already been allocated 95 // then we can overwrite it as long as it does not already exist 96 if len(g.Nodes[n.Id].Seq) != 0 { // if the node already exists because data has been written here 97 log.Panicf("Error: tried to add a node of id=%d, when that id already exists\n", n.Id) 98 } 99 } else if int(n.Id) < cap(g.Nodes) { // if we already have the capacity, but not the length 100 g.Nodes = g.Nodes[:n.Id+1] 101 } else { // if we need to increase capacity 102 futureNodes := make([]Node, int(n.Id)+1, numbers.Max(cap(g.Nodes)+positionsToExtend, int(n.Id)+1)) 103 copy(futureNodes, g.Nodes) 104 g.Nodes = futureNodes 105 } 106 g.Nodes[n.Id] = *n 107 return &g.Nodes[n.Id] 108 } 109 110 // AddEdge will append two edges one forward and one backwards for any two 111 // given node. Provide a probability float32 to specify a weight for an edge 112 // to describe the more likely path through the graph. 113 func AddEdge(u, v *Node, p float32) { 114 u.Next = append(u.Next, Edge{Dest: v, Prob: p}) 115 v.Prev = append(v.Prev, Edge{Dest: u, Prob: p}) 116 } 117 118 // SetEvenWeights will loop through a slice of edges and set the probability weight 119 // divided by the length of the slice. 120 func SetEvenWeights(u *Node) { 121 var edge int 122 var weights float32 = 1 / float32(len(u.Next)) 123 for edge = 0; edge < len(u.Next); edge++ { 124 u.Next[edge].Prob = weights 125 } 126 } 127 128 // Write function will process GenomeGraph and write the data to a file. 129 func Write(filename string, sg *GenomeGraph) { 130 lineLength := 50 131 file := fileio.EasyCreate(filename) 132 defer file.Close() 133 WriteToGraphHandle(file, sg, lineLength) 134 } 135 136 // EmptyGraph will allocate a new zero pointer to a simple graph and will allocate memory for the Nodes of the graph. 137 func EmptyGraph() *GenomeGraph { 138 return &GenomeGraph{Nodes: make([]Node, 0)} 139 } 140 141 // PrintGraph will quickly print simpleGraph to standard out. 142 func PrintGraph(gg *GenomeGraph) { 143 Write("/dev/stdout", gg) 144 } 145 146 // WriteToGraphHandle will help with any error handling when writing GenomeGraph to file. 147 func WriteToGraphHandle(file io.Writer, gg *GenomeGraph, lineLength int) { 148 var err error 149 var i, j int 150 for i = 0; i < len(gg.Nodes); i++ { 151 _, err = fmt.Fprintf(file, ">%d\n", gg.Nodes[i].Id) 152 exception.PanicOnErr(err) 153 for j = 0; j < len(gg.Nodes[i].Seq); j += lineLength { 154 if j+lineLength > len(gg.Nodes[i].Seq) { 155 _, err = fmt.Fprintf(file, "%s\n", dna.BasesToString(gg.Nodes[i].Seq[j:])) 156 exception.PanicOnErr(err) 157 } else { 158 _, err = fmt.Fprintf(file, "%s\n", dna.BasesToString(gg.Nodes[i].Seq[j:j+lineLength])) 159 exception.PanicOnErr(err) 160 } 161 } 162 } 163 for i = 0; i < len(gg.Nodes); i++ { 164 near := gg.Nodes[i].Next 165 if len(near) > 0 { 166 _, err = fmt.Fprintf(file, "%d", gg.Nodes[i].Id) 167 exception.PanicOnErr(err) 168 for j = 0; j < len(near); j++ { 169 _, err = fmt.Fprintf(file, "\t%v\t%d", near[j].Prob, near[j].Dest.Id) 170 exception.PanicOnErr(err) 171 } 172 _, err = fmt.Fprintf(file, "\n") 173 exception.PanicOnErr(err) 174 } 175 } 176 } 177 178 // BasesInGraph will calculate the number of bases contained in GenomeGraph using dnaTwoBit. 179 func BasesInGraph(g *GenomeGraph) int { 180 var i, baseCount int = 0, 0 181 for i = 0; i < len(g.Nodes); i++ { 182 baseCount += len(g.Nodes[i].Seq) 183 } 184 return baseCount 185 }