github.com/vertgenlab/gonomics@v1.0.0/genomeGraph/genomeGraph.go (about)

     1  // Package genomeGraph has structs and tools for reading, writing, editing and aligning graph representations of genomes
     2  package genomeGraph
     3  
     4  import (
     5  	"fmt"
     6  	"github.com/vertgenlab/gonomics/dna"
     7  	"github.com/vertgenlab/gonomics/dna/dnaTwoBit"
     8  	"github.com/vertgenlab/gonomics/exception"
     9  	"github.com/vertgenlab/gonomics/fileio"
    10  	"github.com/vertgenlab/gonomics/numbers"
    11  	"github.com/vertgenlab/gonomics/numbers/parse"
    12  	"io"
    13  	"log"
    14  	"strings"
    15  )
    16  
// GenomeGraph struct contains a slice of Nodes.
// AddNode stores every node at the index equal to its Id, so Nodes[i].Id == i
// for each node that was added through AddNode.
type GenomeGraph struct {
	Nodes []Node
}
    21  
// Node is uniquely defined by Id and is encoded with information
// describing sequence order and orientation and annotated variance.
type Node struct {
	Id        uint32            // identifier; AddNode also uses it as the node's index in GenomeGraph.Nodes
	ColumnId  uint32            // NOTE(review): not referenced anywhere else in this file — meaning to be confirmed
	Seq       []dna.Base        // only this field or the SeqThreeBit will be kept
	SeqTwoBit *dnaTwoBit.TwoBit // this will change to a ThreeBit or be removed
	Prev      []Edge            // incoming edges; AddEdge appends here with Dest set to the source node
	Next      []Edge            // outgoing edges; AddEdge appends here with Dest set to the destination node
} // used to have Name (string) and Info (Annotation)
    32  
// Edge describes the neighboring nodes and a weighted probability
// of the more likely path.
type Edge struct {
	Dest *Node   // node at the other end of the edge (the downstream node in Node.Next, the upstream node in Node.Prev)
	Prob float32 // weight given to this edge
}
    39  
// Annotation struct (currently commented out) is a uint64 encoding of the allele id, the starting position on the linear reference and the variant on the node:
// a single byte represents which allele the node came from, and a uint32 is used for the starting position on the chromosome of the linear reference.
// The uint8 variant is encoded as follows: 0=match, 1=mismatch, 2=insert, 3=deletion, 4=hap
    43  /*type Annotation struct {
    44  	Start   uint32
    45  	Allele  byte
    46  	Variant uint8
    47  }*/
    48  
    49  // Read will process a simple graph formated text file and parse the data into graph fields.
    50  func Read(filename string) *GenomeGraph {
    51  	simpleReader := fileio.NewByteReader(filename)
    52  	genome := EmptyGraph()
    53  	var weight float32
    54  	var line string
    55  	var words []string = make([]string, 0, 2)
    56  	var nodeId, homeNodeIdx, destNodeIdx uint32
    57  	var homeNode, destNode *Node
    58  	var i int
    59  
    60  	for reader, done := fileio.ReadLine(simpleReader); !done; reader, done = fileio.ReadLine(simpleReader) {
    61  		line = reader.String()
    62  		switch true {
    63  		case strings.HasPrefix(line, ">"):
    64  			nodeId = parse.StringToUint32(line[1:])
    65  			AddNode(genome, &Node{Id: nodeId})
    66  		case strings.Contains(line, "\t"):
    67  			words = strings.Split(line, "\t")
    68  			homeNodeIdx = parse.StringToUint32(words[0])
    69  			homeNode = &genome.Nodes[homeNodeIdx]
    70  			if len(words) > 2 {
    71  				for i = 1; i < len(words); i += 2 {
    72  					weight = parse.StringToFloat32(words[i])
    73  					destNodeIdx = parse.StringToUint32(words[i+1])
    74  					destNode = &genome.Nodes[destNodeIdx]
    75  					AddEdge(homeNode, destNode, weight)
    76  				}
    77  			}
    78  		default:
    79  			genome.Nodes[nodeId].Seq = append(genome.Nodes[nodeId].Seq, dna.ByteSliceToDnaBases(reader.Bytes())...)
    80  		}
    81  	}
    82  	for i = range genome.Nodes {
    83  		if len(genome.Nodes[i].Seq) != 0 {
    84  			genome.Nodes[i].SeqTwoBit = dnaTwoBit.NewTwoBit(genome.Nodes[i].Seq)
    85  		}
    86  	}
    87  	return genome
    88  }
    89  
    90  // AddNode will add the values in n to the graph at the index of n.Id
    91  // A pointer to the new location of the node (inside the graph) is returned.
    92  func AddNode(g *GenomeGraph, n *Node) *Node {
    93  	const positionsToExtend = 1000 // when we need to increase the slice, do this many nodes at a time
    94  	if int(n.Id) < len(g.Nodes) {  // if the memory for this node has already been allocated
    95  		// then we can overwrite it as long as it does not already exist
    96  		if len(g.Nodes[n.Id].Seq) != 0 { // if the node already exists because data has been written here
    97  			log.Panicf("Error: tried to add a node of id=%d, when that id already exists\n", n.Id)
    98  		}
    99  	} else if int(n.Id) < cap(g.Nodes) { // if we already have the capacity, but not the length
   100  		g.Nodes = g.Nodes[:n.Id+1]
   101  	} else { // if we need to increase capacity
   102  		futureNodes := make([]Node, int(n.Id)+1, numbers.Max(cap(g.Nodes)+positionsToExtend, int(n.Id)+1))
   103  		copy(futureNodes, g.Nodes)
   104  		g.Nodes = futureNodes
   105  	}
   106  	g.Nodes[n.Id] = *n
   107  	return &g.Nodes[n.Id]
   108  }
   109  
   110  // AddEdge will append two edges one forward and one backwards for any two
   111  // given node. Provide a probability float32 to specify a weight for an edge
   112  // to describe the more likely path through the graph.
   113  func AddEdge(u, v *Node, p float32) {
   114  	u.Next = append(u.Next, Edge{Dest: v, Prob: p})
   115  	v.Prev = append(v.Prev, Edge{Dest: u, Prob: p})
   116  }
   117  
   118  // SetEvenWeights will loop through a slice of edges and set the probability weight
   119  // divided by the length of the slice.
   120  func SetEvenWeights(u *Node) {
   121  	var edge int
   122  	var weights float32 = 1 / float32(len(u.Next))
   123  	for edge = 0; edge < len(u.Next); edge++ {
   124  		u.Next[edge].Prob = weights
   125  	}
   126  }
   127  
   128  // Write function will process GenomeGraph and write the data to a file.
   129  func Write(filename string, sg *GenomeGraph) {
   130  	lineLength := 50
   131  	file := fileio.EasyCreate(filename)
   132  	defer file.Close()
   133  	WriteToGraphHandle(file, sg, lineLength)
   134  }
   135  
   136  // EmptyGraph will allocate a new zero pointer to a simple graph and will allocate memory for the Nodes of the graph.
   137  func EmptyGraph() *GenomeGraph {
   138  	return &GenomeGraph{Nodes: make([]Node, 0)}
   139  }
   140  
// PrintGraph prints gg to standard out by handing the path "/dev/stdout" to Write,
// so the output has the same text format as a file created by Write.
// NOTE(review): relies on the path "/dev/stdout" being available on the host — confirm for non-Unix targets.
func PrintGraph(gg *GenomeGraph) {
	Write("/dev/stdout", gg)
}
   145  
   146  // WriteToGraphHandle will help with any error handling when writing GenomeGraph to file.
   147  func WriteToGraphHandle(file io.Writer, gg *GenomeGraph, lineLength int) {
   148  	var err error
   149  	var i, j int
   150  	for i = 0; i < len(gg.Nodes); i++ {
   151  		_, err = fmt.Fprintf(file, ">%d\n", gg.Nodes[i].Id)
   152  		exception.PanicOnErr(err)
   153  		for j = 0; j < len(gg.Nodes[i].Seq); j += lineLength {
   154  			if j+lineLength > len(gg.Nodes[i].Seq) {
   155  				_, err = fmt.Fprintf(file, "%s\n", dna.BasesToString(gg.Nodes[i].Seq[j:]))
   156  				exception.PanicOnErr(err)
   157  			} else {
   158  				_, err = fmt.Fprintf(file, "%s\n", dna.BasesToString(gg.Nodes[i].Seq[j:j+lineLength]))
   159  				exception.PanicOnErr(err)
   160  			}
   161  		}
   162  	}
   163  	for i = 0; i < len(gg.Nodes); i++ {
   164  		near := gg.Nodes[i].Next
   165  		if len(near) > 0 {
   166  			_, err = fmt.Fprintf(file, "%d", gg.Nodes[i].Id)
   167  			exception.PanicOnErr(err)
   168  			for j = 0; j < len(near); j++ {
   169  				_, err = fmt.Fprintf(file, "\t%v\t%d", near[j].Prob, near[j].Dest.Id)
   170  				exception.PanicOnErr(err)
   171  			}
   172  			_, err = fmt.Fprintf(file, "\n")
   173  			exception.PanicOnErr(err)
   174  		}
   175  	}
   176  }
   177  
   178  // BasesInGraph will calculate the number of bases contained in GenomeGraph using dnaTwoBit.
   179  func BasesInGraph(g *GenomeGraph) int {
   180  	var i, baseCount int = 0, 0
   181  	for i = 0; i < len(g.Nodes); i++ {
   182  		baseCount += len(g.Nodes[i].Seq)
   183  	}
   184  	return baseCount
   185  }