github.com/biogo/biogo@v1.0.4/align/pals/packseqs.go (about) 1 // Copyright ©2011-2012 The bíogo Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 package pals 6 7 import ( 8 "errors" 9 "fmt" 10 11 "github.com/biogo/biogo/alphabet" 12 "github.com/biogo/biogo/seq" 13 "github.com/biogo/biogo/seq/linear" 14 "github.com/biogo/biogo/util" 15 ) 16 17 var ( 18 binSize int = 1 << 10 19 minPadding int = 50 20 ) 21 22 // A Packer collects a set of sequence into a Packed sequence. 23 type Packer struct { 24 packed *Packed 25 lastPad int 26 length int 27 } 28 29 type Packed struct { 30 *linear.Seq 31 seqMap 32 } 33 34 // Convert coordinates in a packed sequence into a feat.Feature. 35 func (pa *Packed) feature(from, to int, comp bool) (*Feature, error) { 36 if comp { 37 from, to = pa.Len()-to, pa.Len()-from 38 } 39 if from >= to { 40 return nil, errors.New("pals: from > to") 41 } 42 43 // DPHit coordinates sometimes over/underflow. 44 // This is a lazy hack to work around it, should really figure 45 // out what is going on. 46 if from < 0 { 47 from = 0 48 } 49 if to > pa.Len() { 50 to = pa.Len() 51 } 52 53 // Take midpoint of segment -- lazy hack again, endpoints 54 // sometimes under / overflow 55 bin := (from + to) / (2 * binSize) 56 binCount := (pa.Len() + binSize - 1) / binSize 57 58 if bin < 0 || bin >= binCount { 59 return nil, fmt.Errorf("pals: bin %d out of range 0..%d", bin, binCount-1) 60 } 61 62 contigIndex := pa.seqMap.binMap[bin] 63 64 if contigIndex < 0 || contigIndex >= len(pa.seqMap.contigs) { 65 return nil, fmt.Errorf("pals: contig %s index %d out of range 0..%d", pa.ID, contigIndex, len(pa.seqMap.contigs)) 66 } 67 68 length := to - from 69 70 if length < 0 { 71 return nil, errors.New("pals: length < 0") 72 } 73 74 contig := pa.seqMap.contigs[contigIndex] 75 contigFrom := from - contig.from 76 contigTo := contigFrom + length 77 78 if contigFrom < 0 { 79 contigFrom = 0 80 } 81 82 if contigTo > contig.Len() { 83 contigTo = contig.Len() 84 } 85 86 return &Feature{ 87 ID: contig.ID, 88 From: contigFrom, 89 To: contigTo, 90 Loc: Contig(contig.ID), 91 }, nil 92 } 93 94 // Create a new Packer. 95 func NewPacker(id string) *Packer { 96 return &Packer{ 97 packed: &Packed{ 98 Seq: &linear.Seq{Annotation: seq.Annotation{ID: id}}, 99 seqMap: seqMap{}, 100 }, 101 } 102 } 103 104 // Pack a sequence into the Packed sequence. Returns a string giving diagnostic information. 105 func (pa *Packer) Pack(seq *linear.Seq) (string, error) { 106 if pa.packed.Alpha == nil { 107 pa.packed.Alpha = seq.Alpha 108 } else if pa.packed.Alpha != seq.Alpha { 109 return "", errors.New("pals: alphabet mismatch") 110 } 111 112 c := contig{Seq: seq} 113 114 padding := binSize - seq.Len()%binSize 115 if padding < minPadding { 116 padding += binSize 117 } 118 119 pa.length += pa.lastPad 120 c.from = pa.length 121 pa.length += seq.Len() 122 pa.lastPad = padding 123 124 m := &pa.packed.seqMap 125 bins := make([]int, (padding+seq.Len())/binSize) 126 for i := 0; i < len(bins); i++ { 127 bins[i] = len(m.contigs) 128 } 129 m.binMap = append(m.binMap, bins...) 130 m.contigs = append(m.contigs, c) 131 132 return fmt.Sprintf("%20s\t%10d\t%7d-%-d", seq.ID[:util.Min(20, len(seq.ID))], seq.Len(), len(m.binMap)-len(bins), len(m.binMap)-1), nil 133 } 134 135 // Finalise the sequence packing. 136 func (pa *Packer) FinalisePack() *Packed { 137 lastPad := 0 138 seq := make(alphabet.Letters, 0, pa.length) 139 for _, c := range pa.packed.seqMap.contigs { 140 padding := binSize - c.Len()%binSize 141 if padding < minPadding { 142 padding += binSize 143 } 144 seq = append(seq, alphabet.Letter('N').Repeat(lastPad)...) 145 seq = append(seq, c.Seq.Seq...) 146 lastPad = padding 147 } 148 pa.packed.Seq.Seq = seq 149 150 return pa.packed 151 } 152 153 // A contig holds a sequence within a SeqMap. 154 type contig struct { 155 *linear.Seq 156 from int 157 } 158 159 // A seqMap is a collection of sequences mapped to a Packed sequence. 160 type seqMap struct { 161 contigs []contig 162 binMap []int 163 }