github.com/biogo/biogo@v1.0.4/seq/seq.go (about) 1 // Copyright ©2011-2013 The bíogo Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 // Package seq provides the base for storage and manipulation of biological sequence information. 6 // 7 // A variety of sequence types are provided by derived packages including linear and protein sequence 8 // with and without quality scores. Multiple sequence data is also supported as unaligned sets and aligned sequences. 9 // 10 // Quality scoring is based on Phred scores, although there is the capacity to interconvert between Phred and 11 // Solexa scores and a Solexa quality package is provided, though not integrated. 12 package seq 13 14 import ( 15 "github.com/biogo/biogo/alphabet" 16 "github.com/biogo/biogo/feat" 17 18 "math" 19 ) 20 21 const ( 22 Start = 1 << iota 23 End 24 ) 25 26 var ( 27 // The default value for Qphred scores from non-quality sequences. 28 DefaultQphred alphabet.Qphred = 40 29 // The default encoding for Qphred scores from non-quality sequences. 30 DefaultEncoding alphabet.Encoding = alphabet.Sanger 31 ) 32 33 type Alphabeter interface { 34 Alphabet() alphabet.Alphabet 35 } 36 37 // A QFilter returns a letter based on an alphabet, quality letter and quality threshold. 38 type QFilter func(a alphabet.Alphabet, thresh alphabet.Qphred, ql alphabet.QLetter) alphabet.Letter 39 40 var ( 41 // AmbigFilter is a QFilter function that returns the given alphabet's ambiguous position 42 // letter for quality letters with a quality score below the specified threshold. 43 AmbigFilter QFilter = func(a alphabet.Alphabet, thresh alphabet.Qphred, l alphabet.QLetter) alphabet.Letter { 44 if l.L == a.Gap() || l.Q >= thresh { 45 return l.L 46 } 47 return a.Ambiguous() 48 } 49 50 // CaseFilter is a QFilter function that returns a lower case letter for quality letters 51 // with a quality score below the specified threshold and upper case equal to or above the threshold. 52 CaseFilter QFilter = func(a alphabet.Alphabet, thresh alphabet.Qphred, l alphabet.QLetter) alphabet.Letter { 53 switch { 54 case l.L == a.Gap(): 55 return l.L 56 case l.Q >= thresh: 57 return l.L &^ ('a' - 'A') 58 } 59 return l.L | ('a' - 'A') 60 } 61 ) 62 63 // A Sequence is a feature that stores sequence information. 64 type Sequence interface { 65 Feature 66 At(int) alphabet.QLetter // Return the letter at a specific position. 67 Set(int, alphabet.QLetter) error // Set the letter at a specific position. 68 Alphabet() alphabet.Alphabet // Return the Alphabet being used. 69 RevComp() // Reverse complement the sequence. 70 Reverse() // Reverse the order of elements in the sequence. 71 New() Sequence // Return a zero value of the sequence type, with the same alphabet. 72 Clone() Sequence // Return a copy of the Sequence. 73 CloneAnnotation() *Annotation // Return a copy of the sequence's annotation. 74 Slicer 75 Conformationer 76 ConformationSetter 77 } 78 79 // A Feature describes the basis for sequence features. 80 type Feature interface { 81 feat.Feature 82 feat.Offsetter 83 } 84 85 // A Conformationer can give information regarding the sequence's conformation. For the 86 // purposes of sequtils, types that are not a Conformationer are treated as linear. 87 type Conformationer interface { 88 Conformation() feat.Conformation 89 } 90 91 // A ConformationSetter can set its sequence conformation. 92 type ConformationSetter interface { 93 SetConformation(feat.Conformation) error 94 } 95 96 // A Slicer returns and sets a Slice. 97 type Slicer interface { 98 Slice() alphabet.Slice 99 SetSlice(alphabet.Slice) 100 } 101 102 // A Scorer is a sequence type that provides Phred-based scoring information. 103 type Scorer interface { 104 Feature 105 EAt(int) float64 // Return the p(Error) for a specific position. 106 SetE(int, float64) error // Set the p(Error) for a specific position. 107 Encoding() alphabet.Encoding // Return the score encoding scheme. 108 SetEncoding(alphabet.Encoding) error // Set the score encoding scheme. 109 QEncode(int) byte // Encode the quality at the specified position according the the encoding scheme. 110 } 111 112 // A Quality is a feature whose elements are Phred scores. 113 type Quality interface { 114 Scorer 115 Copy() Quality // Return a copy of the Quality. 116 } 117 118 // Rower describes the interface for sets of sequences or aligned multiple sequences. 119 type Rower interface { 120 Rows() int 121 Row(i int) Sequence 122 } 123 124 // RowAppender is a type for sets of sequences or aligned multiple sequences that can append letters to individual or grouped sequences. 125 type RowAppender interface { 126 Rower 127 AppendEach(a [][]alphabet.QLetter) (err error) 128 } 129 130 // An Appender can append letters. 131 type Appender interface { 132 AppendLetters(...alphabet.Letter) error 133 AppendQLetters(...alphabet.QLetter) error 134 } 135 136 // Aligned describes the interface for aligned multiple sequences. 137 type Aligned interface { 138 Start() int 139 End() int 140 Rows() int 141 Column(pos int, fill bool) []alphabet.Letter 142 ColumnQL(pos int, fill bool) []alphabet.QLetter 143 } 144 145 // An AlignedAppender is a multiple sequence alignment that can append letters. 146 type AlignedAppender interface { 147 Aligned 148 AppendColumns(a ...[]alphabet.QLetter) (err error) 149 AppendEach(a [][]alphabet.QLetter) (err error) 150 } 151 152 // ConsenseFunc is a function type that returns the consensus letter for a column of an alignment. 153 type ConsenseFunc func(a Aligned, alpha alphabet.Alphabet, pos int, fill bool) alphabet.QLetter 154 155 var ( 156 // The default ConsenseFunc function. 157 DefaultConsensus ConsenseFunc = func(a Aligned, alpha alphabet.Alphabet, pos int, fill bool) alphabet.QLetter { 158 w := make([]int, alpha.Len()) 159 c := a.Column(pos, fill) 160 161 for _, l := range c { 162 if alpha.IsValid(l) { 163 w[alpha.IndexOf(l)]++ 164 } 165 } 166 167 var max, maxi int 168 for i, v := range w { 169 if v > max { 170 max, maxi = v, i 171 } 172 } 173 174 return alphabet.QLetter{ 175 L: alpha.Letter(maxi), 176 Q: alphabet.Ephred(1 - (float64(max) / float64(len(c)))), 177 } 178 } 179 180 // A default ConsenseFunc function that takes letter quality into account. 181 // http://staden.sourceforge.net/manual/gap4_unix_120.html 182 DefaultQConsensus ConsenseFunc = func(a Aligned, alpha alphabet.Alphabet, pos int, fill bool) alphabet.QLetter { 183 w := make([]float64, alpha.Len()) 184 for i := range w { 185 w[i] = 1 186 } 187 188 others := float64(alpha.Len() - 1) 189 c := a.ColumnQL(pos, fill) 190 for _, l := range c { 191 if alpha.IsValid(l.L) { 192 i, alt := alpha.IndexOf(l.L), l.Q.ProbE() 193 p := (1 - alt) 194 alt /= others 195 for b := range w { 196 if i == b { 197 w[b] *= p 198 } else { 199 w[b] *= alt 200 } 201 } 202 } 203 } 204 205 var ( 206 max = 0. 207 sum float64 208 best, count int 209 ) 210 for _, p := range w { 211 sum += p 212 } 213 for i, v := range w { 214 if v /= sum; v > max { 215 max, best = v, i 216 count = 0 217 } 218 if v == max || math.Abs(max-v) < FloatTolerance { 219 count++ 220 } 221 } 222 223 if count > 1 { 224 return alphabet.QLetter{ 225 L: alpha.Ambiguous(), 226 Q: 0, 227 } 228 } 229 230 return alphabet.QLetter{ 231 L: alpha.Letter(best), 232 Q: alphabet.Ephred(1 - max), 233 } 234 } 235 ) 236 237 // Tolerance on float comparison for DefaultQConsensus. 238 var FloatTolerance float64 = 1e-10