// Source: github.com/biogo/biogo@v1.0.4/alphabet/alphabet.go

// Copyright ©2011-2013 The bíogo Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Package alphabet describes biological sequence letters, including quality scores.
package alphabet

import (
	"github.com/biogo/biogo/feat"

	"errors"
	"fmt"
	"strings"
	"unicode"
)

const (
	// CaseSensitive is the value to pass as the caseSensitive argument of
	// NewAlphabet and NewComplementor to obtain a case sensitive alphabet.
	// Pass !CaseSensitive for a case insensitive alphabet.
	CaseSensitive = true
)

// Default alphabets for DNA, RNA and protein.
//
// All of these alphabets are case insensitive. The nucleic acid alphabets
// use '-' as the gap letter and 'n' as the ambiguous letter; Protein uses
// '-' and 'x'.
//
// The ungapped, non-redundant nucleic acid alphabets (DNA and RNA) satisfy
// the condition that the index of a letter is equal to the bitwise-complement
// of the index of the base-complement, modulo 4. The gapped and redundant
// alphabets place '-' at index 0, so their letter indices are shifted.
var (
	DNA = MustComplement(NewComplementor(
		"acgt",
		feat.DNA,
		MustPair(NewPairing("acgtnxACGTNX-", "tgcanxTGCANX-")),
		'-', 'n',
		!CaseSensitive,
	))

	DNAgapped = MustComplement(NewComplementor(
		"-acgt",
		feat.DNA,
		MustPair(NewPairing("acgtnxACGTNX-", "tgcanxTGCANX-")),
		'-', 'n',
		!CaseSensitive,
	))

	DNAredundant = MustComplement(NewComplementor(
		"-acmgrsvtwyhkdbn",
		feat.DNA,
		MustPair(NewPairing("acmgrsvtwyhkdbnxACMGRSVTWYHKDBNX-", "tgkcysbawrdmhvnxTGKCYSBAWRDMHVNX-")),
		'-', 'n',
		!CaseSensitive,
	))

	RNA = MustComplement(NewComplementor(
		"acgu",
		feat.RNA,
		MustPair(NewPairing("acgunxACGUNX-", "ugcanxUGCANX-")),
		'-', 'n',
		!CaseSensitive,
	))

	RNAgapped = MustComplement(NewComplementor(
		"-acgu",
		feat.RNA,
		MustPair(NewPairing("acgunxACGUNX-", "ugcanxUGCANX-")),
		'-', 'n',
		!CaseSensitive,
	))

	RNAredundant = MustComplement(NewComplementor(
		"-acmgrsvuwyhkdbn",
		feat.RNA,
		MustPair(NewPairing("acmgrsvuwyhkdbnxACMGRSVUWYHKDBNX-", "ugkcysbawrdmhvnxUGKCYSBAWRDMHVNX-")),
		'-', 'n',
		!CaseSensitive,
	))

	Protein = Must(NewAlphabet(
		"-abcdefghijklmnpqrstvwxyz*",
		feat.Protein,
		'-', 'x',
		!CaseSensitive,
	))
)

// Must is a helper that wraps a call to a function returning (Alphabet, error)
// and panics if the error is non-nil. It is intended for use in variable
// initializations.
func Must(a Alphabet, err error) Alphabet {
	if err != nil {
		panic(err)
	}
	return a
}

// MustComplement is a helper that wraps a call to a function returning (Complementor, error)
// and panics if the error is non-nil. It is intended for use in variable
// initializations.
func MustComplement(c Complementor, err error) Complementor {
	if err != nil {
		panic(err)
	}
	return c
}

// MustPair is a helper that wraps a call to a function returning (*Pairing, error)
// and panics if the error is non-nil. It is intended for use in variable
// initializations.
func MustPair(p *Pairing, err error) *Pairing {
	if err != nil {
		panic(err)
	}
	return p
}

// Index is a pointer to an index table mapping each of the 256 possible
// byte values to a letter index.
type Index *[256]int

// An Alphabet describes valid single character letters within a sequence.
type Alphabet interface {
	// IsValid reports whether a letter conforms to the alphabet.
	IsValid(Letter) bool

	// AllValid reports whether a slice of Letters conforms to the alphabet.
	// It returns the index of the first invalid Letter,
	// or a negative int if all Letters are valid.
	AllValid([]Letter) (ok bool, pos int)

	// AllValidQLetter reports whether a slice of QLetters conforms to the alphabet.
	// It returns the index of the first invalid QLetter,
	// or a negative int if all QLetters are valid.
	AllValidQLetter([]QLetter) (ok bool, pos int)

	// Len returns the number of distinct valid letters in the alphabet.
	Len() int

	// IndexOf returns the index of a given letter.
	IndexOf(Letter) int

	// Letter returns the letter corresponding to the given index.
	Letter(int) Letter

	// LetterIndex returns a pointer to the internal array specifying
	// letter to index conversion. The returned index should not be altered.
	LetterIndex() Index

	// Letters returns a string of letters conforming to the alphabet in index
	// order. In case insensitive alphabets, both cases are presented.
	Letters() string

	// ValidLetters returns a slice of the internal []bool indicating valid
	// letters. The returned slice should not be altered.
	ValidLetters() []bool

	// Gap returns the gap character used by the alphabet.
	Gap() Letter

	// Ambiguous returns the character representing an ambiguous letter.
	Ambiguous() Letter

	// Moltype returns the molecule type of the alphabet.
	Moltype() feat.Moltype

	// IsCased returns whether the alphabet is case sensitive.
	IsCased() bool
}

// A Complementor is an Alphabet that describes the complementation relationships
// between letters.
type Complementor interface {
	Alphabet

	// Complement returns the complement of a letter and whether a
	// complement is defined for that letter.
	Complement(Letter) (Letter, bool)

	// ComplementTable returns a lookup table from letter to complement.
	// The returned table should not be altered.
	ComplementTable() []Letter
}

// alpha is the single letter alphabet type.
173 type alpha struct { 174 letters string 175 length int 176 valid [256]bool 177 index [256]int 178 gap, ambiguous Letter 179 caseSensitive bool 180 molType feat.Moltype 181 } 182 183 func newAlphabet(letters string, molType feat.Moltype, gap, ambiguous Letter, caseSensitive bool) (*alpha, error) { 184 if strings.IndexFunc(letters, func(r rune) bool { return r < 0 || r > unicode.MaxASCII }) > -1 { 185 return nil, errors.New("alphabet: letters contains non-ASCII rune") 186 } 187 188 a := &alpha{ 189 length: len(letters), 190 gap: gap, 191 ambiguous: ambiguous, 192 caseSensitive: caseSensitive, 193 molType: molType, 194 } 195 196 for i := range a.index { 197 a.index[i] = -1 198 } 199 200 if caseSensitive { 201 a.letters = letters 202 for i, l := range a.letters { 203 a.valid[l] = true 204 a.index[l] = i 205 } 206 return a, nil 207 } 208 209 a.letters = strings.ToLower(letters) + strings.ToUpper(letters) 210 for i, l := range a.letters[:len(letters)] { 211 a.valid[l] = true 212 a.index[l] = i 213 } 214 for i, l := range a.letters[len(letters):] { 215 a.valid[l] = true 216 a.index[l] = a.index[a.letters[i]] 217 } 218 219 return a, nil 220 } 221 222 func (a *alpha) Moltype() feat.Moltype { return a.molType } 223 func (a *alpha) Len() int { return a.length } 224 func (a *alpha) IsCased() bool { return a.caseSensitive } 225 func (a *alpha) Gap() Letter { return a.gap } 226 func (a *alpha) Ambiguous() Letter { return a.ambiguous } 227 func (a *alpha) AllValidQLetter(n []QLetter) (bool, int) { 228 for i, v := range n { 229 if !a.valid[v.L] { 230 return false, i 231 } 232 } 233 234 return true, -1 235 } 236 func (a *alpha) AllValid(n []Letter) (bool, int) { 237 for i, v := range n { 238 if !a.valid[v] { 239 return false, i 240 } 241 } 242 243 return true, -1 244 } 245 func (a *alpha) IsValid(n Letter) bool { 246 return a.valid[n] 247 } 248 func (a *alpha) Letter(i int) Letter { 249 return Letter(a.letters[:a.length][i]) 250 } 251 func (a *alpha) IndexOf(n Letter) int { 252 
return a.index[n] 253 } 254 func (a *alpha) ValidLetters() []bool { return a.valid[:] } 255 func (a *alpha) LetterIndex() Index { return Index(&a.index) } 256 func (a *alpha) Letters() string { return a.letters } 257 258 // A Pairing provides a lookup table between a letter and its complement. 259 type Pairing struct { 260 pair []Letter 261 ok []bool 262 complements [256]Letter 263 } 264 265 // NewPairing create a new Pairing from a pair of strings. Pairing definitions must be 266 // a bijection and must contain only ASCII characters. 267 func NewPairing(s, c string) (*Pairing, error) { 268 if len(s) != len(c) { 269 return nil, errors.New("alphabet: length of pairing definitions do not match") 270 } 271 272 p := &Pairing{ 273 pair: make([]Letter, 256), 274 ok: make([]bool, 256), 275 } 276 277 for i := range p.pair { 278 p.pair[i] = Letter(i) 279 } 280 281 cr := []rune(c) 282 for i, v := range s { 283 if v < 0 || cr[i] < 0 || v > unicode.MaxASCII || cr[i] > unicode.MaxASCII { 284 return nil, errors.New("alphabet: pairing definition contains non-ASCII rune") 285 } 286 p.pair[v] = Letter(cr[i]) 287 p.ok[v] = true 288 } 289 for i, l := range s { 290 if Letter(l) != p.pair[p.pair[l]] { 291 return nil, errors.New("alphabet: pairing definition is not a bijection") 292 } 293 if Letter(c[i]) != p.pair[p.pair[c[i]]] { 294 return nil, errors.New("alphabet: pairing definition is not a bijection") 295 } 296 } 297 copy(p.complements[:], p.pair) 298 for i, ok := range p.ok { 299 if !ok { 300 p.complements[i] |= unicode.MaxASCII + 1 301 } 302 } 303 return p, nil 304 } 305 306 // Returns the complement of a letter and true if the complement is a valid letter otherwise unchanged and false. 307 func (p *Pairing) Complement(l Letter) (c Letter, ok bool) { return p.pair[l], p.ok[l] } 308 309 // Returns a complementation table based on the internal representation. Invalid pairs hold a value outside the ASCII range. 310 // The caller must not modify the returned table. 
311 func (p *Pairing) ComplementTable() []Letter { 312 return p.complements[:] 313 } 314 315 type nucleic struct { 316 *alpha 317 *Pairing 318 } 319 320 // NewComplementor returns a complementing alphabet. The Complement table is checked for 321 // validity and an error is returned if an invalid complement pair is found. Pairings 322 // that result in no change but would otherwise be invalid are allowed. Letter parameter 323 // handling is the same as for NewAlphabet. 324 func NewComplementor(letters string, molType feat.Moltype, pairs *Pairing, gap, ambiguous Letter, caseSensitive bool) (Complementor, error) { 325 a, err := newAlphabet(letters, molType, gap, ambiguous, caseSensitive) 326 if err != nil { 327 return nil, err 328 } 329 330 if pairs != nil { 331 for i, v := range pairs.pair { 332 if !(pairs.ok[i] || Letter(i&unicode.MaxASCII) == v&unicode.MaxASCII) && !(a.valid[i] && a.valid[v]) { 333 return nil, fmt.Errorf("alphabet: invalid pairing: %c (%d) -> %c (%d)", i, i, v, v) 334 } 335 } 336 } 337 338 return &nucleic{ 339 alpha: a, 340 Pairing: pairs, 341 }, nil 342 } 343 344 // NewAlphabet returns a new Alphabet based on the provided definitions. Index values 345 // for letters reflect order of the letters parameter. Letters must be within the 346 // ASCII range. No check is performed to determine whether letters appear more than once, 347 // the index of a letter will be the position of the last occurrence of that letter in the 348 // letters parameter. 349 func NewAlphabet(letters string, molType feat.Moltype, gap, ambiguous Letter, caseSensitive bool) (Alphabet, error) { 350 return newAlphabet(letters, molType, gap, ambiguous, caseSensitive) 351 }