github.com/biogo/biogo@v1.0.4/alphabet/alphabet.go (about)

     1  // Copyright ©2011-2013 The bíogo Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // Package alphabet describes biological sequence letters, including quality scores.
     6  package alphabet
     7  
     8  import (
     9  	"github.com/biogo/biogo/feat"
    10  
    11  	"errors"
    12  	"fmt"
    13  	"strings"
    14  	"unicode"
    15  )
    16  
const (
	// CaseSensitive is the value to pass as the caseSensitive argument of
	// NewAlphabet or NewComplementor to request a case sensitive alphabet.
	// Pass !CaseSensitive to request a case insensitive one.
	CaseSensitive = true
)
    20  
// Package alphabet provides default Alphabets for DNA, RNA and Protein. These
// alphabets are case insensitive. For the ungapped, non-redundant nucleic acid
// alphabets (DNA and RNA) the index of a letter is equal to the bitwise-complement
// of the index of its base-complement, modulo 4. All of the nucleic acid alphabets
// use '-' as the gap letter and 'n' as the ambiguous letter; Protein uses '-' and 'x'.
var (
	DNA = MustComplement(NewComplementor(
		"acgt",
		feat.DNA,
		MustPair(NewPairing("acgtnxACGTNX-", "tgcanxTGCANX-")),
		'-', 'n',
		!CaseSensitive,
	))

	DNAgapped = MustComplement(NewComplementor(
		"-acgt",
		feat.DNA,
		MustPair(NewPairing("acgtnxACGTNX-", "tgcanxTGCANX-")),
		'-', 'n',
		!CaseSensitive,
	))

	DNAredundant = MustComplement(NewComplementor(
		"-acmgrsvtwyhkdbn",
		feat.DNA,
		MustPair(NewPairing("acmgrsvtwyhkdbnxACMGRSVTWYHKDBNX-", "tgkcysbawrdmhvnxTGKCYSBAWRDMHVNX-")),
		'-', 'n',
		!CaseSensitive,
	))

	RNA = MustComplement(NewComplementor(
		"acgu",
		feat.RNA,
		MustPair(NewPairing("acgunxACGUNX-", "ugcanxUGCANX-")),
		'-', 'n',
		!CaseSensitive,
	))

	RNAgapped = MustComplement(NewComplementor(
		"-acgu",
		feat.RNA,
		MustPair(NewPairing("acgunxACGUNX-", "ugcanxUGCANX-")),
		'-', 'n',
		!CaseSensitive,
	))

	RNAredundant = MustComplement(NewComplementor(
		"-acmgrsvuwyhkdbn",
		feat.RNA,
		MustPair(NewPairing("acmgrsvuwyhkdbnxACMGRSVUWYHKDBNX-", "ugkcysbawrdmhvnxUGKCYSBAWRDMHVNX-")),
		'-', 'n',
		!CaseSensitive,
	))

	Protein = Must(NewAlphabet(
		"-abcdefghijklmnpqrstvwxyz*",
		feat.Protein,
		'-', 'x',
		!CaseSensitive,
	))
)
    81  
    82  // Must is a helper that wraps a call to a function returning (Alphabet, error)
    83  // and panics if the error is non-nil. It is intended for use in variable
    84  // initializations.
    85  func Must(a Alphabet, err error) Alphabet {
    86  	if err != nil {
    87  		panic(err)
    88  	}
    89  	return a
    90  }
    91  
    92  // MustComplement is a helper that wraps a call to a function returning (Complementor, error)
    93  // and panics if the error is non-nil. It is intended for use in variable
    94  // initializations.
    95  func MustComplement(c Complementor, err error) Complementor {
    96  	if err != nil {
    97  		panic(err)
    98  	}
    99  	return c
   100  }
   101  
   102  // MustPair is a helper that wraps a call to a function returning (*Pairing, error)
   103  // and panics if the error is non-nil. It is intended for use in variable
   104  // initializations.
   105  func MustPair(p *Pairing, err error) *Pairing {
   106  	if err != nil {
   107  		panic(err)
   108  	}
   109  	return p
   110  }
   111  
// Index is a pointer to a 256 element index table, holding one int entry
// for each possible byte value.
type Index *[256]int
   114  
// An Alphabet describes valid single character letters within a sequence.
type Alphabet interface {
	// IsValid reports whether a letter conforms to the alphabet.
	IsValid(Letter) bool

	// AllValid reports whether every letter in a slice conforms to the
	// alphabet. If not, pos is the index of the first invalid letter;
	// if all letters are valid, pos is a negative int.
	AllValid([]Letter) (ok bool, pos int)

	// AllValidQLetter reports whether the letter of every QLetter in a slice
	// conforms to the alphabet. If not, pos is the index of the first invalid
	// element; if all elements are valid, pos is a negative int.
	AllValidQLetter([]QLetter) (ok bool, pos int)

	// Len returns the number of letters in the alphabet definition.
	Len() int

	// IndexOf returns the index of a given letter.
	IndexOf(Letter) int

	// Letter returns the letter corresponding to the given index.
	Letter(int) Letter

	// LetterIndex returns a pointer to the internal array specifying
	// letter to index conversion. The returned index should not be altered.
	LetterIndex() Index

	// Letters returns a string of letters conforming to the alphabet in index
	// order. In case insensitive alphabets, both cases are presented.
	Letters() string

	// ValidLetters returns a slice of the internal []bool indicating valid
	// letters. The returned slice should not be altered.
	ValidLetters() []bool

	// Gap returns the gap character used by the alphabet.
	Gap() Letter

	// Ambiguous returns the character representing an ambiguous letter.
	Ambiguous() Letter

	// Moltype returns the molecule type of the alphabet.
	Moltype() feat.Moltype

	// IsCased reports whether the alphabet is case sensitive.
	IsCased() bool
}
   163  
// A Complementor is an Alphabet that describes the complementation relationships
// between letters.
type Complementor interface {
	Alphabet

	// Complement returns the complement of a letter and whether a
	// complement is defined for that letter.
	Complement(Letter) (Letter, bool)

	// ComplementTable returns a table of complements indexed by letter.
	ComplementTable() []Letter
}
   171  
// alpha is the single letter alphabet type backing the Alphabet
// implementations constructed in this file.
type alpha struct {
	// letters holds the alphabet's letters in index order; for case
	// insensitive alphabets it is the lower case form followed by the
	// upper case form.
	letters        string
	// length is the byte length of the letters definition the alphabet
	// was constructed from.
	length         int
	// valid marks, by byte value, the letters that conform to the alphabet.
	valid          [256]bool
	// index maps a byte value to its letter index, or -1 if it is not valid.
	index          [256]int
	gap, ambiguous Letter
	caseSensitive  bool
	molType        feat.Moltype
}
   182  
   183  func newAlphabet(letters string, molType feat.Moltype, gap, ambiguous Letter, caseSensitive bool) (*alpha, error) {
   184  	if strings.IndexFunc(letters, func(r rune) bool { return r < 0 || r > unicode.MaxASCII }) > -1 {
   185  		return nil, errors.New("alphabet: letters contains non-ASCII rune")
   186  	}
   187  
   188  	a := &alpha{
   189  		length:        len(letters),
   190  		gap:           gap,
   191  		ambiguous:     ambiguous,
   192  		caseSensitive: caseSensitive,
   193  		molType:       molType,
   194  	}
   195  
   196  	for i := range a.index {
   197  		a.index[i] = -1
   198  	}
   199  
   200  	if caseSensitive {
   201  		a.letters = letters
   202  		for i, l := range a.letters {
   203  			a.valid[l] = true
   204  			a.index[l] = i
   205  		}
   206  		return a, nil
   207  	}
   208  
   209  	a.letters = strings.ToLower(letters) + strings.ToUpper(letters)
   210  	for i, l := range a.letters[:len(letters)] {
   211  		a.valid[l] = true
   212  		a.index[l] = i
   213  	}
   214  	for i, l := range a.letters[len(letters):] {
   215  		a.valid[l] = true
   216  		a.index[l] = a.index[a.letters[i]]
   217  	}
   218  
   219  	return a, nil
   220  }
   221  
   222  func (a *alpha) Moltype() feat.Moltype { return a.molType }
   223  func (a *alpha) Len() int              { return a.length }
   224  func (a *alpha) IsCased() bool         { return a.caseSensitive }
   225  func (a *alpha) Gap() Letter           { return a.gap }
   226  func (a *alpha) Ambiguous() Letter     { return a.ambiguous }
   227  func (a *alpha) AllValidQLetter(n []QLetter) (bool, int) {
   228  	for i, v := range n {
   229  		if !a.valid[v.L] {
   230  			return false, i
   231  		}
   232  	}
   233  
   234  	return true, -1
   235  }
   236  func (a *alpha) AllValid(n []Letter) (bool, int) {
   237  	for i, v := range n {
   238  		if !a.valid[v] {
   239  			return false, i
   240  		}
   241  	}
   242  
   243  	return true, -1
   244  }
   245  func (a *alpha) IsValid(n Letter) bool {
   246  	return a.valid[n]
   247  }
   248  func (a *alpha) Letter(i int) Letter {
   249  	return Letter(a.letters[:a.length][i])
   250  }
   251  func (a *alpha) IndexOf(n Letter) int {
   252  	return a.index[n]
   253  }
   254  func (a *alpha) ValidLetters() []bool { return a.valid[:] }
   255  func (a *alpha) LetterIndex() Index   { return Index(&a.index) }
   256  func (a *alpha) Letters() string      { return a.letters }
   257  
// A Pairing provides a lookup table between a letter and its complement.
type Pairing struct {
	// pair holds the complement of each byte value; NewPairing maps byte
	// values without a definition to themselves.
	pair        []Letter
	// ok records which byte values were given a complement definition.
	ok          []bool
	// complements is a copy of pair in which entries without a definition
	// have the bit above the ASCII range set.
	complements [256]Letter
}
   264  
   265  // NewPairing create a new Pairing from a pair of strings. Pairing definitions must be
   266  // a bijection and must contain only ASCII characters.
   267  func NewPairing(s, c string) (*Pairing, error) {
   268  	if len(s) != len(c) {
   269  		return nil, errors.New("alphabet: length of pairing definitions do not match")
   270  	}
   271  
   272  	p := &Pairing{
   273  		pair: make([]Letter, 256),
   274  		ok:   make([]bool, 256),
   275  	}
   276  
   277  	for i := range p.pair {
   278  		p.pair[i] = Letter(i)
   279  	}
   280  
   281  	cr := []rune(c)
   282  	for i, v := range s {
   283  		if v < 0 || cr[i] < 0 || v > unicode.MaxASCII || cr[i] > unicode.MaxASCII {
   284  			return nil, errors.New("alphabet: pairing definition contains non-ASCII rune")
   285  		}
   286  		p.pair[v] = Letter(cr[i])
   287  		p.ok[v] = true
   288  	}
   289  	for i, l := range s {
   290  		if Letter(l) != p.pair[p.pair[l]] {
   291  			return nil, errors.New("alphabet: pairing definition is not a bijection")
   292  		}
   293  		if Letter(c[i]) != p.pair[p.pair[c[i]]] {
   294  			return nil, errors.New("alphabet: pairing definition is not a bijection")
   295  		}
   296  	}
   297  	copy(p.complements[:], p.pair)
   298  	for i, ok := range p.ok {
   299  		if !ok {
   300  			p.complements[i] |= unicode.MaxASCII + 1
   301  		}
   302  	}
   303  	return p, nil
   304  }
   305  
   306  // Returns the complement of a letter and true if the complement is a valid letter otherwise unchanged and false.
   307  func (p *Pairing) Complement(l Letter) (c Letter, ok bool) { return p.pair[l], p.ok[l] }
   308  
   309  // Returns a complementation table based on the internal representation. Invalid pairs hold a value outside the ASCII range.
   310  // The caller must not modify the returned table.
   311  func (p *Pairing) ComplementTable() []Letter {
   312  	return p.complements[:]
   313  }
   314  
// nucleic combines an alphabet with a complement pairing; NewComplementor
// returns it as a Complementor, with the methods supplied by the two
// embedded types.
type nucleic struct {
	*alpha
	*Pairing
}
   319  
   320  // NewComplementor returns a complementing alphabet. The Complement table is checked for
   321  // validity and an error is returned if an invalid complement pair is found. Pairings
   322  // that result in no change but would otherwise be invalid are allowed. Letter parameter
   323  // handling is the same as for NewAlphabet.
   324  func NewComplementor(letters string, molType feat.Moltype, pairs *Pairing, gap, ambiguous Letter, caseSensitive bool) (Complementor, error) {
   325  	a, err := newAlphabet(letters, molType, gap, ambiguous, caseSensitive)
   326  	if err != nil {
   327  		return nil, err
   328  	}
   329  
   330  	if pairs != nil {
   331  		for i, v := range pairs.pair {
   332  			if !(pairs.ok[i] || Letter(i&unicode.MaxASCII) == v&unicode.MaxASCII) && !(a.valid[i] && a.valid[v]) {
   333  				return nil, fmt.Errorf("alphabet: invalid pairing: %c (%d) -> %c (%d)", i, i, v, v)
   334  			}
   335  		}
   336  	}
   337  
   338  	return &nucleic{
   339  		alpha:   a,
   340  		Pairing: pairs,
   341  	}, nil
   342  }
   343  
   344  // NewAlphabet returns a new Alphabet based on the provided definitions. Index values
   345  // for letters reflect order of the letters parameter. Letters must be within the
   346  // ASCII range. No check is performed to determine whether letters appear more than once,
   347  // the index of a letter will be the position of the last occurrence of that letter in the
   348  // letters parameter.
   349  func NewAlphabet(letters string, molType feat.Moltype, gap, ambiguous Letter, caseSensitive bool) (Alphabet, error) {
   350  	return newAlphabet(letters, molType, gap, ambiguous, caseSensitive)
   351  }