github.com/biogo/biogo@v1.0.4/seq/seq.go (about)

     1  // Copyright ©2011-2013 The bíogo Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // Package seq provides the base for storage and manipulation of biological sequence information.
     6  //
     7  // A variety of sequence types are provided by derived packages including linear and protein sequence
     8  // with and without quality scores. Multiple sequence data is also supported as unaligned sets and aligned sequences.
     9  //
    10  // Quality scoring is based on Phred scores, although there is the capacity to interconvert between Phred and
    11  // Solexa scores and a Solexa quality package is provided, though not integrated.
    12  package seq
    13  
    14  import (
    15  	"github.com/biogo/biogo/alphabet"
    16  	"github.com/biogo/biogo/feat"
    17  
    18  	"math"
    19  )
    20  
    21  const (
    22  	Start = 1 << iota
    23  	End
    24  )
    25  
    26  var (
    27  	// The default value for Qphred scores from non-quality sequences.
    28  	DefaultQphred alphabet.Qphred = 40
    29  	// The default encoding for Qphred scores from non-quality sequences.
    30  	DefaultEncoding alphabet.Encoding = alphabet.Sanger
    31  )
    32  
    33  type Alphabeter interface {
    34  	Alphabet() alphabet.Alphabet
    35  }
    36  
    37  // A QFilter returns a letter based on an alphabet, quality letter and quality threshold.
    38  type QFilter func(a alphabet.Alphabet, thresh alphabet.Qphred, ql alphabet.QLetter) alphabet.Letter
    39  
    40  var (
    41  	// AmbigFilter is a QFilter function that returns the given alphabet's ambiguous position
    42  	// letter for quality letters with a quality score below the specified threshold.
    43  	AmbigFilter QFilter = func(a alphabet.Alphabet, thresh alphabet.Qphred, l alphabet.QLetter) alphabet.Letter {
    44  		if l.L == a.Gap() || l.Q >= thresh {
    45  			return l.L
    46  		}
    47  		return a.Ambiguous()
    48  	}
    49  
    50  	// CaseFilter is a QFilter function that returns a lower case letter for quality letters
    51  	// with a quality score below the specified threshold and upper case equal to or above the threshold.
    52  	CaseFilter QFilter = func(a alphabet.Alphabet, thresh alphabet.Qphred, l alphabet.QLetter) alphabet.Letter {
    53  		switch {
    54  		case l.L == a.Gap():
    55  			return l.L
    56  		case l.Q >= thresh:
    57  			return l.L &^ ('a' - 'A')
    58  		}
    59  		return l.L | ('a' - 'A')
    60  	}
    61  )
    62  
    63  // A Sequence is a feature that stores sequence information.
    64  type Sequence interface {
    65  	Feature
    66  	At(int) alphabet.QLetter         // Return the letter at a specific position.
    67  	Set(int, alphabet.QLetter) error // Set the letter at a specific position.
    68  	Alphabet() alphabet.Alphabet     // Return the Alphabet being used.
    69  	RevComp()                        // Reverse complement the sequence.
    70  	Reverse()                        // Reverse the order of elements in the sequence.
    71  	New() Sequence                   // Return a zero value of the sequence type, with the same alphabet.
    72  	Clone() Sequence                 // Return a copy of the Sequence.
    73  	CloneAnnotation() *Annotation    // Return a copy of the sequence's annotation.
    74  	Slicer
    75  	Conformationer
    76  	ConformationSetter
    77  }
    78  
    79  // A Feature describes the basis for sequence features.
    80  type Feature interface {
    81  	feat.Feature
    82  	feat.Offsetter
    83  }
    84  
    85  // A Conformationer can give information regarding the sequence's conformation. For the
    86  // purposes of sequtils, types that are not a Conformationer are treated as linear.
    87  type Conformationer interface {
    88  	Conformation() feat.Conformation
    89  }
    90  
    91  // A ConformationSetter can set its sequence conformation.
    92  type ConformationSetter interface {
    93  	SetConformation(feat.Conformation) error
    94  }
    95  
    96  // A Slicer returns and sets a Slice.
    97  type Slicer interface {
    98  	Slice() alphabet.Slice
    99  	SetSlice(alphabet.Slice)
   100  }
   101  
   102  // A Scorer is a sequence type that provides Phred-based scoring information.
   103  type Scorer interface {
   104  	Feature
   105  	EAt(int) float64                     // Return the p(Error) for a specific position.
   106  	SetE(int, float64) error             // Set the p(Error) for a specific position.
   107  	Encoding() alphabet.Encoding         // Return the score encoding scheme.
   108  	SetEncoding(alphabet.Encoding) error // Set the score encoding scheme.
   109  	QEncode(int) byte                    // Encode the quality at the specified position according the the encoding scheme.
   110  }
   111  
   112  // A Quality is a feature whose elements are Phred scores.
   113  type Quality interface {
   114  	Scorer
   115  	Copy() Quality // Return a copy of the Quality.
   116  }
   117  
   118  // Rower describes the interface for sets of sequences or aligned multiple sequences.
   119  type Rower interface {
   120  	Rows() int
   121  	Row(i int) Sequence
   122  }
   123  
   124  // RowAppender is a type for sets of sequences or aligned multiple sequences that can append letters to individual or grouped sequences.
   125  type RowAppender interface {
   126  	Rower
   127  	AppendEach(a [][]alphabet.QLetter) (err error)
   128  }
   129  
   130  // An Appender can append letters.
   131  type Appender interface {
   132  	AppendLetters(...alphabet.Letter) error
   133  	AppendQLetters(...alphabet.QLetter) error
   134  }
   135  
   136  // Aligned describes the interface for aligned multiple sequences.
   137  type Aligned interface {
   138  	Start() int
   139  	End() int
   140  	Rows() int
   141  	Column(pos int, fill bool) []alphabet.Letter
   142  	ColumnQL(pos int, fill bool) []alphabet.QLetter
   143  }
   144  
   145  // An AlignedAppender is a multiple sequence alignment that can append letters.
   146  type AlignedAppender interface {
   147  	Aligned
   148  	AppendColumns(a ...[]alphabet.QLetter) (err error)
   149  	AppendEach(a [][]alphabet.QLetter) (err error)
   150  }
   151  
   152  // ConsenseFunc is a function type that returns the consensus letter for a column of an alignment.
   153  type ConsenseFunc func(a Aligned, alpha alphabet.Alphabet, pos int, fill bool) alphabet.QLetter
   154  
   155  var (
   156  	// The default ConsenseFunc function.
   157  	DefaultConsensus ConsenseFunc = func(a Aligned, alpha alphabet.Alphabet, pos int, fill bool) alphabet.QLetter {
   158  		w := make([]int, alpha.Len())
   159  		c := a.Column(pos, fill)
   160  
   161  		for _, l := range c {
   162  			if alpha.IsValid(l) {
   163  				w[alpha.IndexOf(l)]++
   164  			}
   165  		}
   166  
   167  		var max, maxi int
   168  		for i, v := range w {
   169  			if v > max {
   170  				max, maxi = v, i
   171  			}
   172  		}
   173  
   174  		return alphabet.QLetter{
   175  			L: alpha.Letter(maxi),
   176  			Q: alphabet.Ephred(1 - (float64(max) / float64(len(c)))),
   177  		}
   178  	}
   179  
   180  	// A default ConsenseFunc function that takes letter quality into account.
   181  	// http://staden.sourceforge.net/manual/gap4_unix_120.html
   182  	DefaultQConsensus ConsenseFunc = func(a Aligned, alpha alphabet.Alphabet, pos int, fill bool) alphabet.QLetter {
   183  		w := make([]float64, alpha.Len())
   184  		for i := range w {
   185  			w[i] = 1
   186  		}
   187  
   188  		others := float64(alpha.Len() - 1)
   189  		c := a.ColumnQL(pos, fill)
   190  		for _, l := range c {
   191  			if alpha.IsValid(l.L) {
   192  				i, alt := alpha.IndexOf(l.L), l.Q.ProbE()
   193  				p := (1 - alt)
   194  				alt /= others
   195  				for b := range w {
   196  					if i == b {
   197  						w[b] *= p
   198  					} else {
   199  						w[b] *= alt
   200  					}
   201  				}
   202  			}
   203  		}
   204  
   205  		var (
   206  			max         = 0.
   207  			sum         float64
   208  			best, count int
   209  		)
   210  		for _, p := range w {
   211  			sum += p
   212  		}
   213  		for i, v := range w {
   214  			if v /= sum; v > max {
   215  				max, best = v, i
   216  				count = 0
   217  			}
   218  			if v == max || math.Abs(max-v) < FloatTolerance {
   219  				count++
   220  			}
   221  		}
   222  
   223  		if count > 1 {
   224  			return alphabet.QLetter{
   225  				L: alpha.Ambiguous(),
   226  				Q: 0,
   227  			}
   228  		}
   229  
   230  		return alphabet.QLetter{
   231  			L: alpha.Letter(best),
   232  			Q: alphabet.Ephred(1 - max),
   233  		}
   234  	}
   235  )
   236  
   237  // Tolerance on float comparison for DefaultQConsensus.
   238  var FloatTolerance float64 = 1e-10