github.com/biogo/biogo@v1.0.4/alphabet/letters.go (about)

     1  // Copyright ©2011-2012 The bíogo Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package alphabet
     6  
     7  import (
     8  	"math"
     9  	"unsafe"
    10  )
    11  
    12  // The Slice interface reflects the built-in slice type behavior.
    13  type Slice interface {
    14  	// Make makes a Slice with the same concrete type as the receiver. Make will
    15  	// panic if len or cap are less than zero or cap is less than len.
    16  	Make(len, cap int) Slice
    17  
    18  	// Len returns the length of the Slice.
    19  	Len() int
    20  
    21  	// Cap returns the capacity of the Slice.
    22  	Cap() int
    23  
    24  	// Slice returns a slice of the Slice. The returned slice may be backed by
    25  	// the same array as the receiver.
    26  	Slice(start, end int) Slice
    27  
    28  	// Append appends src... to the receiver and returns the resulting slice. If the append
    29  	// results in a grow slice the receiver will not reflect the appended slice, so the
    30  	// returned Slice should always be stored. Append should panic if src and the receiver
    31  	// are not the same concrete type.
    32  	Append(src Slice) Slice
    33  
    34  	// Copy copies elements from src into the receiver, returning the number of elements
    35  	// copied. Copy should panic if src and the receiver are not the same concrete type.
    36  	Copy(src Slice) int
    37  }
    38  
    39  // An Encoding represents a quality score encoding scheme.
    40  //                                                                                             Q-range
    41  //
    42  //  Sanger         !"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHI···                                 0 - 40
    43  //  Solexa                                 ··;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefgh··· -5 - 40
    44  //  Illumina 1.3+                                 @ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefgh···  0 - 40
    45  //  Illumina 1.5+                                 xxḆCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefgh···  3 - 40
    46  //  Illumina 1.8+  !"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJ···                                0 - 40
    47  //
    48  //                 !"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefgh··· ···{|}~
    49  //                 |                         |    |        |                              |          |
    50  //                33                        59   64       73                            104        126
    51  //
    52  //  Q-range for typical raw reads
    53  type Encoding int8
    54  
    55  const (
    56  	None        Encoding = iota - 1 // All letters are decoded as scores with p(Error) = NaN.
    57  	Sanger                          // Phred+33
    58  	Solexa                          // Solexa+64
    59  	Illumina1_3                     // Phred+64
    60  	Illumina1_5                     // Phred+64 0,1=unused, 2=Read Segment Quality Control Indicator (Ḇ)
    61  	Illumina1_8                     // Phred+33
    62  	Illumina1_9                     // Phred+33
    63  )
    64  
    65  // DecodeToPhred interprets the byte q as an e encoded quality and returns the corresponding Phred score.
    66  func (e Encoding) DecodeToQphred(q byte) Qphred {
    67  	switch e {
    68  	case Sanger, Illumina1_8, Illumina1_9:
    69  		return Qphred(q) - 33
    70  	case Illumina1_3, Illumina1_5:
    71  		return Qphred(q) - 64
    72  	case Solexa:
    73  		return (Qsolexa(q) - 64).Qphred()
    74  	case None:
    75  		return 0xff
    76  	default:
    77  		panic("alphabet: illegal encoding")
    78  	}
    79  }
    80  
    81  // DecodeToPhred interprets the byte q as an e encoded quality and returns the corresponding Solexa score.
    82  func (e Encoding) DecodeToQsolexa(q byte) Qsolexa {
    83  	switch e {
    84  	case Sanger, Illumina1_8, Illumina1_9:
    85  		return (Qphred(q) - 33).Qsolexa()
    86  	case Illumina1_3, Illumina1_5:
    87  		return (Qphred(q) - 64).Qsolexa()
    88  	case Solexa:
    89  		return Qsolexa(q) - 64
    90  	case None:
    91  		return -128
    92  	default:
    93  		panic("alphabet: illegal encoding")
    94  	}
    95  }
    96  
    97  // A Letter represents a sequence letter.
    98  type Letter byte
    99  
   100  const logThreshL = 2e2 // Approximate count where range loop becomes slower than copy
   101  
   102  // Repeat a Letter count times.
   103  func (l Letter) Repeat(count int) []Letter {
   104  	r := make([]Letter, count)
   105  	switch {
   106  	case count == 0:
   107  	case count < logThreshL:
   108  		for i := range r {
   109  			r[i] = l
   110  		}
   111  	default:
   112  		r[0] = l
   113  		for i := 1; i < len(r); {
   114  			i += copy(r[i:], r[:i])
   115  		}
   116  	}
   117  
   118  	return r
   119  }
   120  
   121  // BytesToLetters converts a []byte to a []Letter.
   122  func BytesToLetters(b []byte) []Letter { return *(*[]Letter)(unsafe.Pointer(&b)) }
   123  
   124  // LettersToBytes converts a []Letter to a []byte.
   125  func LettersToBytes(l []Letter) []byte { return *(*[]byte)(unsafe.Pointer(&l)) }
   126  
   127  // A Letters is a slice of Letter that satisfies the Slice interface.
   128  type Letters []Letter
   129  
   130  func (l Letters) Make(len, cap int) Slice    { return make(Letters, len, cap) }
   131  func (l Letters) Len() int                   { return len(l) }
   132  func (l Letters) Cap() int                   { return cap(l) }
   133  func (l Letters) Slice(start, end int) Slice { return l[start:end] }
   134  func (l Letters) Append(src Slice) Slice     { return append(l, src.(Letters)...) }
   135  func (l Letters) Copy(src Slice) int         { return copy(l, src.(Letters)) }
   136  func (l Letters) String() string             { return string(LettersToBytes(l)) }
   137  
   138  // A Columns is a slice of []Letter that satisfies the alphabet.Slice interface.
   139  type Columns [][]Letter
   140  
   141  // Make makes a QColumns with the cap and len for each column set to the number of rows of the
   142  // receiver.
   143  func (lc Columns) Make(len, cap int) Slice {
   144  	r := lc.Rows()
   145  	return make(Columns, len, cap).MakeRows(r, r)
   146  }
   147  
   148  // MakeRows makes a column with len and cap for each column of the receiver and returns the receiver.
   149  func (lc Columns) MakeRows(len, cap int) Slice {
   150  	for i := range lc {
   151  		lc[i] = make([]Letter, len, cap)
   152  	}
   153  	return lc
   154  }
   155  
   156  // Rows returns the number of positions in each column.
   157  func (lc Columns) Rows() int { return len(lc[0]) }
   158  
   159  func (lc Columns) Len() int                   { return len(lc) }
   160  func (lc Columns) Cap() int                   { return cap(lc) }
   161  func (lc Columns) Slice(start, end int) Slice { return lc[start:end] }
   162  func (lc Columns) Append(a Slice) Slice {
   163  	// TODO deep copy the columns.
   164  	return append(lc, a.(Columns)...)
   165  }
   166  func min(a, b int) int {
   167  	if a < b {
   168  		return a
   169  	}
   170  	return b
   171  }
   172  func (lc Columns) Copy(a Slice) int {
   173  	ac := a.(Columns)
   174  	var n int
   175  	for i, src := range ac[:min(len(lc), len(ac))] {
   176  		n += copy(lc[i], src)
   177  	}
   178  	return n
   179  }
   180  
   181  // A QLetter represents a sequence letter with an associated quality score.
   182  type QLetter struct {
   183  	L Letter
   184  	Q Qphred
   185  }
   186  
   187  const logThreshQL = 1e2 // Approximate count where range loop becomes slower than copy
   188  
   189  // Repeat a QLetter count times.
   190  func (ql QLetter) Repeat(count int) []QLetter {
   191  	r := make([]QLetter, count)
   192  	switch {
   193  	case count == 0:
   194  	case count < logThreshQL:
   195  		for i := range r {
   196  			r[i] = ql
   197  		}
   198  	default:
   199  		r[0] = ql
   200  		for i := 1; i < len(r); {
   201  			i += copy(r[i:], r[:i])
   202  		}
   203  	}
   204  
   205  	return r
   206  }
   207  
   208  func (ql QLetter) String() string { return string(ql.L) }
   209  
   210  // A QLetters is a slice of QLetter that satisfies the Slice interface.
   211  type QLetters []QLetter
   212  
   213  func (ql QLetters) Make(len, cap int) Slice    { return make(QLetters, len, cap) }
   214  func (ql QLetters) Len() int                   { return len(ql) }
   215  func (ql QLetters) Cap() int                   { return cap(ql) }
   216  func (ql QLetters) Slice(start, end int) Slice { return ql[start:end] }
   217  func (ql QLetters) Append(src Slice) Slice     { return append(ql, src.(QLetters)...) }
   218  func (ql QLetters) Copy(src Slice) int         { return copy(ql, src.(QLetters)) }
   219  
   220  // A QColumns is a slice of []QLetter that satisfies the Slice interface.
   221  type QColumns [][]QLetter
   222  
   223  // Make makes a QColumns with the cap and len for each column set to the number of rows of the
   224  // receiver.
   225  func (qc QColumns) Make(len, cap int) Slice {
   226  	r := qc.Rows()
   227  	return make(QColumns, len, cap).MakeRows(r, r)
   228  }
   229  
   230  // MakeRows makes a column with len and cap for each column of the receiver and returns the receiver.
   231  func (qc QColumns) MakeRows(len, cap int) Slice {
   232  	for i := range qc {
   233  		qc[i] = make([]QLetter, len, cap)
   234  	}
   235  	return qc
   236  }
   237  
   238  // Rows returns the number of positions in each column.
   239  func (qc QColumns) Rows() int { return len(qc[0]) }
   240  
   241  func (qc QColumns) Len() int                   { return len(qc) }
   242  func (qc QColumns) Cap() int                   { return cap(qc) }
   243  func (qc QColumns) Slice(start, end int) Slice { return qc[start:end] }
   244  func (qc QColumns) Append(a Slice) Slice {
   245  	// TODO deep copy the columns.
   246  	return append(qc, a.(QColumns)...)
   247  }
   248  
   249  func (qc QColumns) Copy(a Slice) int {
   250  	ac := a.(QColumns)
   251  	var n int
   252  	for i, src := range ac[:min(len(qc), len(ac))] {
   253  		n += copy(qc[i], src)
   254  	}
   255  	return n
   256  }
   257  
   258  // A Qscore represents a quality score.
   259  type Qscore interface {
   260  	ProbE() float64
   261  	Encode(Encoding) byte
   262  	String() string
   263  }
   264  
   265  var nan = math.NaN()
   266  
   267  // A Qphred represents a Phred quality score.
   268  type Qphred byte
   269  
   270  // Ephred returns the Qphred for a error probability p.
   271  func Ephred(p float64) Qphred {
   272  	if p == 0 {
   273  		return 254
   274  	}
   275  	if math.IsNaN(p) {
   276  		return 255
   277  	}
   278  	Q := -10 * math.Log10(p)
   279  	Q += 0.5
   280  	if Q > 254 {
   281  		Q = 254
   282  	}
   283  	return Qphred(Q)
   284  }
   285  
   286  // ProbE returns the error probability for the receiver's Phred value.
   287  func (qp Qphred) ProbE() float64 {
   288  	return phredETable[qp]
   289  }
   290  
   291  // phredETable holds a lookup for phred E values.
   292  var phredETable = func() [256]float64 {
   293  	t := [256]float64{254: 0, 255: nan}
   294  	for q := range t[:254] {
   295  		t[q] = math.Pow(10, -(float64(q) / 10))
   296  	}
   297  	return t
   298  }()
   299  
   300  // Qsolexa converts the quality value from Phred to Solexa. This conversion is lossy and
   301  // should be avoided; the epsilon on the E value associated with a converted Qsolexa is
   302  // bounded approximately by math.Pow(10, 1e-4-float64(qp)/10) over the range 0 < qp < 127.
   303  func (qp Qphred) Qsolexa() Qsolexa { return phredSolexaTable[qp] }
   304  
   305  // phredSolexaTable holds a lookup for the near equivalent solexa score of a phred score.
   306  var phredSolexaTable = func() [256]Qsolexa {
   307  	t := [256]Qsolexa{254: 127, 255: -128}
   308  	for q := range t[:254] {
   309  		Q := 10 * math.Log10(math.Pow(10, float64(q)/10)-1)
   310  		if Q > 0 {
   311  			Q += 0.5
   312  		} else {
   313  			Q -= 0.5
   314  		}
   315  		if Q > 127 {
   316  			Q = 127
   317  		}
   318  		t[q] = Qsolexa(Q)
   319  	}
   320  	return t
   321  }()
   322  
   323  // Encode encodes the receiver's Phred score to a byte based on the specified encoding.
   324  func (qp Qphred) Encode(e Encoding) (q byte) {
   325  	if qp == 254 {
   326  		return '~'
   327  	}
   328  	if qp == 255 {
   329  		return ' '
   330  	}
   331  	switch e {
   332  	case Sanger, Illumina1_8, Illumina1_9:
   333  		q = byte(qp)
   334  		if q <= 93 {
   335  			q += 33
   336  		}
   337  	case Illumina1_3:
   338  		q = byte(qp)
   339  		if q <= 62 {
   340  			q += 64
   341  		}
   342  	case Illumina1_5:
   343  		q = byte(qp)
   344  		if q <= 62 {
   345  			q += 64
   346  		}
   347  		if q < 'B' {
   348  			q = 'B'
   349  		}
   350  		return q
   351  	case Solexa:
   352  		q = byte(qp.Qsolexa())
   353  		if q <= 62 {
   354  			q += 64
   355  		}
   356  	case None:
   357  		return ' '
   358  	}
   359  
   360  	return
   361  }
   362  
   363  func (qp Qphred) String() string {
   364  	if qp < 254 {
   365  		return string([]byte{byte(qp)})
   366  	} else if qp == 255 {
   367  		return " "
   368  	}
   369  	return "\u221e"
   370  }
   371  
   372  // A Qsolexa represents a Solexa quality score.
   373  type Qsolexa int8
   374  
   375  // Esolexa returns the Qsolexa for a error probability p.
   376  func Esolexa(p float64) Qsolexa {
   377  	if p == 0 {
   378  		return 127
   379  	}
   380  	if math.IsNaN(p) {
   381  		return -128
   382  	}
   383  	Q := -10 * math.Log10(p/(1-p))
   384  	if Q > 0 {
   385  		Q += 0.5
   386  	} else {
   387  		Q -= 0.5
   388  	}
   389  	return Qsolexa(Q)
   390  }
   391  
   392  // ProbE returns the error probability for the receiver's Phred value.
   393  func (qs Qsolexa) ProbE() float64 { return solexaETable[int(qs)+128] }
   394  
   395  // solexaETable holds a translated lookup table for solexa E values. Since solexa
   396  // scores can extend into negative territory, the table is shifted 128 into the
   397  // positive.
   398  var solexaETable = func() [256]float64 {
   399  	t := [256]float64{0: nan, 255: 0}
   400  	for q := range t[1:255] {
   401  		pq := math.Pow(10, -(float64(q-127) / 10))
   402  		t[q+1] = pq / (1 + pq)
   403  	}
   404  	return t
   405  }()
   406  
   407  // Qphred converts the quality value from Solexa to Phred. This conversion is lossy and
   408  // should be avoided; the epsilon on the E value associated with a converted Qphred is
   409  // bounded approximately by math.Pow(10, 1e-4-float64(qs)/10) over the range 0 < qs < 127.
   410  func (qs Qsolexa) Qphred() Qphred { return solexaPhredTable[int(qs)+128] }
   411  
   412  // solexaPhredTable holds a lookup for the near equivalent phred score of a solexa
   413  // score. Since solexa scores can extend into negative territory, the table is
   414  // shifted 128 into the positive.
   415  var solexaPhredTable = func() [256]Qphred {
   416  	t := [256]Qphred{0: 255, 255: 0}
   417  	for q := range t[1:255] {
   418  		qs := q - 127
   419  		Q := Qphred(10*math.Log10(math.Pow(10, float64(qs)/10)) + 0.5)
   420  		if Q > 254 {
   421  			Q = 254
   422  		}
   423  		t[q+1] = Q
   424  	}
   425  	return t
   426  }()
   427  
   428  // Encode encodes the receiver's Solexa score to a byte based on the specified encoding.
   429  func (qs Qsolexa) Encode(e Encoding) (q byte) {
   430  	if qs == 127 {
   431  		return '~'
   432  	}
   433  	if qs == -128 {
   434  		return ' '
   435  	}
   436  	switch e {
   437  	case Sanger, Illumina1_8:
   438  		q = byte(qs.Qphred())
   439  		if q <= 93 {
   440  			q += 33
   441  		}
   442  	case Illumina1_3:
   443  		q = byte(qs.Qphred())
   444  		if q <= 62 {
   445  			q += 64
   446  		}
   447  	case Illumina1_5:
   448  		q = byte(qs.Qphred())
   449  		if q <= 62 {
   450  			q += 64
   451  		}
   452  		if q < 'B' {
   453  			q = 'B'
   454  		}
   455  	case Solexa:
   456  		q = byte(qs)
   457  		if q <= 62 {
   458  			q += 64
   459  		}
   460  	case None:
   461  		return ' '
   462  	}
   463  
   464  	return
   465  }
   466  
   467  func (qs Qsolexa) String() string {
   468  	if qs < 127 && qs != -128 {
   469  		return string([]byte{byte(qs)})
   470  	} else if qs == -128 {
   471  		return " "
   472  	}
   473  	return "\u221e"
   474  }