github.com/biogo/biogo@v1.0.4/alphabet/letters.go (about) 1 // Copyright ©2011-2012 The bíogo Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 package alphabet 6 7 import ( 8 "math" 9 "unsafe" 10 ) 11 12 // The Slice interface reflects the built-in slice type behavior. 13 type Slice interface { 14 // Make makes a Slice with the same concrete type as the receiver. Make will 15 // panic if len or cap are less than zero or cap is less than len. 16 Make(len, cap int) Slice 17 18 // Len returns the length of the Slice. 19 Len() int 20 21 // Cap returns the capacity of the Slice. 22 Cap() int 23 24 // Slice returns a slice of the Slice. The returned slice may be backed by 25 // the same array as the receiver. 26 Slice(start, end int) Slice 27 28 // Append appends src... to the receiver and returns the resulting slice. If the append 29 // results in a grow slice the receiver will not reflect the appended slice, so the 30 // returned Slice should always be stored. Append should panic if src and the receiver 31 // are not the same concrete type. 32 Append(src Slice) Slice 33 34 // Copy copies elements from src into the receiver, returning the number of elements 35 // copied. Copy should panic if src and the receiver are not the same concrete type. 36 Copy(src Slice) int 37 } 38 39 // An Encoding represents a quality score encoding scheme. 40 // Q-range 41 // 42 // Sanger !"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHI··· 0 - 40 43 // Solexa ··;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefgh··· -5 - 40 44 // Illumina 1.3+ @ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefgh··· 0 - 40 45 // Illumina 1.5+ xxḆCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefgh··· 3 - 40 46 // Illumina 1.8+ !"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJ··· 0 - 40 47 // 48 // !"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefgh··· ···{|}~ 49 // | | | | | | 50 // 33 59 64 73 104 126 51 // 52 // Q-range for typical raw reads 53 type Encoding int8 54 55 const ( 56 None Encoding = iota - 1 // All letters are decoded as scores with p(Error) = NaN. 57 Sanger // Phred+33 58 Solexa // Solexa+64 59 Illumina1_3 // Phred+64 60 Illumina1_5 // Phred+64 0,1=unused, 2=Read Segment Quality Control Indicator (Ḇ) 61 Illumina1_8 // Phred+33 62 Illumina1_9 // Phred+33 63 ) 64 65 // DecodeToPhred interprets the byte q as an e encoded quality and returns the corresponding Phred score. 66 func (e Encoding) DecodeToQphred(q byte) Qphred { 67 switch e { 68 case Sanger, Illumina1_8, Illumina1_9: 69 return Qphred(q) - 33 70 case Illumina1_3, Illumina1_5: 71 return Qphred(q) - 64 72 case Solexa: 73 return (Qsolexa(q) - 64).Qphred() 74 case None: 75 return 0xff 76 default: 77 panic("alphabet: illegal encoding") 78 } 79 } 80 81 // DecodeToPhred interprets the byte q as an e encoded quality and returns the corresponding Solexa score. 82 func (e Encoding) DecodeToQsolexa(q byte) Qsolexa { 83 switch e { 84 case Sanger, Illumina1_8, Illumina1_9: 85 return (Qphred(q) - 33).Qsolexa() 86 case Illumina1_3, Illumina1_5: 87 return (Qphred(q) - 64).Qsolexa() 88 case Solexa: 89 return Qsolexa(q) - 64 90 case None: 91 return -128 92 default: 93 panic("alphabet: illegal encoding") 94 } 95 } 96 97 // A Letter represents a sequence letter. 98 type Letter byte 99 100 const logThreshL = 2e2 // Approximate count where range loop becomes slower than copy 101 102 // Repeat a Letter count times. 103 func (l Letter) Repeat(count int) []Letter { 104 r := make([]Letter, count) 105 switch { 106 case count == 0: 107 case count < logThreshL: 108 for i := range r { 109 r[i] = l 110 } 111 default: 112 r[0] = l 113 for i := 1; i < len(r); { 114 i += copy(r[i:], r[:i]) 115 } 116 } 117 118 return r 119 } 120 121 // BytesToLetters converts a []byte to a []Letter. 122 func BytesToLetters(b []byte) []Letter { return *(*[]Letter)(unsafe.Pointer(&b)) } 123 124 // LettersToBytes converts a []Letter to a []byte. 125 func LettersToBytes(l []Letter) []byte { return *(*[]byte)(unsafe.Pointer(&l)) } 126 127 // A Letters is a slice of Letter that satisfies the Slice interface. 128 type Letters []Letter 129 130 func (l Letters) Make(len, cap int) Slice { return make(Letters, len, cap) } 131 func (l Letters) Len() int { return len(l) } 132 func (l Letters) Cap() int { return cap(l) } 133 func (l Letters) Slice(start, end int) Slice { return l[start:end] } 134 func (l Letters) Append(src Slice) Slice { return append(l, src.(Letters)...) } 135 func (l Letters) Copy(src Slice) int { return copy(l, src.(Letters)) } 136 func (l Letters) String() string { return string(LettersToBytes(l)) } 137 138 // A Columns is a slice of []Letter that satisfies the alphabet.Slice interface. 139 type Columns [][]Letter 140 141 // Make makes a QColumns with the cap and len for each column set to the number of rows of the 142 // receiver. 143 func (lc Columns) Make(len, cap int) Slice { 144 r := lc.Rows() 145 return make(Columns, len, cap).MakeRows(r, r) 146 } 147 148 // MakeRows makes a column with len and cap for each column of the receiver and returns the receiver. 149 func (lc Columns) MakeRows(len, cap int) Slice { 150 for i := range lc { 151 lc[i] = make([]Letter, len, cap) 152 } 153 return lc 154 } 155 156 // Rows returns the number of positions in each column. 157 func (lc Columns) Rows() int { return len(lc[0]) } 158 159 func (lc Columns) Len() int { return len(lc) } 160 func (lc Columns) Cap() int { return cap(lc) } 161 func (lc Columns) Slice(start, end int) Slice { return lc[start:end] } 162 func (lc Columns) Append(a Slice) Slice { 163 // TODO deep copy the columns. 164 return append(lc, a.(Columns)...) 165 } 166 func min(a, b int) int { 167 if a < b { 168 return a 169 } 170 return b 171 } 172 func (lc Columns) Copy(a Slice) int { 173 ac := a.(Columns) 174 var n int 175 for i, src := range ac[:min(len(lc), len(ac))] { 176 n += copy(lc[i], src) 177 } 178 return n 179 } 180 181 // A QLetter represents a sequence letter with an associated quality score. 182 type QLetter struct { 183 L Letter 184 Q Qphred 185 } 186 187 const logThreshQL = 1e2 // Approximate count where range loop becomes slower than copy 188 189 // Repeat a QLetter count times. 190 func (ql QLetter) Repeat(count int) []QLetter { 191 r := make([]QLetter, count) 192 switch { 193 case count == 0: 194 case count < logThreshQL: 195 for i := range r { 196 r[i] = ql 197 } 198 default: 199 r[0] = ql 200 for i := 1; i < len(r); { 201 i += copy(r[i:], r[:i]) 202 } 203 } 204 205 return r 206 } 207 208 func (ql QLetter) String() string { return string(ql.L) } 209 210 // A QLetters is a slice of QLetter that satisfies the Slice interface. 211 type QLetters []QLetter 212 213 func (ql QLetters) Make(len, cap int) Slice { return make(QLetters, len, cap) } 214 func (ql QLetters) Len() int { return len(ql) } 215 func (ql QLetters) Cap() int { return cap(ql) } 216 func (ql QLetters) Slice(start, end int) Slice { return ql[start:end] } 217 func (ql QLetters) Append(src Slice) Slice { return append(ql, src.(QLetters)...) } 218 func (ql QLetters) Copy(src Slice) int { return copy(ql, src.(QLetters)) } 219 220 // A QColumns is a slice of []QLetter that satisfies the Slice interface. 221 type QColumns [][]QLetter 222 223 // Make makes a QColumns with the cap and len for each column set to the number of rows of the 224 // receiver. 225 func (qc QColumns) Make(len, cap int) Slice { 226 r := qc.Rows() 227 return make(QColumns, len, cap).MakeRows(r, r) 228 } 229 230 // MakeRows makes a column with len and cap for each column of the receiver and returns the receiver. 231 func (qc QColumns) MakeRows(len, cap int) Slice { 232 for i := range qc { 233 qc[i] = make([]QLetter, len, cap) 234 } 235 return qc 236 } 237 238 // Rows returns the number of positions in each column. 239 func (qc QColumns) Rows() int { return len(qc[0]) } 240 241 func (qc QColumns) Len() int { return len(qc) } 242 func (qc QColumns) Cap() int { return cap(qc) } 243 func (qc QColumns) Slice(start, end int) Slice { return qc[start:end] } 244 func (qc QColumns) Append(a Slice) Slice { 245 // TODO deep copy the columns. 246 return append(qc, a.(QColumns)...) 247 } 248 249 func (qc QColumns) Copy(a Slice) int { 250 ac := a.(QColumns) 251 var n int 252 for i, src := range ac[:min(len(qc), len(ac))] { 253 n += copy(qc[i], src) 254 } 255 return n 256 } 257 258 // A Qscore represents a quality score. 259 type Qscore interface { 260 ProbE() float64 261 Encode(Encoding) byte 262 String() string 263 } 264 265 var nan = math.NaN() 266 267 // A Qphred represents a Phred quality score. 268 type Qphred byte 269 270 // Ephred returns the Qphred for a error probability p. 271 func Ephred(p float64) Qphred { 272 if p == 0 { 273 return 254 274 } 275 if math.IsNaN(p) { 276 return 255 277 } 278 Q := -10 * math.Log10(p) 279 Q += 0.5 280 if Q > 254 { 281 Q = 254 282 } 283 return Qphred(Q) 284 } 285 286 // ProbE returns the error probability for the receiver's Phred value. 287 func (qp Qphred) ProbE() float64 { 288 return phredETable[qp] 289 } 290 291 // phredETable holds a lookup for phred E values. 292 var phredETable = func() [256]float64 { 293 t := [256]float64{254: 0, 255: nan} 294 for q := range t[:254] { 295 t[q] = math.Pow(10, -(float64(q) / 10)) 296 } 297 return t 298 }() 299 300 // Qsolexa converts the quality value from Phred to Solexa. This conversion is lossy and 301 // should be avoided; the epsilon on the E value associated with a converted Qsolexa is 302 // bounded approximately by math.Pow(10, 1e-4-float64(qp)/10) over the range 0 < qp < 127. 303 func (qp Qphred) Qsolexa() Qsolexa { return phredSolexaTable[qp] } 304 305 // phredSolexaTable holds a lookup for the near equivalent solexa score of a phred score. 306 var phredSolexaTable = func() [256]Qsolexa { 307 t := [256]Qsolexa{254: 127, 255: -128} 308 for q := range t[:254] { 309 Q := 10 * math.Log10(math.Pow(10, float64(q)/10)-1) 310 if Q > 0 { 311 Q += 0.5 312 } else { 313 Q -= 0.5 314 } 315 if Q > 127 { 316 Q = 127 317 } 318 t[q] = Qsolexa(Q) 319 } 320 return t 321 }() 322 323 // Encode encodes the receiver's Phred score to a byte based on the specified encoding. 324 func (qp Qphred) Encode(e Encoding) (q byte) { 325 if qp == 254 { 326 return '~' 327 } 328 if qp == 255 { 329 return ' ' 330 } 331 switch e { 332 case Sanger, Illumina1_8, Illumina1_9: 333 q = byte(qp) 334 if q <= 93 { 335 q += 33 336 } 337 case Illumina1_3: 338 q = byte(qp) 339 if q <= 62 { 340 q += 64 341 } 342 case Illumina1_5: 343 q = byte(qp) 344 if q <= 62 { 345 q += 64 346 } 347 if q < 'B' { 348 q = 'B' 349 } 350 return q 351 case Solexa: 352 q = byte(qp.Qsolexa()) 353 if q <= 62 { 354 q += 64 355 } 356 case None: 357 return ' ' 358 } 359 360 return 361 } 362 363 func (qp Qphred) String() string { 364 if qp < 254 { 365 return string([]byte{byte(qp)}) 366 } else if qp == 255 { 367 return " " 368 } 369 return "\u221e" 370 } 371 372 // A Qsolexa represents a Solexa quality score. 373 type Qsolexa int8 374 375 // Esolexa returns the Qsolexa for a error probability p. 376 func Esolexa(p float64) Qsolexa { 377 if p == 0 { 378 return 127 379 } 380 if math.IsNaN(p) { 381 return -128 382 } 383 Q := -10 * math.Log10(p/(1-p)) 384 if Q > 0 { 385 Q += 0.5 386 } else { 387 Q -= 0.5 388 } 389 return Qsolexa(Q) 390 } 391 392 // ProbE returns the error probability for the receiver's Phred value. 393 func (qs Qsolexa) ProbE() float64 { return solexaETable[int(qs)+128] } 394 395 // solexaETable holds a translated lookup table for solexa E values. Since solexa 396 // scores can extend into negative territory, the table is shifted 128 into the 397 // positive. 398 var solexaETable = func() [256]float64 { 399 t := [256]float64{0: nan, 255: 0} 400 for q := range t[1:255] { 401 pq := math.Pow(10, -(float64(q-127) / 10)) 402 t[q+1] = pq / (1 + pq) 403 } 404 return t 405 }() 406 407 // Qphred converts the quality value from Solexa to Phred. This conversion is lossy and 408 // should be avoided; the epsilon on the E value associated with a converted Qphred is 409 // bounded approximately by math.Pow(10, 1e-4-float64(qs)/10) over the range 0 < qs < 127. 410 func (qs Qsolexa) Qphred() Qphred { return solexaPhredTable[int(qs)+128] } 411 412 // solexaPhredTable holds a lookup for the near equivalent phred score of a solexa 413 // score. Since solexa scores can extend into negative territory, the table is 414 // shifted 128 into the positive. 415 var solexaPhredTable = func() [256]Qphred { 416 t := [256]Qphred{0: 255, 255: 0} 417 for q := range t[1:255] { 418 qs := q - 127 419 Q := Qphred(10*math.Log10(math.Pow(10, float64(qs)/10)) + 0.5) 420 if Q > 254 { 421 Q = 254 422 } 423 t[q+1] = Q 424 } 425 return t 426 }() 427 428 // Encode encodes the receiver's Solexa score to a byte based on the specified encoding. 429 func (qs Qsolexa) Encode(e Encoding) (q byte) { 430 if qs == 127 { 431 return '~' 432 } 433 if qs == -128 { 434 return ' ' 435 } 436 switch e { 437 case Sanger, Illumina1_8: 438 q = byte(qs.Qphred()) 439 if q <= 93 { 440 q += 33 441 } 442 case Illumina1_3: 443 q = byte(qs.Qphred()) 444 if q <= 62 { 445 q += 64 446 } 447 case Illumina1_5: 448 q = byte(qs.Qphred()) 449 if q <= 62 { 450 q += 64 451 } 452 if q < 'B' { 453 q = 'B' 454 } 455 case Solexa: 456 q = byte(qs) 457 if q <= 62 { 458 q += 64 459 } 460 case None: 461 return ' ' 462 } 463 464 return 465 } 466 467 func (qs Qsolexa) String() string { 468 if qs < 127 && qs != -128 { 469 return string([]byte{byte(qs)}) 470 } else if qs == -128 { 471 return " " 472 } 473 return "\u221e" 474 }