github.com/biogo/biogo@v1.0.4/seq/alignment/qalignment.go (about) 1 // Copyright ©2011-2013 The bíogo Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 package alignment 6 7 import ( 8 "github.com/biogo/biogo/alphabet" 9 "github.com/biogo/biogo/feat" 10 "github.com/biogo/biogo/seq" 11 "github.com/biogo/biogo/seq/linear" 12 "github.com/biogo/biogo/util" 13 14 "errors" 15 "fmt" 16 "strings" 17 "unicode" 18 ) 19 20 // A QSeq is an aligned sequence with quality scores. 21 type QSeq struct { 22 seq.Annotation 23 SubAnnotations []seq.Annotation 24 Seq alphabet.QColumns 25 ColumnConsense seq.ConsenseFunc 26 Threshold alphabet.Qphred // Threshold for returning valid letter. 27 QFilter seq.QFilter // How to represent below threshold letter. 28 Encode alphabet.Encoding 29 } 30 31 // NewSeq creates a new Seq with the given id, letter sequence and alphabet. 32 func NewQSeq(id string, subids []string, ql [][]alphabet.QLetter, alpha alphabet.Alphabet, enc alphabet.Encoding, cons seq.ConsenseFunc) (*QSeq, error) { 33 var ( 34 lids, lseq = len(subids), len(ql) 35 subann []seq.Annotation 36 ) 37 switch { 38 case lids == 0 && len(ql) == 0: 39 case lseq != 0 && lids == len(ql[0]): 40 if lids == 0 { 41 subann = make([]seq.Annotation, len(ql[0])) 42 for i := range subids { 43 subann[i].ID = fmt.Sprintf("%s:%d", id, i) 44 } 45 } else { 46 subann = make([]seq.Annotation, lids) 47 for i, sid := range subids { 48 subann[i].ID = sid 49 } 50 } 51 default: 52 return nil, errors.New("alignment: id/seq number mismatch") 53 } 54 55 return &QSeq{ 56 Annotation: seq.Annotation{ 57 ID: id, 58 Alpha: alpha, 59 }, 60 SubAnnotations: subann, 61 Seq: append([][]alphabet.QLetter(nil), ql...), 62 Encode: enc, 63 ColumnConsense: cons, 64 Threshold: 2, 65 QFilter: seq.AmbigFilter, 66 }, nil 67 } 68 69 // Interface guarantees 70 var ( 71 _ feat.Feature = (*QSeq)(nil) 72 _ feat.Feature = QRow{} 73 _ seq.Sequence = QRow{} 74 _ seq.Scorer = QRow{} 75 ) 76 77 // Slice returns the sequence data as a alphabet.Slice. 78 func (s *QSeq) Slice() alphabet.Slice { return s.Seq } 79 80 // SetSlice sets the sequence data represented by the Seq. SetSlice will panic if sl 81 // is not a QColumns. 82 func (s *QSeq) SetSlice(sl alphabet.Slice) { s.Seq = sl.(alphabet.QColumns) } 83 84 // Encoding returns the quality encoding scheme. 85 func (s *QSeq) Encoding() alphabet.Encoding { return s.Encode } 86 87 // SetEncoding sets the quality encoding scheme to e. 88 func (s *QSeq) SetEncoding(e alphabet.Encoding) error { s.Encode = e; return nil } 89 90 // Len returns the length of the alignment. 91 func (s *QSeq) Len() int { return len(s.Seq) } 92 93 // Rows returns the number of rows in the alignment. 94 func (s *QSeq) Rows() int { return s.Seq.Rows() } 95 96 // Start returns the start position of the sequence in coordinates relative to the 97 // sequence location. 98 func (s *QSeq) Start() int { return s.Offset } 99 100 // End returns the end position of the sequence in coordinates relative to the 101 // sequence location. 102 func (s *QSeq) End() int { return s.Offset + s.Len() } 103 104 // Clone returns a copy of the sequence. 105 func (s *QSeq) Clone() seq.Rower { 106 c := *s 107 c.Seq = make([][]alphabet.QLetter, len(s.Seq)) 108 for i, s := range s.Seq { 109 c.Seq[i] = append([]alphabet.QLetter(nil), s...) 110 } 111 112 return &c 113 } 114 115 // New returns an empty *QSeq sequence with the same alphabet. 116 func (s *QSeq) New() *QSeq { 117 return &QSeq{Annotation: seq.Annotation{Alpha: s.Alpha}} 118 } 119 120 // RevComp reverse complements the sequence. RevComp will panic if the alphabet used by 121 // the receiver is not a Complementor. 122 func (s *QSeq) RevComp() { 123 rs, comp := s.Seq, s.Alpha.(alphabet.Complementor).ComplementTable() 124 i, j := 0, len(rs)-1 125 for ; i < j; i, j = i+1, j-1 { 126 for r := range rs[i] { 127 rs[i][r].L, rs[j][r].L = comp[rs[j][r].L], comp[rs[i][r].L] 128 rs[i][r].Q, rs[j][r].Q = rs[j][r].Q, rs[i][r].Q 129 } 130 } 131 if i == j { 132 for r := range rs[i] { 133 rs[i][r].L = comp[rs[i][r].L] 134 } 135 } 136 s.Strand = -s.Strand 137 } 138 139 // Reverse reverses the order of letters in the the sequence without complementing them. 140 func (s *QSeq) Reverse() { 141 l := s.Seq 142 for i, j := 0, len(l)-1; i < j; i, j = i+1, j-1 { 143 l[i], l[j] = l[j], l[i] 144 } 145 s.Strand = seq.None 146 } 147 148 func (s *QSeq) String() string { 149 t := s.Consensus(false) 150 t.Threshold = s.Threshold 151 t.QFilter = s.QFilter 152 return t.String() 153 } 154 155 // Add sequences n to Alignment. Sequences in n must align start and end with the receiving alignment. 156 // Additional sequence will be clipped. 157 func (s *QSeq) Add(n ...seq.Sequence) error { 158 for i := s.Start(); i < s.End(); i++ { 159 s.Seq[i] = append(s.Seq[i], s.column(n, i)...) 160 } 161 for i := range n { 162 s.SubAnnotations = append(s.SubAnnotations, *n[i].CloneAnnotation()) 163 } 164 165 return nil 166 } 167 168 func (s *QSeq) column(m []seq.Sequence, pos int) []alphabet.QLetter { 169 c := make([]alphabet.QLetter, 0, s.Rows()) 170 171 for _, r := range m { 172 if a, ok := r.(seq.Aligned); ok { 173 if a.Start() <= pos && pos < a.End() { 174 c = append(c, a.ColumnQL(pos, true)...) 175 } else { 176 c = append(c, alphabet.QLetter{L: s.Alpha.Gap()}.Repeat(a.Rows())...) 177 } 178 } else { 179 if r.Start() <= pos && pos < r.End() { 180 c = append(c, r.At(pos)) 181 } else { 182 c = append(c, alphabet.QLetter{L: s.Alpha.Gap()}) 183 } 184 } 185 } 186 187 return c 188 } 189 190 // Delete removes the sequence represented at row i of the alignment. It panics if i is out of range. 191 func (s *QSeq) Delete(i int) { 192 if i >= s.Rows() { 193 panic("alignment: index out of range") 194 } 195 cs := s.Seq 196 for j, c := range cs { 197 cs[j] = c[:i+copy(c[i:], c[i+1:])] 198 } 199 sa := s.SubAnnotations 200 s.SubAnnotations = sa[:i+copy(sa[i:], sa[i+1:])] 201 } 202 203 // Row returns the sequence represented at row i of the alignment. It panics is i is out of range. 204 func (s *QSeq) Row(i int) seq.Sequence { 205 if i < 0 || i >= s.Rows() { 206 panic("alignment: index out of range") 207 } 208 return QRow{Align: s, Row: i} 209 } 210 211 // AppendColumns appends each Qletter of each element of a to the appropriate sequence in the receiver. 212 func (s *QSeq) AppendColumns(a ...[]alphabet.QLetter) error { 213 for i, c := range a { 214 if len(c) != s.Rows() { 215 return fmt.Errorf("alignment: column %d does not match Rows(): %d != %d.", i, len(c), s.Rows()) 216 } 217 } 218 219 s.Seq = append(s.Seq, a...) 220 221 return nil 222 } 223 224 // AppendEach appends each []alphabet.QLetter in a to the appropriate sequence in the receiver. 225 func (s *QSeq) AppendEach(a [][]alphabet.QLetter) error { 226 if len(a) != s.Rows() { 227 return fmt.Errorf("alignment: number of sequences does not match Rows(): %d != %d.", len(a), s.Rows()) 228 } 229 max := util.MinInt 230 for _, r := range a { 231 if l := len(r); l > max { 232 max = l 233 } 234 } 235 s.Seq = append(s.Seq, make([][]alphabet.QLetter, max)...)[:len(s.Seq)] 236 for i, b := 0, make([]alphabet.QLetter, 0, len(a)); i < max; i, b = i+1, b[:0] { 237 for _, r := range a { 238 if i < len(r) { 239 b = append(b, r[i]) 240 } else { 241 b = append(b, alphabet.QLetter{L: s.Alpha.Gap()}) 242 } 243 } 244 s.AppendColumns(b) 245 } 246 247 return nil 248 } 249 250 // Column returns a slice of letters reflecting the column at pos. 251 func (s *QSeq) Column(pos int, _ bool) []alphabet.Letter { 252 c := make([]alphabet.Letter, s.Rows()) 253 for i, l := range s.Seq[pos] { 254 if l.Q >= s.Threshold { 255 c[i] = l.L 256 } else { 257 c[i] = s.QFilter(s.Alpha, 255, alphabet.QLetter{}) 258 } 259 } 260 261 return c 262 } 263 264 // ColumnQL returns a slice of quality letters reflecting the column at pos. 265 func (s *QSeq) ColumnQL(pos int, _ bool) []alphabet.QLetter { return s.Seq[pos] } 266 267 // Consensus returns a quality sequence reflecting the consensus of the receiver determined by the 268 // ColumnConsense field. 269 func (s *QSeq) Consensus(_ bool) *linear.QSeq { 270 cs := make([]alphabet.QLetter, 0, s.Len()) 271 alpha := s.Alphabet() 272 for i := range s.Seq { 273 cs = append(cs, s.ColumnConsense(s, alpha, i, false)) 274 } 275 276 qs := linear.NewQSeq("Consensus:"+s.ID, cs, s.Alpha, alphabet.Sanger) 277 qs.Strand = s.Strand 278 qs.SetOffset(s.Offset) 279 qs.Conform = s.Conform 280 281 return qs 282 } 283 284 // Format is a support routine for fmt.Formatter. It accepts the formats 'v' and 's' 285 // (string), 'a' (fasta) and 'q' (fastq). String, fasta and fastq formats support 286 // truncated output via the verb's precision. Fasta format supports sequence line 287 // specification via the verb's width field. Fastq format supports optional inclusion 288 // of the '+' line descriptor line with the '+' flag. The 'v' verb supports the '#' 289 // flag for Go syntax output. The 's' and 'v' formats support the '-' flag for 290 // omission of the sequence name. 291 func (s *QSeq) Format(fs fmt.State, c rune) { 292 if s == nil { 293 fmt.Fprint(fs, "<nil>") 294 return 295 } 296 switch c { 297 case 'v': 298 if fs.Flag('#') { 299 fmt.Fprintf(fs, "&%#v", *s) 300 return 301 } 302 fallthrough 303 case 's', 'a', 'q': 304 r := QRow{Align: s} 305 for r.Row = 0; r.Row < s.Rows(); r.Row++ { 306 r.Format(fs, c) 307 if r.Row < s.Rows()-1 { 308 fmt.Fprintln(fs) 309 } 310 } 311 default: 312 fmt.Fprintf(fs, "%%!%c(*alignment.QSeq=%.10s)", c, s) 313 } 314 } 315 316 // A Row is a pointer into an alignment that satisfies the seq.Sequence and seq.Scorer interfaces. 317 type QRow struct { 318 Align *QSeq 319 Row int 320 } 321 322 // At returns the letter at position i. 323 func (r QRow) At(i int) alphabet.QLetter { 324 return r.Align.Seq[i-r.Align.Offset][r.Row] 325 } 326 327 // Set sets the letter at position i to l. 328 func (r QRow) Set(i int, l alphabet.QLetter) error { 329 r.Align.Seq[i-r.Align.Offset][r.Row] = l 330 return nil 331 } 332 333 // Len returns the length of the alignment. 334 func (r QRow) Len() int { return len(r.Align.Seq) } 335 336 // Start returns the start position of the sequence in coordinates relative to the 337 // sequence location. 338 func (r QRow) Start() int { return r.Align.SubAnnotations[r.Row].Offset } 339 340 // End returns the end position of the sequence in coordinates relative to the 341 // sequence location. 342 func (r QRow) End() int { return r.Start() + r.Len() } 343 344 // Location returns the feature containing the row's sequence. 345 func (r QRow) Location() feat.Feature { return r.Align.SubAnnotations[r.Row].Loc } 346 347 // SetE sets the quality at position i to e to reflect the given p(Error). 348 func (r QRow) SetE(i int, e float64) error { 349 r.Align.Seq[i-r.Align.Offset][r.Row].Q = alphabet.Ephred(e) 350 return nil 351 } 352 353 // QEncode encodes the quality at position i to a letter based on the sequence encoding setting. 354 func (r QRow) QEncode(i int) byte { 355 return r.Align.Seq[i-r.Align.Offset][r.Row].Q.Encode(r.Encoding()) 356 } 357 358 // EAt returns the probability of a sequence error at position i. 359 func (r QRow) EAt(i int) float64 { 360 return r.Align.Seq[i-r.Align.Offset][r.Row].Q.ProbE() 361 } 362 363 func (r QRow) Alphabet() alphabet.Alphabet { return r.Align.Alpha } 364 func (r QRow) Encoding() alphabet.Encoding { return r.Align.Encoding() } 365 func (r QRow) SetEncoding(e alphabet.Encoding) error { return r.Align.SetEncoding(e) } 366 func (r QRow) Conformation() feat.Conformation { return r.Align.SubAnnotations[r.Row].Conform } 367 func (r QRow) SetConformation(c feat.Conformation) error { 368 r.Align.SubAnnotations[r.Row].Conform = c 369 return nil 370 } 371 func (r QRow) Name() string { return r.Align.SubAnnotations[r.Row].ID } 372 func (r QRow) Description() string { return r.Align.SubAnnotations[r.Row].Desc } 373 func (r QRow) SetOffset(o int) error { r.Align.SubAnnotations[r.Row].Offset = o; return nil } 374 375 func (r QRow) RevComp() { 376 rs, comp := r.Align.Seq, r.Alphabet().(alphabet.Complementor).ComplementTable() 377 i, j := 0, len(rs)-1 378 for ; i < j; i, j = i+1, j-1 { 379 rs[i][r.Row].L, rs[j][r.Row].L = comp[rs[j][r.Row].L], comp[rs[i][r.Row].L] 380 rs[i][r.Row].Q, rs[j][r.Row].Q = rs[j][r.Row].Q, rs[i][r.Row].Q 381 } 382 if i == j { 383 rs[i][r.Row].L = comp[rs[i][r.Row].L] 384 } 385 r.Align.SubAnnotations[r.Row].Strand = -r.Align.SubAnnotations[r.Row].Strand 386 } 387 func (r QRow) Reverse() { 388 l := r.Align.Seq 389 for i, j := 0, len(l)-1; i < j; i, j = i+1, j-1 { 390 l[i][r.Row], l[j][r.Row] = l[j][r.Row], l[i][r.Row] 391 } 392 r.Align.SubAnnotations[r.Row].Strand = seq.None 393 } 394 func (r QRow) New() seq.Sequence { 395 return QRow{Align: &QSeq{Annotation: seq.Annotation{Alpha: r.Align.Alpha}}} 396 } 397 func (r QRow) Clone() seq.Sequence { 398 b := make([]alphabet.QLetter, r.Len()) 399 for i, c := range r.Align.Seq { 400 b[i] = c[r.Row] 401 } 402 return linear.NewQSeq(r.Name(), b, r.Alphabet(), r.Align.Encoding()) 403 } 404 func (r QRow) CloneAnnotation() *seq.Annotation { 405 return r.Align.SubAnnotations[r.Row].CloneAnnotation() 406 } 407 408 func min(a, b int) int { 409 if a < b { 410 return a 411 } 412 return b 413 } 414 415 // String returns a string representation of the sequence data only. 416 func (r QRow) String() string { return fmt.Sprintf("%-s", r) } 417 418 // Format is a support routine for fmt.Formatter. It accepts the formats 'v' and 's' 419 // (string), 'a' (fasta) and 'q' (fastq). String, fasta and fastq formats support 420 // truncated output via the verb's precision. Fasta format supports sequence line 421 // specification via the verb's width field. Fastq format supports optional inclusion 422 // of the '+' line descriptor line with the '+' flag. The 'v' verb supports the '#' 423 // flag for Go syntax output. The 's' and 'v' formats support the '-' flag for 424 // omission of the sequence name. 425 func (r QRow) Format(fs fmt.State, c rune) { 426 var ( 427 s = r.Align 428 w, wOk = fs.Width() 429 p, pOk = fs.Precision() 430 buf alphabet.QColumns 431 ) 432 if s != nil { 433 if pOk { 434 buf = s.Seq[:min(p, len(s.Seq))] 435 } else { 436 buf = s.Seq 437 } 438 } 439 440 switch c { 441 case 'v': 442 if fs.Flag('#') { 443 type shadowQRow QRow 444 sr := fmt.Sprintf("%#v", shadowQRow(r)) 445 fmt.Fprintf(fs, "%T%s", r, sr[strings.Index(sr, "{"):]) 446 return 447 } 448 fallthrough 449 case 's': 450 if s == nil { 451 fmt.Fprint(fs, "<nil>") 452 return 453 } 454 if !fs.Flag('-') { 455 fmt.Fprintf(fs, "%q ", r.Name()) 456 } 457 for _, qc := range buf { 458 fmt.Fprintf(fs, "%c", s.QFilter(s.Alpha, s.Threshold, qc[r.Row])) 459 } 460 if pOk && p < s.Len() { 461 fmt.Fprint(fs, "...") 462 } 463 case 'a': 464 if s == nil { 465 return 466 } 467 r.formatDescLineTo(fs, '>') 468 for i, qc := range buf { 469 fmt.Fprintf(fs, "%c", s.QFilter(s.Alpha, s.Threshold, qc[r.Row])) 470 if wOk && i < s.Len()-1 && i%w == w-1 { 471 fmt.Fprintln(fs) 472 } 473 } 474 if pOk && p < s.Len() { 475 fmt.Fprint(fs, "...") 476 } 477 case 'q': 478 if s == nil { 479 return 480 } 481 r.formatDescLineTo(fs, '@') 482 for _, qc := range buf { 483 fmt.Fprintf(fs, "%c", qc[r.Row].L) 484 } 485 if pOk && p < s.Len() { 486 fmt.Fprintln(fs, "...") 487 } else { 488 fmt.Fprintln(fs) 489 } 490 if fs.Flag('+') { 491 r.formatDescLineTo(fs, '+') 492 } else { 493 fmt.Fprintln(fs, "+") 494 } 495 for _, qc := range buf { 496 e := qc[r.Row].Q.Encode(s.Encode) 497 if e >= unicode.MaxASCII { 498 e = unicode.MaxASCII - 1 499 } 500 fmt.Fprintf(fs, "%c", e) 501 } 502 if pOk && p < s.Len() { 503 fmt.Fprint(fs, "...") 504 } 505 default: 506 fmt.Fprintf(fs, "%%!%c(alignment.QRow=%.10s)", c, s) 507 } 508 } 509 510 func (r QRow) formatDescLineTo(fs fmt.State, p rune) { 511 fmt.Fprintf(fs, "%c%s", p, r.Name()) 512 if d := r.Description(); d != "" { 513 fmt.Fprintf(fs, " %s", d) 514 } 515 fmt.Fprintln(fs) 516 } 517 518 // SetSlice unconditionally panics. 519 func (r QRow) SetSlice(_ alphabet.Slice) { panic("alignment: cannot alter row slice") } 520 521 // Slice unconditionally panics. 522 func (r QRow) Slice() alphabet.Slice { panic("alignment: cannot get row slice") }