github.com/biogo/biogo@v1.0.4/seq/multi/multi.go (about) 1 // Copyright ©2011-2013 The bíogo Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 // Package multi handles collections of sequences as alignments or sets. 6 package multi 7 8 import ( 9 "bytes" 10 11 "github.com/biogo/biogo/alphabet" 12 "github.com/biogo/biogo/feat" 13 "github.com/biogo/biogo/seq" 14 "github.com/biogo/biogo/seq/linear" 15 "github.com/biogo/biogo/seq/sequtils" 16 "github.com/biogo/biogo/util" 17 18 "errors" 19 "fmt" 20 "reflect" 21 "sort" 22 "strings" 23 "sync" 24 ) 25 26 func init() { 27 joinerRegistryLock = &sync.RWMutex{} 28 joinerRegistry = make(map[reflect.Type]JoinFunc) 29 } 30 31 var ( 32 joinerRegistryLock *sync.RWMutex 33 joinerRegistry map[reflect.Type]JoinFunc 34 ) 35 36 type Multi struct { 37 seq.Annotation 38 Seq []seq.Sequence 39 ColumnConsense seq.ConsenseFunc 40 Encode alphabet.Encoding 41 } 42 43 // Create a new Multi sequence. 44 func NewMulti(id string, n []seq.Sequence, cons seq.ConsenseFunc) (*Multi, error) { 45 var alpha alphabet.Alphabet 46 for _, s := range n { 47 if alpha != nil && s.Alphabet() != alpha { 48 return nil, errors.New("multi: inconsistent alphabets") 49 } else if alpha == nil { 50 alpha = s.Alphabet() 51 } 52 } 53 return &Multi{ 54 Annotation: seq.Annotation{ 55 ID: id, 56 Alpha: alpha, 57 }, 58 Seq: n, 59 ColumnConsense: cons, 60 }, nil 61 } 62 63 // Encoding returns the quality encoding scheme. 64 func (m *Multi) Encoding() alphabet.Encoding { return m.Encode } 65 66 // SetEncoding sets the quality encoding scheme to e. 67 func (m *Multi) SetEncoding(e alphabet.Encoding) { 68 for _, r := range m.Seq { 69 if enc, ok := r.(seq.Scorer); ok { 70 enc.SetEncoding(e) 71 } 72 } 73 m.Encode = e 74 } 75 76 // Len returns the length of the alignment. 77 func (m *Multi) Len() int { 78 var ( 79 min = util.MaxInt 80 max = util.MinInt 81 ) 82 83 for _, r := range m.Seq { 84 if start := r.Start(); start < min { 85 min = start 86 } 87 if end := r.End(); end > max { 88 max = end 89 } 90 } 91 92 return max - min 93 } 94 95 // Rows returns the number of rows in the alignment. 96 func (m *Multi) Rows() int { 97 return len(m.Seq) 98 } 99 100 // SetOffset sets the location-relative offset of the sequence to o. 101 func (m *Multi) SetOffset(o int) error { 102 for _, r := range m.Seq { 103 r.SetOffset(r.Start() - m.Offset + o) 104 } 105 m.Offset = o 106 return nil 107 } 108 109 // Start returns the start position of the sequence in coordinates relative to 110 // the sequence location. 111 func (m *Multi) Start() int { 112 start := util.MaxInt 113 for _, r := range m.Seq { 114 if lt := r.Start(); lt < start { 115 start = lt 116 } 117 } 118 119 return start 120 } 121 122 // End returns the end position of the sequence in coordinates relative to 123 // the sequence location. 124 func (m *Multi) End() int { 125 end := util.MinInt 126 for _, m := range m.Seq { 127 if rt := m.End(); rt > end { 128 end = rt 129 } 130 } 131 132 return end 133 } 134 135 // Clone returns a copy of the sequence. 136 func (m *Multi) Clone() seq.Rower { 137 c := &Multi{} 138 *c = *m 139 c.Seq = make([]seq.Sequence, len(m.Seq)) 140 for i, r := range m.Seq { 141 c.Seq[i] = r.Clone().(seq.Sequence) 142 } 143 144 return c 145 } 146 147 // RevComp reverse complements the sequence. 148 func (m *Multi) RevComp() { 149 end := m.End() 150 for _, r := range m.Seq { 151 r.RevComp() 152 r.SetOffset(end - m.End()) 153 } 154 155 return 156 } 157 158 // Reverse reverses the order of letters in the the sequence without complementing them. 159 func (m *Multi) Reverse() { 160 end := m.End() 161 for _, r := range m.Seq { 162 r.Reverse() 163 r.SetOffset(end - m.End()) 164 } 165 } 166 167 // Conformation returns the sequence conformation. 168 func (m *Multi) Conformation() feat.Conformation { return m.Conform } 169 170 // SetConformation sets the sequence conformation. 171 func (m *Multi) SetConformation(c feat.Conformation) { 172 for _, r := range m.Seq { 173 r.SetConformation(c) 174 } 175 m.Conform = c 176 } 177 178 // Add adds sequences n to the multiple sequence. 179 func (m *Multi) Add(n ...seq.Sequence) error { 180 for _, r := range n { 181 if m.Alpha == nil { 182 m.Alpha = r.Alphabet() 183 continue 184 } else if r.Alphabet() != m.Alpha { 185 return errors.New("multi: inconsistent alphabets") 186 } 187 } 188 m.Seq = append(m.Seq, n...) 189 190 return nil 191 } 192 193 // Delete removes the sequence represented at row i of the alignment. It panics if i is out of range. 194 func (m *Multi) Delete(i int) { 195 m.Seq = m.Seq[:i+copy(m.Seq[i:], m.Seq[i+1:])] 196 } 197 198 // Row returns the sequence represented at row i of the alignment. It panics is i is out of range. 199 func (m *Multi) Row(i int) seq.Sequence { 200 return m.Seq[i] 201 } 202 203 // Append appends a to the ith sequence in the receiver. 204 func (m *Multi) Append(i int, a ...alphabet.QLetter) (err error) { 205 return m.Row(i).(seq.Appender).AppendQLetters(a...) 206 } 207 208 // Append each byte of each a to the appropriate sequence in the receiver. 209 func (m *Multi) AppendColumns(a ...[]alphabet.QLetter) (err error) { 210 for i, c := range a { 211 if len(c) != m.Rows() { 212 return fmt.Errorf("multi: column %d does not match Rows(): %d != %d.", i, len(c), m.Rows()) 213 } 214 } 215 for i, b := 0, make([]alphabet.QLetter, 0, len(a)); i < m.Rows(); i, b = i+1, b[:0] { 216 for _, r := range a { 217 b = append(b, r[i]) 218 } 219 m.Append(i, b...) 220 } 221 222 return 223 } 224 225 // AppendEach appends each []alphabet.QLetter in a to the appropriate sequence in the receiver. 226 func (m *Multi) AppendEach(a [][]alphabet.QLetter) (err error) { 227 if len(a) != m.Rows() { 228 return fmt.Errorf("multi: number of sequences does not match Rows(): %d != %d.", len(a), m.Rows()) 229 } 230 var i int 231 for _, r := range m.Seq { 232 if al, ok := r.(seq.AlignedAppender); ok { 233 row := al.Rows() 234 if al.AppendEach(a[i:i+row]) != nil { 235 panic("internal size mismatch") 236 } 237 i += row 238 } else { 239 r.(seq.Appender).AppendQLetters(a[i]...) 240 i++ 241 } 242 } 243 244 return 245 } 246 247 // Column returns a slice of letters reflecting the column at pos. 248 func (m *Multi) Column(pos int, fill bool) []alphabet.Letter { 249 if pos < m.Start() || pos >= m.End() { 250 panic("multi: index out of range") 251 } 252 253 var c []alphabet.Letter 254 if fill { 255 c = make([]alphabet.Letter, 0, m.Rows()) 256 } else { 257 c = []alphabet.Letter{} 258 } 259 260 for _, r := range m.Seq { 261 if a, ok := r.(seq.Aligned); ok { 262 if a.Start() <= pos && pos < a.End() { 263 c = append(c, a.Column(pos, fill)...) 264 } else if fill { 265 c = append(c, m.Alpha.Gap().Repeat(a.Rows())...) 266 } 267 } else { 268 if r.Start() <= pos && pos < r.End() { 269 c = append(c, r.At(pos).L) 270 } else if fill { 271 c = append(c, m.Alpha.Gap()) 272 } 273 } 274 } 275 276 return c 277 } 278 279 // ColumnQL returns a slice of quality letters reflecting the column at pos. 280 func (m *Multi) ColumnQL(pos int, fill bool) []alphabet.QLetter { 281 if pos < m.Start() || pos >= m.End() { 282 panic("multi: index out of range") 283 } 284 285 var c []alphabet.QLetter 286 if fill { 287 c = make([]alphabet.QLetter, 0, m.Rows()) 288 } else { 289 c = []alphabet.QLetter{} 290 } 291 292 for _, r := range m.Seq { 293 if a, ok := r.(seq.Aligned); ok { 294 if a.Start() <= pos && pos < a.End() { 295 c = append(c, a.ColumnQL(pos, fill)...) 296 } else if fill { 297 c = append(c, alphabet.QLetter{L: m.Alpha.Gap()}.Repeat(a.Rows())...) 298 } 299 } else { 300 if r.Start() <= pos && pos < r.End() { 301 c = append(c, r.At(pos)) 302 } else if fill { 303 c = append(c, alphabet.QLetter{L: m.Alpha.Gap()}) 304 } 305 } 306 } 307 308 return c 309 } 310 311 // IsFlush returns a boolean indicating whether the end specified by where is flush - that is 312 // all the contributing sequences start at the same offset. 313 func (m *Multi) IsFlush(where int) bool { 314 if m.Rows() <= 1 { 315 return true 316 } 317 var start, end int 318 for i, r := range m.Seq { 319 if lt, rt := r.Start(), r.End(); i > 0 && 320 ((lt != start && where&seq.Start != 0) || 321 (rt != end && where&seq.End != 0)) { 322 return false 323 } else if i == 0 { 324 start, end = lt, rt 325 } 326 } 327 return true 328 } 329 330 // Flush fills ragged sequences with the receiver's gap letter so that all sequences are flush. 331 func (m *Multi) Flush(where int, fill alphabet.Letter) { 332 if m.IsFlush(where) { 333 return 334 } 335 336 if where&seq.Start != 0 { 337 start := m.Start() 338 for _, r := range m.Seq { 339 if r.Start()-start < 1 { 340 continue 341 } 342 switch sl := r.Slice().(type) { 343 case alphabet.Letters: 344 r.SetSlice(alphabet.Letters(append(fill.Repeat(r.Start()-start), sl...))) 345 case alphabet.QLetters: 346 r.SetSlice(alphabet.QLetters(append(alphabet.QLetter{L: fill}.Repeat(r.Start()-start), sl...))) 347 } 348 r.SetOffset(start) 349 } 350 } 351 if where&seq.End != 0 { 352 end := m.End() 353 for _, r := range m.Seq { 354 if end-r.End() < 1 { 355 continue 356 } 357 r.(seq.Appender).AppendQLetters(alphabet.QLetter{L: fill}.Repeat(end - r.End())...) 358 } 359 } 360 } 361 362 // Subseq returns a multiple subsequence slice of the receiver. 363 func (m *Multi) Subseq(start, end int) (*Multi, error) { 364 var ns []seq.Sequence 365 366 for _, r := range m.Seq { 367 rs := reflect.New(reflect.TypeOf(r)).Interface().(sequtils.Sliceable) 368 err := sequtils.Truncate(rs, r, start, end) 369 if err != nil { 370 return nil, err 371 } 372 ns = append(ns, rs.(seq.Sequence)) 373 } 374 375 ss := &Multi{} 376 *ss = *m 377 ss.Seq = ns 378 379 return ss, nil 380 } 381 382 // Truncate truncates the the receiver from start to end. 383 func (m *Multi) Truncate(start, end int) error { 384 for _, r := range m.Seq { 385 err := sequtils.Truncate(r, r, start, end) 386 if err != nil { 387 return err 388 } 389 } 390 391 return nil 392 } 393 394 // Join joins a to the receiver at the end specied by where. 395 func (m *Multi) Join(a *Multi, where int) error { 396 if m.Rows() != a.Rows() { 397 return fmt.Errorf("multi: row number mismatch %d != %d", m.Rows(), a.Rows()) 398 } 399 400 switch where { 401 case seq.Start: 402 if !a.IsFlush(seq.End) { 403 a.Flush(seq.End, m.Alpha.Gap()) 404 } 405 if !m.IsFlush(seq.Start) { 406 m.Flush(seq.Start, m.Alpha.Gap()) 407 } 408 case seq.End: 409 if !a.IsFlush(seq.Start) { 410 a.Flush(seq.Start, m.Alpha.Gap()) 411 } 412 if !m.IsFlush(seq.End) { 413 m.Flush(seq.End, m.Alpha.Gap()) 414 } 415 } 416 417 for i := 0; i < m.Rows(); i++ { 418 r := m.Row(i) 419 as := a.Row(i) 420 err := joinOne(r, as, where) 421 if err != nil { 422 return err 423 } 424 } 425 426 return nil 427 } 428 429 func joinOne(m, am seq.Sequence, where int) error { 430 switch m.(type) { 431 case *linear.Seq: 432 _, ok := am.(*linear.Seq) 433 if !ok { 434 goto MISMATCH 435 } 436 return sequtils.Join(m, am, where) 437 case *linear.QSeq: 438 _, ok := am.(*linear.QSeq) 439 if !ok { 440 goto MISMATCH 441 } 442 return sequtils.Join(m, am, where) 443 default: 444 joinerRegistryLock.RLock() 445 defer joinerRegistryLock.RUnlock() 446 joinerFunc, ok := joinerRegistry[reflect.TypeOf(m)] 447 if !ok { 448 return fmt.Errorf("multi: sequence type %T not handled.", m) 449 } 450 if reflect.TypeOf(m) != reflect.TypeOf(am) { 451 goto MISMATCH 452 } 453 return joinerFunc(m, am, where) 454 } 455 456 MISMATCH: 457 return fmt.Errorf("multi: sequence type mismatch: %T != %T.", m, am) 458 } 459 460 type JoinFunc func(a, b seq.Sequence, where int) (err error) 461 462 func RegisterJoiner(p seq.Sequence, f JoinFunc) { 463 joinerRegistryLock.Lock() 464 joinerRegistry[reflect.TypeOf(p)] = f 465 joinerRegistryLock.Unlock() 466 } 467 468 type ft struct { 469 s, e int 470 } 471 472 func (f *ft) Start() int { return f.s } 473 func (f *ft) End() int { return f.e } 474 func (f *ft) Len() int { return f.e - f.s } 475 func (f *ft) Orientation() feat.Orientation { return feat.Forward } 476 func (f *ft) Name() string { return "" } 477 func (f *ft) Description() string { return "" } 478 func (f *ft) Location() feat.Feature { return nil } 479 480 type fts []feat.Feature 481 482 func (f fts) Features() []feat.Feature { return f } 483 func (f fts) Len() int { return len(f) } 484 func (f fts) Less(i, j int) bool { return f[i].Start() < f[j].Start() } 485 func (f fts) Swap(i, j int) { f[i], f[j] = f[j], f[i] } 486 487 func max(a, b int) int { 488 if a > b { 489 return a 490 } 491 return b 492 } 493 func min(a, b int) int { 494 if a < b { 495 return a 496 } 497 return b 498 } 499 500 // Stitch produces a subsequence of the receiver defined by fs. The result is stored in the receiver 501 // and all contributing sequences are modified. 502 func (m *Multi) Stitch(fs feat.Set) error { 503 ff := fs.Features() 504 for _, f := range ff { 505 if f.End() < f.Start() { 506 return errors.New("multi: feature end < feature start") 507 } 508 } 509 ff = append(fts(nil), ff...) 510 sort.Sort(fts(ff)) 511 512 var ( 513 fsp = make(fts, 0, len(ff)) 514 csp *ft 515 ) 516 for i, f := range ff { 517 if s := f.Start(); i == 0 || s > csp.e { 518 csp = &ft{s: s, e: f.End()} 519 fsp = append(fsp, csp) 520 } else { 521 csp.e = max(csp.e, f.End()) 522 } 523 } 524 525 return m.Compose(fsp) 526 } 527 528 // Compose produces a composition of the receiver defined by the features in fs. The result is stored 529 // in the receiver and all contributing sequences are modified. 530 func (m *Multi) Compose(fs feat.Set) error { 531 m.Flush(seq.Start|seq.End, m.Alpha.Gap()) 532 533 for _, r := range m.Seq { 534 err := sequtils.Compose(r, r, fs) 535 if err != nil { 536 return err 537 } 538 } 539 540 return nil 541 } 542 543 func (m *Multi) String() string { 544 t := m.Consensus(false) 545 return t.String() 546 } 547 548 // Consensus returns a quality sequence reflecting the consensus of the receiver determined by the 549 // ColumnConsense field. 550 func (m *Multi) Consensus(includeMissing bool) *linear.QSeq { 551 cm := make([]alphabet.QLetter, 0, m.Len()) 552 alpha := m.Alphabet() 553 for i := m.Start(); i < m.End(); i++ { 554 cm = append(cm, m.ColumnConsense(m, alpha, i, includeMissing)) 555 } 556 557 c := linear.NewQSeq("Consensus:"+m.ID, cm, m.Alpha, m.Encode) 558 c.SetOffset(m.Offset) 559 560 return c 561 } 562 563 // Format is a support routine for fmt.Formatter. It accepts the formats 'v' and 's' 564 // (string), 'a' (fasta) and 'q' (fastq). String, fasta and fastq formats support 565 // truncated output via the verb's precision. Fasta format supports sequence line 566 // specification via the verb's width field. Fastq format supports optional inclusion 567 // of the '+' line descriptor line with the '+' flag. The 'v' verb supports the '#' 568 // flag for Go syntax output. The 's' and 'v' formats support the '-' flag for 569 // omission of the sequence name and in conjunction with this, the ' ' flag for 570 // alignment justification. 571 func (m *Multi) Format(fs fmt.State, c rune) { 572 if m == nil { 573 fmt.Fprint(fs, "<nil>") 574 return 575 } 576 577 var align bool 578 switch c { 579 case 'v': 580 if fs.Flag('#') { 581 fmt.Fprintf(fs, "&%#v", *m) 582 return 583 } 584 fallthrough 585 case 's': 586 align = fs.Flag(' ') && fs.Flag('-') 587 fallthrough 588 case 'a', 'q': 589 format := formatString(fs, c) 590 for i, r := range m.Seq { 591 if align { 592 fmt.Fprintf(fs, "%s", strings.Repeat(" ", r.Start()-m.Start())) 593 } 594 fmt.Fprintf(fs, format, r) 595 if i < m.Rows()-1 { 596 fmt.Fprintln(fs) 597 } 598 } 599 default: 600 fmt.Fprintf(fs, "%%!%c(*multi.Multi=%.10s)", c, m) 601 } 602 } 603 604 func formatString(fs fmt.State, c rune) string { 605 w, wOk := fs.Width() 606 p, pOk := fs.Precision() 607 var b bytes.Buffer 608 b.WriteByte('%') 609 for _, f := range "+-# 0" { 610 if fs.Flag(int(f)) { 611 b.WriteRune(f) 612 } 613 } 614 if wOk { 615 fmt.Fprint(&b, w) 616 } 617 if pOk { 618 b.WriteByte('.') 619 fmt.Fprint(&b, p) 620 } 621 b.WriteRune(c) 622 return b.String() 623 }