github.com/biogo/biogo@v1.0.4/seq/multi/multi.go (about)

     1  // Copyright ©2011-2013 The bíogo Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // Package multi handles collections of sequences as alignments or sets.
     6  package multi
     7  
     8  import (
     9  	"bytes"
    10  
    11  	"github.com/biogo/biogo/alphabet"
    12  	"github.com/biogo/biogo/feat"
    13  	"github.com/biogo/biogo/seq"
    14  	"github.com/biogo/biogo/seq/linear"
    15  	"github.com/biogo/biogo/seq/sequtils"
    16  	"github.com/biogo/biogo/util"
    17  
    18  	"errors"
    19  	"fmt"
    20  	"reflect"
    21  	"sort"
    22  	"strings"
    23  	"sync"
    24  )
    25  
    26  func init() {
    27  	joinerRegistryLock = &sync.RWMutex{}
    28  	joinerRegistry = make(map[reflect.Type]JoinFunc)
    29  }
    30  
    31  var (
    32  	joinerRegistryLock *sync.RWMutex
    33  	joinerRegistry     map[reflect.Type]JoinFunc
    34  )
    35  
    36  type Multi struct {
    37  	seq.Annotation
    38  	Seq            []seq.Sequence
    39  	ColumnConsense seq.ConsenseFunc
    40  	Encode         alphabet.Encoding
    41  }
    42  
    43  // Create a new Multi sequence.
    44  func NewMulti(id string, n []seq.Sequence, cons seq.ConsenseFunc) (*Multi, error) {
    45  	var alpha alphabet.Alphabet
    46  	for _, s := range n {
    47  		if alpha != nil && s.Alphabet() != alpha {
    48  			return nil, errors.New("multi: inconsistent alphabets")
    49  		} else if alpha == nil {
    50  			alpha = s.Alphabet()
    51  		}
    52  	}
    53  	return &Multi{
    54  		Annotation: seq.Annotation{
    55  			ID:    id,
    56  			Alpha: alpha,
    57  		},
    58  		Seq:            n,
    59  		ColumnConsense: cons,
    60  	}, nil
    61  }
    62  
    63  // Encoding returns the quality encoding scheme.
    64  func (m *Multi) Encoding() alphabet.Encoding { return m.Encode }
    65  
    66  // SetEncoding sets the quality encoding scheme to e.
    67  func (m *Multi) SetEncoding(e alphabet.Encoding) {
    68  	for _, r := range m.Seq {
    69  		if enc, ok := r.(seq.Scorer); ok {
    70  			enc.SetEncoding(e)
    71  		}
    72  	}
    73  	m.Encode = e
    74  }
    75  
    76  // Len returns the length of the alignment.
    77  func (m *Multi) Len() int {
    78  	var (
    79  		min = util.MaxInt
    80  		max = util.MinInt
    81  	)
    82  
    83  	for _, r := range m.Seq {
    84  		if start := r.Start(); start < min {
    85  			min = start
    86  		}
    87  		if end := r.End(); end > max {
    88  			max = end
    89  		}
    90  	}
    91  
    92  	return max - min
    93  }
    94  
    95  // Rows returns the number of rows in the alignment.
    96  func (m *Multi) Rows() int {
    97  	return len(m.Seq)
    98  }
    99  
   100  // SetOffset sets the location-relative offset of the sequence to o.
   101  func (m *Multi) SetOffset(o int) error {
   102  	for _, r := range m.Seq {
   103  		r.SetOffset(r.Start() - m.Offset + o)
   104  	}
   105  	m.Offset = o
   106  	return nil
   107  }
   108  
   109  // Start returns the start position of the sequence in coordinates relative to
   110  // the sequence location.
   111  func (m *Multi) Start() int {
   112  	start := util.MaxInt
   113  	for _, r := range m.Seq {
   114  		if lt := r.Start(); lt < start {
   115  			start = lt
   116  		}
   117  	}
   118  
   119  	return start
   120  }
   121  
   122  // End returns the end position of the sequence in coordinates relative to
   123  // the sequence location.
   124  func (m *Multi) End() int {
   125  	end := util.MinInt
   126  	for _, m := range m.Seq {
   127  		if rt := m.End(); rt > end {
   128  			end = rt
   129  		}
   130  	}
   131  
   132  	return end
   133  }
   134  
   135  // Clone returns a copy of the sequence.
   136  func (m *Multi) Clone() seq.Rower {
   137  	c := &Multi{}
   138  	*c = *m
   139  	c.Seq = make([]seq.Sequence, len(m.Seq))
   140  	for i, r := range m.Seq {
   141  		c.Seq[i] = r.Clone().(seq.Sequence)
   142  	}
   143  
   144  	return c
   145  }
   146  
   147  // RevComp reverse complements the sequence.
   148  func (m *Multi) RevComp() {
   149  	end := m.End()
   150  	for _, r := range m.Seq {
   151  		r.RevComp()
   152  		r.SetOffset(end - m.End())
   153  	}
   154  
   155  	return
   156  }
   157  
   158  // Reverse reverses the order of letters in the the sequence without complementing them.
   159  func (m *Multi) Reverse() {
   160  	end := m.End()
   161  	for _, r := range m.Seq {
   162  		r.Reverse()
   163  		r.SetOffset(end - m.End())
   164  	}
   165  }
   166  
   167  // Conformation returns the sequence conformation.
   168  func (m *Multi) Conformation() feat.Conformation { return m.Conform }
   169  
   170  // SetConformation sets the sequence conformation.
   171  func (m *Multi) SetConformation(c feat.Conformation) {
   172  	for _, r := range m.Seq {
   173  		r.SetConformation(c)
   174  	}
   175  	m.Conform = c
   176  }
   177  
   178  // Add adds sequences n to the multiple sequence.
   179  func (m *Multi) Add(n ...seq.Sequence) error {
   180  	for _, r := range n {
   181  		if m.Alpha == nil {
   182  			m.Alpha = r.Alphabet()
   183  			continue
   184  		} else if r.Alphabet() != m.Alpha {
   185  			return errors.New("multi: inconsistent alphabets")
   186  		}
   187  	}
   188  	m.Seq = append(m.Seq, n...)
   189  
   190  	return nil
   191  }
   192  
   193  // Delete removes the sequence represented at row i of the alignment. It panics if i is out of range.
   194  func (m *Multi) Delete(i int) {
   195  	m.Seq = m.Seq[:i+copy(m.Seq[i:], m.Seq[i+1:])]
   196  }
   197  
   198  // Row returns the sequence represented at row i of the alignment. It panics is i is out of range.
   199  func (m *Multi) Row(i int) seq.Sequence {
   200  	return m.Seq[i]
   201  }
   202  
   203  // Append appends a to the ith sequence in the receiver.
   204  func (m *Multi) Append(i int, a ...alphabet.QLetter) (err error) {
   205  	return m.Row(i).(seq.Appender).AppendQLetters(a...)
   206  }
   207  
   208  // Append each byte of each a to the appropriate sequence in the receiver.
   209  func (m *Multi) AppendColumns(a ...[]alphabet.QLetter) (err error) {
   210  	for i, c := range a {
   211  		if len(c) != m.Rows() {
   212  			return fmt.Errorf("multi: column %d does not match Rows(): %d != %d.", i, len(c), m.Rows())
   213  		}
   214  	}
   215  	for i, b := 0, make([]alphabet.QLetter, 0, len(a)); i < m.Rows(); i, b = i+1, b[:0] {
   216  		for _, r := range a {
   217  			b = append(b, r[i])
   218  		}
   219  		m.Append(i, b...)
   220  	}
   221  
   222  	return
   223  }
   224  
   225  // AppendEach appends each []alphabet.QLetter in a to the appropriate sequence in the receiver.
   226  func (m *Multi) AppendEach(a [][]alphabet.QLetter) (err error) {
   227  	if len(a) != m.Rows() {
   228  		return fmt.Errorf("multi: number of sequences does not match Rows(): %d != %d.", len(a), m.Rows())
   229  	}
   230  	var i int
   231  	for _, r := range m.Seq {
   232  		if al, ok := r.(seq.AlignedAppender); ok {
   233  			row := al.Rows()
   234  			if al.AppendEach(a[i:i+row]) != nil {
   235  				panic("internal size mismatch")
   236  			}
   237  			i += row
   238  		} else {
   239  			r.(seq.Appender).AppendQLetters(a[i]...)
   240  			i++
   241  		}
   242  	}
   243  
   244  	return
   245  }
   246  
   247  // Column returns a slice of letters reflecting the column at pos.
   248  func (m *Multi) Column(pos int, fill bool) []alphabet.Letter {
   249  	if pos < m.Start() || pos >= m.End() {
   250  		panic("multi: index out of range")
   251  	}
   252  
   253  	var c []alphabet.Letter
   254  	if fill {
   255  		c = make([]alphabet.Letter, 0, m.Rows())
   256  	} else {
   257  		c = []alphabet.Letter{}
   258  	}
   259  
   260  	for _, r := range m.Seq {
   261  		if a, ok := r.(seq.Aligned); ok {
   262  			if a.Start() <= pos && pos < a.End() {
   263  				c = append(c, a.Column(pos, fill)...)
   264  			} else if fill {
   265  				c = append(c, m.Alpha.Gap().Repeat(a.Rows())...)
   266  			}
   267  		} else {
   268  			if r.Start() <= pos && pos < r.End() {
   269  				c = append(c, r.At(pos).L)
   270  			} else if fill {
   271  				c = append(c, m.Alpha.Gap())
   272  			}
   273  		}
   274  	}
   275  
   276  	return c
   277  }
   278  
   279  // ColumnQL returns a slice of quality letters reflecting the column at pos.
   280  func (m *Multi) ColumnQL(pos int, fill bool) []alphabet.QLetter {
   281  	if pos < m.Start() || pos >= m.End() {
   282  		panic("multi: index out of range")
   283  	}
   284  
   285  	var c []alphabet.QLetter
   286  	if fill {
   287  		c = make([]alphabet.QLetter, 0, m.Rows())
   288  	} else {
   289  		c = []alphabet.QLetter{}
   290  	}
   291  
   292  	for _, r := range m.Seq {
   293  		if a, ok := r.(seq.Aligned); ok {
   294  			if a.Start() <= pos && pos < a.End() {
   295  				c = append(c, a.ColumnQL(pos, fill)...)
   296  			} else if fill {
   297  				c = append(c, alphabet.QLetter{L: m.Alpha.Gap()}.Repeat(a.Rows())...)
   298  			}
   299  		} else {
   300  			if r.Start() <= pos && pos < r.End() {
   301  				c = append(c, r.At(pos))
   302  			} else if fill {
   303  				c = append(c, alphabet.QLetter{L: m.Alpha.Gap()})
   304  			}
   305  		}
   306  	}
   307  
   308  	return c
   309  }
   310  
   311  // IsFlush returns a boolean indicating whether the end specified by where is flush - that is
   312  // all the contributing sequences start at the same offset.
   313  func (m *Multi) IsFlush(where int) bool {
   314  	if m.Rows() <= 1 {
   315  		return true
   316  	}
   317  	var start, end int
   318  	for i, r := range m.Seq {
   319  		if lt, rt := r.Start(), r.End(); i > 0 &&
   320  			((lt != start && where&seq.Start != 0) ||
   321  				(rt != end && where&seq.End != 0)) {
   322  			return false
   323  		} else if i == 0 {
   324  			start, end = lt, rt
   325  		}
   326  	}
   327  	return true
   328  }
   329  
   330  // Flush fills ragged sequences with the receiver's gap letter so that all sequences are flush.
   331  func (m *Multi) Flush(where int, fill alphabet.Letter) {
   332  	if m.IsFlush(where) {
   333  		return
   334  	}
   335  
   336  	if where&seq.Start != 0 {
   337  		start := m.Start()
   338  		for _, r := range m.Seq {
   339  			if r.Start()-start < 1 {
   340  				continue
   341  			}
   342  			switch sl := r.Slice().(type) {
   343  			case alphabet.Letters:
   344  				r.SetSlice(alphabet.Letters(append(fill.Repeat(r.Start()-start), sl...)))
   345  			case alphabet.QLetters:
   346  				r.SetSlice(alphabet.QLetters(append(alphabet.QLetter{L: fill}.Repeat(r.Start()-start), sl...)))
   347  			}
   348  			r.SetOffset(start)
   349  		}
   350  	}
   351  	if where&seq.End != 0 {
   352  		end := m.End()
   353  		for _, r := range m.Seq {
   354  			if end-r.End() < 1 {
   355  				continue
   356  			}
   357  			r.(seq.Appender).AppendQLetters(alphabet.QLetter{L: fill}.Repeat(end - r.End())...)
   358  		}
   359  	}
   360  }
   361  
   362  // Subseq returns a multiple subsequence slice of the receiver.
   363  func (m *Multi) Subseq(start, end int) (*Multi, error) {
   364  	var ns []seq.Sequence
   365  
   366  	for _, r := range m.Seq {
   367  		rs := reflect.New(reflect.TypeOf(r)).Interface().(sequtils.Sliceable)
   368  		err := sequtils.Truncate(rs, r, start, end)
   369  		if err != nil {
   370  			return nil, err
   371  		}
   372  		ns = append(ns, rs.(seq.Sequence))
   373  	}
   374  
   375  	ss := &Multi{}
   376  	*ss = *m
   377  	ss.Seq = ns
   378  
   379  	return ss, nil
   380  }
   381  
   382  // Truncate truncates the the receiver from start to end.
   383  func (m *Multi) Truncate(start, end int) error {
   384  	for _, r := range m.Seq {
   385  		err := sequtils.Truncate(r, r, start, end)
   386  		if err != nil {
   387  			return err
   388  		}
   389  	}
   390  
   391  	return nil
   392  }
   393  
   394  // Join joins a to the receiver at the end specied by where.
   395  func (m *Multi) Join(a *Multi, where int) error {
   396  	if m.Rows() != a.Rows() {
   397  		return fmt.Errorf("multi: row number mismatch %d != %d", m.Rows(), a.Rows())
   398  	}
   399  
   400  	switch where {
   401  	case seq.Start:
   402  		if !a.IsFlush(seq.End) {
   403  			a.Flush(seq.End, m.Alpha.Gap())
   404  		}
   405  		if !m.IsFlush(seq.Start) {
   406  			m.Flush(seq.Start, m.Alpha.Gap())
   407  		}
   408  	case seq.End:
   409  		if !a.IsFlush(seq.Start) {
   410  			a.Flush(seq.Start, m.Alpha.Gap())
   411  		}
   412  		if !m.IsFlush(seq.End) {
   413  			m.Flush(seq.End, m.Alpha.Gap())
   414  		}
   415  	}
   416  
   417  	for i := 0; i < m.Rows(); i++ {
   418  		r := m.Row(i)
   419  		as := a.Row(i)
   420  		err := joinOne(r, as, where)
   421  		if err != nil {
   422  			return err
   423  		}
   424  	}
   425  
   426  	return nil
   427  }
   428  
   429  func joinOne(m, am seq.Sequence, where int) error {
   430  	switch m.(type) {
   431  	case *linear.Seq:
   432  		_, ok := am.(*linear.Seq)
   433  		if !ok {
   434  			goto MISMATCH
   435  		}
   436  		return sequtils.Join(m, am, where)
   437  	case *linear.QSeq:
   438  		_, ok := am.(*linear.QSeq)
   439  		if !ok {
   440  			goto MISMATCH
   441  		}
   442  		return sequtils.Join(m, am, where)
   443  	default:
   444  		joinerRegistryLock.RLock()
   445  		defer joinerRegistryLock.RUnlock()
   446  		joinerFunc, ok := joinerRegistry[reflect.TypeOf(m)]
   447  		if !ok {
   448  			return fmt.Errorf("multi: sequence type %T not handled.", m)
   449  		}
   450  		if reflect.TypeOf(m) != reflect.TypeOf(am) {
   451  			goto MISMATCH
   452  		}
   453  		return joinerFunc(m, am, where)
   454  	}
   455  
   456  MISMATCH:
   457  	return fmt.Errorf("multi: sequence type mismatch: %T != %T.", m, am)
   458  }
   459  
   460  type JoinFunc func(a, b seq.Sequence, where int) (err error)
   461  
   462  func RegisterJoiner(p seq.Sequence, f JoinFunc) {
   463  	joinerRegistryLock.Lock()
   464  	joinerRegistry[reflect.TypeOf(p)] = f
   465  	joinerRegistryLock.Unlock()
   466  }
   467  
   468  type ft struct {
   469  	s, e int
   470  }
   471  
   472  func (f *ft) Start() int                    { return f.s }
   473  func (f *ft) End() int                      { return f.e }
   474  func (f *ft) Len() int                      { return f.e - f.s }
   475  func (f *ft) Orientation() feat.Orientation { return feat.Forward }
   476  func (f *ft) Name() string                  { return "" }
   477  func (f *ft) Description() string           { return "" }
   478  func (f *ft) Location() feat.Feature        { return nil }
   479  
   480  type fts []feat.Feature
   481  
   482  func (f fts) Features() []feat.Feature { return f }
   483  func (f fts) Len() int                 { return len(f) }
   484  func (f fts) Less(i, j int) bool       { return f[i].Start() < f[j].Start() }
   485  func (f fts) Swap(i, j int)            { f[i], f[j] = f[j], f[i] }
   486  
   487  func max(a, b int) int {
   488  	if a > b {
   489  		return a
   490  	}
   491  	return b
   492  }
   493  func min(a, b int) int {
   494  	if a < b {
   495  		return a
   496  	}
   497  	return b
   498  }
   499  
   500  // Stitch produces a subsequence of the receiver defined by fs. The result is stored in the receiver
   501  // and all contributing sequences are modified.
   502  func (m *Multi) Stitch(fs feat.Set) error {
   503  	ff := fs.Features()
   504  	for _, f := range ff {
   505  		if f.End() < f.Start() {
   506  			return errors.New("multi: feature end < feature start")
   507  		}
   508  	}
   509  	ff = append(fts(nil), ff...)
   510  	sort.Sort(fts(ff))
   511  
   512  	var (
   513  		fsp = make(fts, 0, len(ff))
   514  		csp *ft
   515  	)
   516  	for i, f := range ff {
   517  		if s := f.Start(); i == 0 || s > csp.e {
   518  			csp = &ft{s: s, e: f.End()}
   519  			fsp = append(fsp, csp)
   520  		} else {
   521  			csp.e = max(csp.e, f.End())
   522  		}
   523  	}
   524  
   525  	return m.Compose(fsp)
   526  }
   527  
   528  // Compose produces a composition of the receiver defined by the features in fs. The result is stored
   529  // in the receiver and all contributing sequences are modified.
   530  func (m *Multi) Compose(fs feat.Set) error {
   531  	m.Flush(seq.Start|seq.End, m.Alpha.Gap())
   532  
   533  	for _, r := range m.Seq {
   534  		err := sequtils.Compose(r, r, fs)
   535  		if err != nil {
   536  			return err
   537  		}
   538  	}
   539  
   540  	return nil
   541  }
   542  
   543  func (m *Multi) String() string {
   544  	t := m.Consensus(false)
   545  	return t.String()
   546  }
   547  
   548  // Consensus returns a quality sequence reflecting the consensus of the receiver determined by the
   549  // ColumnConsense field.
   550  func (m *Multi) Consensus(includeMissing bool) *linear.QSeq {
   551  	cm := make([]alphabet.QLetter, 0, m.Len())
   552  	alpha := m.Alphabet()
   553  	for i := m.Start(); i < m.End(); i++ {
   554  		cm = append(cm, m.ColumnConsense(m, alpha, i, includeMissing))
   555  	}
   556  
   557  	c := linear.NewQSeq("Consensus:"+m.ID, cm, m.Alpha, m.Encode)
   558  	c.SetOffset(m.Offset)
   559  
   560  	return c
   561  }
   562  
   563  // Format is a support routine for fmt.Formatter. It accepts the formats 'v' and 's'
   564  // (string), 'a' (fasta) and 'q' (fastq). String, fasta and fastq formats support
   565  // truncated output via the verb's precision. Fasta format supports sequence line
   566  // specification via the verb's width field. Fastq format supports optional inclusion
   567  // of the '+' line descriptor line with the '+' flag. The 'v' verb supports the '#'
   568  // flag for Go syntax output. The 's' and 'v' formats support the '-' flag for
   569  // omission of the sequence name and in conjunction with this, the ' ' flag for
   570  // alignment justification.
   571  func (m *Multi) Format(fs fmt.State, c rune) {
   572  	if m == nil {
   573  		fmt.Fprint(fs, "<nil>")
   574  		return
   575  	}
   576  
   577  	var align bool
   578  	switch c {
   579  	case 'v':
   580  		if fs.Flag('#') {
   581  			fmt.Fprintf(fs, "&%#v", *m)
   582  			return
   583  		}
   584  		fallthrough
   585  	case 's':
   586  		align = fs.Flag(' ') && fs.Flag('-')
   587  		fallthrough
   588  	case 'a', 'q':
   589  		format := formatString(fs, c)
   590  		for i, r := range m.Seq {
   591  			if align {
   592  				fmt.Fprintf(fs, "%s", strings.Repeat(" ", r.Start()-m.Start()))
   593  			}
   594  			fmt.Fprintf(fs, format, r)
   595  			if i < m.Rows()-1 {
   596  				fmt.Fprintln(fs)
   597  			}
   598  		}
   599  	default:
   600  		fmt.Fprintf(fs, "%%!%c(*multi.Multi=%.10s)", c, m)
   601  	}
   602  }
   603  
   604  func formatString(fs fmt.State, c rune) string {
   605  	w, wOk := fs.Width()
   606  	p, pOk := fs.Precision()
   607  	var b bytes.Buffer
   608  	b.WriteByte('%')
   609  	for _, f := range "+-# 0" {
   610  		if fs.Flag(int(f)) {
   611  			b.WriteRune(f)
   612  		}
   613  	}
   614  	if wOk {
   615  		fmt.Fprint(&b, w)
   616  	}
   617  	if pOk {
   618  		b.WriteByte('.')
   619  		fmt.Fprint(&b, p)
   620  	}
   621  	b.WriteRune(c)
   622  	return b.String()
   623  }