github.com/biogo/biogo@v1.0.4/seq/alignment/alignment.go (about)

     1  // Copyright ©2011-2013 The bíogo Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // Package alignment handles aligned sequences stored as columns.
     6  package alignment
     7  
     8  import (
     9  	"github.com/biogo/biogo/alphabet"
    10  	"github.com/biogo/biogo/feat"
    11  	"github.com/biogo/biogo/seq"
    12  	"github.com/biogo/biogo/seq/linear"
    13  	"github.com/biogo/biogo/util"
    14  
    15  	"errors"
    16  	"fmt"
    17  	"strings"
    18  	"unicode"
    19  )
    20  
    21  // A Seq is an aligned sequence.
    22  type Seq struct {
    23  	seq.Annotation
    24  	SubAnnotations []seq.Annotation
    25  	Seq            alphabet.Columns
    26  	ColumnConsense seq.ConsenseFunc
    27  }
    28  
    29  // NewSeq creates a new Seq with the given id, letter sequence and alphabet.
    30  func NewSeq(id string, subids []string, b [][]alphabet.Letter, alpha alphabet.Alphabet, cons seq.ConsenseFunc) (*Seq, error) {
    31  	var (
    32  		lids, lseq = len(subids), len(b)
    33  		subann     []seq.Annotation
    34  	)
    35  	switch {
    36  	case lids == 0 && len(b) == 0:
    37  	case lseq != 0 && lids == len(b[0]):
    38  		if lids == 0 {
    39  			subann = make([]seq.Annotation, len(b[0]))
    40  			for i := range subids {
    41  				subann[i].ID = fmt.Sprintf("%s:%d", id, i)
    42  			}
    43  		} else {
    44  			subann = make([]seq.Annotation, lids)
    45  			for i, sid := range subids {
    46  				subann[i].ID = sid
    47  			}
    48  		}
    49  	default:
    50  		return nil, errors.New("alignment: id/seq number mismatch")
    51  	}
    52  
    53  	return &Seq{
    54  		Annotation: seq.Annotation{
    55  			ID:    id,
    56  			Alpha: alpha,
    57  		},
    58  		SubAnnotations: subann,
    59  		Seq:            append([][]alphabet.Letter(nil), b...),
    60  		ColumnConsense: cons,
    61  	}, nil
    62  }
    63  
    64  // Interface guarantees
    65  var (
    66  	_ feat.Feature = (*Seq)(nil)
    67  	_ feat.Feature = Row{}
    68  	_ seq.Sequence = Row{}
    69  )
    70  
    71  // Slice returns the sequence data as a alphabet.Slice.
    72  func (s *Seq) Slice() alphabet.Slice { return s.Seq }
    73  
    74  // SetSlice sets the sequence data represented by the Seq. SetSlice will panic if sl
    75  // is not a Columns.
    76  func (s *Seq) SetSlice(sl alphabet.Slice) { s.Seq = sl.(alphabet.Columns) }
    77  
    78  // Len returns the length of the alignment.
    79  func (s *Seq) Len() int { return len(s.Seq) }
    80  
    81  // Rows returns the number of rows in the alignment.
    82  func (s *Seq) Rows() int { return s.Seq.Rows() }
    83  
    84  // Start returns the start position of the sequence in coordinates relative to the
    85  // sequence location.
    86  func (s *Seq) Start() int { return s.Offset }
    87  
    88  // End returns the end position of the sequence in coordinates relative to the
    89  // sequence location.
    90  func (s *Seq) End() int { return s.Offset + s.Len() }
    91  
    92  // Clone returns a copy of the sequence.
    93  func (s *Seq) Clone() seq.Rower {
    94  	c := *s
    95  	c.Seq = make(alphabet.Columns, len(s.Seq))
    96  	for i, cs := range s.Seq {
    97  		c.Seq[i] = append([]alphabet.Letter(nil), cs...)
    98  	}
    99  
   100  	return &c
   101  }
   102  
   103  // New returns an empty *Seq sequence with the same alphabet.
   104  func (s *Seq) New() *Seq {
   105  	return &Seq{Annotation: seq.Annotation{Alpha: s.Alpha}}
   106  }
   107  
   108  // RevComp reverse complements the sequence. RevComp will panic if the alphabet used by
   109  // the receiver is not a Complementor.
   110  func (s *Seq) RevComp() {
   111  	rs, comp := s.Seq, s.Alpha.(alphabet.Complementor).ComplementTable()
   112  	i, j := 0, len(rs)-1
   113  	for ; i < j; i, j = i+1, j-1 {
   114  		for r := range rs[i] {
   115  			rs[i][r], rs[j][r] = comp[rs[j][r]], comp[rs[i][r]]
   116  		}
   117  	}
   118  	if i == j {
   119  		for r := range rs[i] {
   120  			rs[i][r] = comp[rs[i][r]]
   121  		}
   122  	}
   123  	s.Strand = -s.Strand
   124  }
   125  
   126  // Reverse reverses the order of letters in the the sequence without complementing them.
   127  func (s *Seq) Reverse() {
   128  	l := s.Seq
   129  	for i, j := 0, len(l)-1; i < j; i, j = i+1, j-1 {
   130  		l[i], l[j] = l[j], l[i]
   131  	}
   132  	s.Strand = seq.None
   133  }
   134  
   135  func (s *Seq) String() string {
   136  	return s.Consensus(false).String()
   137  }
   138  
   139  // Add adds the sequences n to Seq. Sequences in n should align start and end with the receiving alignment.
   140  // Additional sequence will be clipped and missing sequence will be filled with the gap letter.
   141  func (s *Seq) Add(n ...seq.Sequence) error {
   142  	for i := s.Start(); i < s.End(); i++ {
   143  		s.Seq[i] = append(s.Seq[i], s.column(n, i)...)
   144  	}
   145  	for i := range n {
   146  		s.SubAnnotations = append(s.SubAnnotations, *n[i].CloneAnnotation())
   147  	}
   148  
   149  	return nil
   150  }
   151  
   152  func (s *Seq) column(m []seq.Sequence, pos int) []alphabet.Letter {
   153  	c := make([]alphabet.Letter, 0, s.Rows())
   154  
   155  	for _, ss := range m {
   156  		if a, ok := ss.(seq.Aligned); ok {
   157  			if a.Start() <= pos && pos < a.End() {
   158  				c = append(c, a.Column(pos, true)...)
   159  			} else {
   160  				c = append(c, s.Alpha.Gap().Repeat(a.Rows())...)
   161  			}
   162  		} else {
   163  			if ss.Start() <= pos && pos < ss.End() {
   164  				c = append(c, ss.At(pos).L)
   165  			} else {
   166  				c = append(c, s.Alpha.Gap())
   167  			}
   168  		}
   169  	}
   170  
   171  	return c
   172  }
   173  
   174  // Delete removes the sequence represented at row i of the alignment. It panics if i is out of range.
   175  func (s *Seq) Delete(i int) {
   176  	if i >= s.Rows() {
   177  		panic("alignment: index out of range")
   178  	}
   179  	cs := s.Seq
   180  	for j, c := range cs {
   181  		cs[j] = c[:i+copy(c[i:], c[i+1:])]
   182  	}
   183  	sa := s.SubAnnotations
   184  	s.SubAnnotations = sa[:i+copy(sa[i:], sa[i+1:])]
   185  }
   186  
   187  // Row returns the sequence represented at row i of the alignment. It panics is i is out of range.
   188  func (s *Seq) Row(i int) seq.Sequence {
   189  	if i < 0 || i >= s.Rows() {
   190  		panic("alignment: index out of range")
   191  	}
   192  	return Row{Align: s, Row: i}
   193  }
   194  
   195  // AppendColumns appends each Qletter of each element of a to the appropriate sequence in the receiver.
   196  func (s *Seq) AppendColumns(a ...[]alphabet.QLetter) error {
   197  	for i, r := range a {
   198  		if len(r) != s.Rows() {
   199  			return fmt.Errorf("alignment: column %d does not match Rows(): %d != %d.", i, len(r), s.Rows())
   200  		}
   201  	}
   202  
   203  	s.Seq = append(s.Seq, make([][]alphabet.Letter, len(a))...)[:len(s.Seq)]
   204  	for _, r := range a {
   205  		c := make([]alphabet.Letter, len(r))
   206  		for i := range r {
   207  			c[i] = r[i].L
   208  		}
   209  		s.Seq = append(s.Seq, c)
   210  	}
   211  
   212  	return nil
   213  }
   214  
   215  // AppendEach appends each []alphabet.QLetter in a to the appropriate sequence in the receiver.
   216  func (s *Seq) AppendEach(a [][]alphabet.QLetter) error {
   217  	if len(a) != s.Rows() {
   218  		return fmt.Errorf("alignment: number of sequences does not match Rows(): %d != %d.", len(a), s.Rows())
   219  	}
   220  	max := util.MinInt
   221  	for _, ss := range a {
   222  		if l := len(ss); l > max {
   223  			max = l
   224  		}
   225  	}
   226  	s.Seq = append(s.Seq, make([][]alphabet.Letter, max)...)[:len(s.Seq)]
   227  	for i, b := 0, make([]alphabet.QLetter, 0, len(a)); i < max; i, b = i+1, b[:0] {
   228  		for _, ss := range a {
   229  			if i < len(ss) {
   230  				b = append(b, ss[i])
   231  			} else {
   232  				b = append(b, alphabet.QLetter{L: s.Alpha.Gap()})
   233  			}
   234  		}
   235  		s.AppendColumns(b)
   236  	}
   237  
   238  	return nil
   239  }
   240  
   241  // Column returns a slice of letters reflecting the column at pos.
   242  func (s *Seq) Column(pos int, _ bool) []alphabet.Letter {
   243  	return s.Seq[pos]
   244  }
   245  
   246  // ColumnQL returns a slice of quality letters reflecting the column at pos.
   247  func (s *Seq) ColumnQL(pos int, _ bool) []alphabet.QLetter {
   248  	c := make([]alphabet.QLetter, s.Rows())
   249  	for i, l := range s.Seq[pos] {
   250  		c[i] = alphabet.QLetter{
   251  			L: l,
   252  			Q: seq.DefaultQphred,
   253  		}
   254  	}
   255  
   256  	return c
   257  }
   258  
   259  // Consensus returns a quality sequence reflecting the consensus of the receiver determined by the
   260  // ColumnConsense field.
   261  func (s *Seq) Consensus(_ bool) *linear.QSeq {
   262  	cs := make([]alphabet.QLetter, 0, s.Len())
   263  	alpha := s.Alphabet()
   264  	for i := range s.Seq {
   265  		cs = append(cs, s.ColumnConsense(s, alpha, i, false))
   266  	}
   267  
   268  	qs := linear.NewQSeq("Consensus:"+s.ID, cs, s.Alpha, alphabet.Sanger)
   269  	qs.Strand = s.Strand
   270  	qs.SetOffset(s.Offset)
   271  	qs.Conform = s.Conform
   272  
   273  	return qs
   274  }
   275  
   276  // Format is a support routine for fmt.Formatter. It accepts the formats 'v' and 's'
   277  // (string), 'a' (fasta) and 'q' (fastq). String, fasta and fastq formats support
   278  // truncated output via the verb's precision. Fasta format supports sequence line
   279  // specification via the verb's width field. Fastq format supports optional inclusion
   280  // of the '+' line descriptor line with the '+' flag. The 'v' verb supports the '#'
   281  // flag for Go syntax output. The 's' and 'v' formats support the '-' flag for
   282  // omission of the sequence name.
   283  func (s *Seq) Format(fs fmt.State, c rune) {
   284  	if s == nil {
   285  		fmt.Fprint(fs, "<nil>")
   286  		return
   287  	}
   288  	switch c {
   289  	case 'v':
   290  		if fs.Flag('#') {
   291  			fmt.Fprintf(fs, "&%#v", *s)
   292  			return
   293  		}
   294  		fallthrough
   295  	case 's', 'a', 'q':
   296  		r := Row{Align: s}
   297  		for r.Row = 0; r.Row < s.Rows(); r.Row++ {
   298  			r.Format(fs, c)
   299  			if r.Row < s.Rows()-1 {
   300  				fmt.Fprintln(fs)
   301  			}
   302  		}
   303  	default:
   304  		fmt.Fprintf(fs, "%%!%c(*alignment.Seq=%.10s)", c, s)
   305  	}
   306  }
   307  
   308  // A Row is a pointer into an alignment that satisfies the seq.Sequence interface.
   309  type Row struct {
   310  	Align *Seq
   311  	Row   int
   312  }
   313  
   314  // At returns the letter at position i.
   315  func (r Row) At(i int) alphabet.QLetter {
   316  	return alphabet.QLetter{
   317  		L: r.Align.Seq[i-r.Align.Offset][r.Row],
   318  		Q: seq.DefaultQphred,
   319  	}
   320  }
   321  
   322  // Set sets the letter at position i to l.
   323  func (r Row) Set(i int, l alphabet.QLetter) error {
   324  	r.Align.Seq[i-r.Align.Offset][r.Row] = l.L
   325  	return nil
   326  }
   327  
   328  // Len returns the length of the row.
   329  func (r Row) Len() int { return len(r.Align.Seq) }
   330  
   331  // Start returns the start position of the sequence in coordinates relative to the
   332  // sequence location.
   333  func (r Row) Start() int { return r.Align.SubAnnotations[r.Row].Offset }
   334  
   335  // End returns the end position of the sequence in coordinates relative to the
   336  // sequence location.
   337  func (r Row) End() int { return r.Start() + r.Len() }
   338  
   339  // Location returns the feature containing the row's sequence.
   340  func (r Row) Location() feat.Feature { return r.Align.SubAnnotations[r.Row].Loc }
   341  
   342  func (r Row) Alphabet() alphabet.Alphabet     { return r.Align.Alpha }
   343  func (r Row) Conformation() feat.Conformation { return r.Align.Conform }
   344  func (r Row) SetConformation(c feat.Conformation) error {
   345  	r.Align.SubAnnotations[r.Row].Conform = c
   346  	return nil
   347  }
   348  func (r Row) Name() string {
   349  	return r.Align.SubAnnotations[r.Row].ID
   350  }
   351  func (r Row) Description() string   { return r.Align.SubAnnotations[r.Row].Desc }
   352  func (r Row) SetOffset(o int) error { r.Align.SubAnnotations[r.Row].Offset = o; return nil }
   353  
   354  func (r Row) RevComp() {
   355  	rs, comp := r.Align.Seq, r.Alphabet().(alphabet.Complementor).ComplementTable()
   356  	i, j := 0, len(rs)-1
   357  	for ; i < j; i, j = i+1, j-1 {
   358  		rs[i][r.Row], rs[j][r.Row] = comp[rs[j][r.Row]], comp[rs[i][r.Row]]
   359  	}
   360  	if i == j {
   361  		rs[i][r.Row] = comp[rs[i][r.Row]]
   362  	}
   363  	r.Align.SubAnnotations[r.Row].Strand = -r.Align.SubAnnotations[r.Row].Strand
   364  }
   365  func (r Row) Reverse() {
   366  	l := r.Align.Seq
   367  	for i, j := 0, len(l)-1; i < j; i, j = i+1, j-1 {
   368  		l[i][r.Row], l[j][r.Row] = l[j][r.Row], l[i][r.Row]
   369  	}
   370  	r.Align.SubAnnotations[r.Row].Strand = seq.None
   371  }
   372  func (r Row) New() seq.Sequence {
   373  	return Row{Align: &Seq{Annotation: seq.Annotation{Alpha: r.Align.Alpha}}}
   374  }
   375  func (r Row) Clone() seq.Sequence {
   376  	b := make([]alphabet.Letter, r.Len())
   377  	for i, c := range r.Align.Seq {
   378  		b[i] = c[r.Row]
   379  	}
   380  	switch {
   381  	case r.Row < 0:
   382  		panic("under")
   383  	case r.Row >= r.Align.Rows():
   384  		panic("bang over Rows()")
   385  	case r.Row >= len(r.Align.SubAnnotations):
   386  
   387  		panic(fmt.Sprintf("bang over len(SubAnns): %d %d", r.Row, len(r.Align.SubAnnotations)))
   388  	}
   389  	return linear.NewSeq(r.Name(), b, r.Alphabet())
   390  }
   391  func (r Row) CloneAnnotation() *seq.Annotation {
   392  	return r.Align.SubAnnotations[r.Row].CloneAnnotation()
   393  }
   394  
   395  // String returns a string representation of the sequence data only.
   396  func (r Row) String() string { return fmt.Sprintf("%-s", r) }
   397  
   398  // Format is a support routine for fmt.Formatter. It accepts the formats 'v' and 's'
   399  // (string), 'a' (fasta) and 'q' (fastq). String, fasta and fastq formats support
   400  // truncated output via the verb's precision. Fasta format supports sequence line
   401  // specification via the verb's width field. Fastq format supports optional inclusion
   402  // of the '+' line descriptor line with the '+' flag. The 'v' verb supports the '#'
   403  // flag for Go syntax output. The 's' and 'v' formats support the '-' flag for
   404  // omission of the sequence name.
   405  func (r Row) Format(fs fmt.State, c rune) {
   406  	var (
   407  		s      = r.Align
   408  		w, wOk = fs.Width()
   409  		p, pOk = fs.Precision()
   410  		buf    alphabet.Columns
   411  	)
   412  	if s != nil {
   413  		if pOk {
   414  			buf = s.Seq[:min(p, len(s.Seq))]
   415  		} else {
   416  			buf = s.Seq
   417  		}
   418  	}
   419  
   420  	switch c {
   421  	case 'v':
   422  		if fs.Flag('#') {
   423  			type shadowRow Row
   424  			sr := fmt.Sprintf("%#v", shadowRow(r))
   425  			fmt.Fprintf(fs, "%T%s", r, sr[strings.Index(sr, "{"):])
   426  			return
   427  		}
   428  		fallthrough
   429  	case 's':
   430  		if s == nil {
   431  			fmt.Fprint(fs, "<nil>")
   432  			return
   433  		}
   434  		if !fs.Flag('-') {
   435  			fmt.Fprintf(fs, "%q ", r.Name())
   436  		}
   437  		for _, lc := range buf {
   438  			fmt.Fprintf(fs, "%c", lc[r.Row])
   439  		}
   440  		if pOk && s != nil && p < s.Len() {
   441  			fmt.Fprint(fs, "...")
   442  		}
   443  	case 'a':
   444  		if s == nil {
   445  			return
   446  		}
   447  		r.formatDescLineTo(fs, '>')
   448  		for i, lc := range buf {
   449  			fmt.Fprintf(fs, "%c", lc[r.Row])
   450  			if wOk && i < s.Len()-1 && i%w == w-1 {
   451  				fmt.Fprintln(fs)
   452  			}
   453  		}
   454  		if pOk && p < s.Len() {
   455  			fmt.Fprint(fs, "...")
   456  		}
   457  	case 'q':
   458  		if s == nil {
   459  			return
   460  		}
   461  		r.formatDescLineTo(fs, '@')
   462  		for _, lc := range buf {
   463  			fmt.Fprintf(fs, "%c", lc[r.Row])
   464  		}
   465  		if pOk && p < s.Len() {
   466  			fmt.Fprintln(fs, "...")
   467  		} else {
   468  			fmt.Fprintln(fs)
   469  		}
   470  		if fs.Flag('+') {
   471  			r.formatDescLineTo(fs, '+')
   472  		} else {
   473  			fmt.Fprintln(fs, "+")
   474  		}
   475  		e := seq.DefaultQphred.Encode(seq.DefaultEncoding)
   476  		if e >= unicode.MaxASCII {
   477  			e = unicode.MaxASCII - 1
   478  		}
   479  		for range buf {
   480  			fmt.Fprintf(fs, "%c", e)
   481  		}
   482  		if pOk && p < s.Len() {
   483  			fmt.Fprint(fs, "...")
   484  		}
   485  	default:
   486  		fmt.Fprintf(fs, "%%!%c(alignment.Row=%.10s)", c, s)
   487  	}
   488  }
   489  
   490  func (r Row) formatDescLineTo(fs fmt.State, p rune) {
   491  	fmt.Fprintf(fs, "%c%s", p, r.Name())
   492  	if d := r.Description(); d != "" {
   493  		fmt.Fprintf(fs, " %s", d)
   494  	}
   495  	fmt.Fprintln(fs)
   496  }
   497  
   498  // SetSlice unconditionally panics.
   499  func (r Row) SetSlice(_ alphabet.Slice) { panic("alignment: cannot alter row slice") }
   500  
   501  // Slice unconditionally panics.
   502  func (r Row) Slice() alphabet.Slice { panic("alignment: cannot get row slice") }