github.com/biogo/biogo@v1.0.4/seq/alignment/qalignment.go (about)

     1  // Copyright ©2011-2013 The bíogo Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package alignment
     6  
     7  import (
     8  	"github.com/biogo/biogo/alphabet"
     9  	"github.com/biogo/biogo/feat"
    10  	"github.com/biogo/biogo/seq"
    11  	"github.com/biogo/biogo/seq/linear"
    12  	"github.com/biogo/biogo/util"
    13  
    14  	"errors"
    15  	"fmt"
    16  	"strings"
    17  	"unicode"
    18  )
    19  
    20  // A QSeq is an aligned sequence with quality scores.
    21  type QSeq struct {
    22  	seq.Annotation
    23  	SubAnnotations []seq.Annotation
    24  	Seq            alphabet.QColumns
    25  	ColumnConsense seq.ConsenseFunc
    26  	Threshold      alphabet.Qphred // Threshold for returning valid letter.
    27  	QFilter        seq.QFilter     // How to represent below threshold letter.
    28  	Encode         alphabet.Encoding
    29  }
    30  
    31  // NewSeq creates a new Seq with the given id, letter sequence and alphabet.
    32  func NewQSeq(id string, subids []string, ql [][]alphabet.QLetter, alpha alphabet.Alphabet, enc alphabet.Encoding, cons seq.ConsenseFunc) (*QSeq, error) {
    33  	var (
    34  		lids, lseq = len(subids), len(ql)
    35  		subann     []seq.Annotation
    36  	)
    37  	switch {
    38  	case lids == 0 && len(ql) == 0:
    39  	case lseq != 0 && lids == len(ql[0]):
    40  		if lids == 0 {
    41  			subann = make([]seq.Annotation, len(ql[0]))
    42  			for i := range subids {
    43  				subann[i].ID = fmt.Sprintf("%s:%d", id, i)
    44  			}
    45  		} else {
    46  			subann = make([]seq.Annotation, lids)
    47  			for i, sid := range subids {
    48  				subann[i].ID = sid
    49  			}
    50  		}
    51  	default:
    52  		return nil, errors.New("alignment: id/seq number mismatch")
    53  	}
    54  
    55  	return &QSeq{
    56  		Annotation: seq.Annotation{
    57  			ID:    id,
    58  			Alpha: alpha,
    59  		},
    60  		SubAnnotations: subann,
    61  		Seq:            append([][]alphabet.QLetter(nil), ql...),
    62  		Encode:         enc,
    63  		ColumnConsense: cons,
    64  		Threshold:      2,
    65  		QFilter:        seq.AmbigFilter,
    66  	}, nil
    67  }
    68  
    69  // Interface guarantees
    70  var (
    71  	_ feat.Feature = (*QSeq)(nil)
    72  	_ feat.Feature = QRow{}
    73  	_ seq.Sequence = QRow{}
    74  	_ seq.Scorer   = QRow{}
    75  )
    76  
    77  // Slice returns the sequence data as a alphabet.Slice.
    78  func (s *QSeq) Slice() alphabet.Slice { return s.Seq }
    79  
    80  // SetSlice sets the sequence data represented by the Seq. SetSlice will panic if sl
    81  // is not a QColumns.
    82  func (s *QSeq) SetSlice(sl alphabet.Slice) { s.Seq = sl.(alphabet.QColumns) }
    83  
    84  // Encoding returns the quality encoding scheme.
    85  func (s *QSeq) Encoding() alphabet.Encoding { return s.Encode }
    86  
    87  // SetEncoding sets the quality encoding scheme to e.
    88  func (s *QSeq) SetEncoding(e alphabet.Encoding) error { s.Encode = e; return nil }
    89  
    90  // Len returns the length of the alignment.
    91  func (s *QSeq) Len() int { return len(s.Seq) }
    92  
    93  // Rows returns the number of rows in the alignment.
    94  func (s *QSeq) Rows() int { return s.Seq.Rows() }
    95  
    96  // Start returns the start position of the sequence in coordinates relative to the
    97  // sequence location.
    98  func (s *QSeq) Start() int { return s.Offset }
    99  
   100  // End returns the end position of the sequence in coordinates relative to the
   101  // sequence location.
   102  func (s *QSeq) End() int { return s.Offset + s.Len() }
   103  
   104  // Clone returns a copy of the sequence.
   105  func (s *QSeq) Clone() seq.Rower {
   106  	c := *s
   107  	c.Seq = make([][]alphabet.QLetter, len(s.Seq))
   108  	for i, s := range s.Seq {
   109  		c.Seq[i] = append([]alphabet.QLetter(nil), s...)
   110  	}
   111  
   112  	return &c
   113  }
   114  
   115  // New returns an empty *QSeq sequence with the same alphabet.
   116  func (s *QSeq) New() *QSeq {
   117  	return &QSeq{Annotation: seq.Annotation{Alpha: s.Alpha}}
   118  }
   119  
   120  // RevComp reverse complements the sequence. RevComp will panic if the alphabet used by
   121  // the receiver is not a Complementor.
   122  func (s *QSeq) RevComp() {
   123  	rs, comp := s.Seq, s.Alpha.(alphabet.Complementor).ComplementTable()
   124  	i, j := 0, len(rs)-1
   125  	for ; i < j; i, j = i+1, j-1 {
   126  		for r := range rs[i] {
   127  			rs[i][r].L, rs[j][r].L = comp[rs[j][r].L], comp[rs[i][r].L]
   128  			rs[i][r].Q, rs[j][r].Q = rs[j][r].Q, rs[i][r].Q
   129  		}
   130  	}
   131  	if i == j {
   132  		for r := range rs[i] {
   133  			rs[i][r].L = comp[rs[i][r].L]
   134  		}
   135  	}
   136  	s.Strand = -s.Strand
   137  }
   138  
   139  // Reverse reverses the order of letters in the the sequence without complementing them.
   140  func (s *QSeq) Reverse() {
   141  	l := s.Seq
   142  	for i, j := 0, len(l)-1; i < j; i, j = i+1, j-1 {
   143  		l[i], l[j] = l[j], l[i]
   144  	}
   145  	s.Strand = seq.None
   146  }
   147  
   148  func (s *QSeq) String() string {
   149  	t := s.Consensus(false)
   150  	t.Threshold = s.Threshold
   151  	t.QFilter = s.QFilter
   152  	return t.String()
   153  }
   154  
   155  // Add sequences n to Alignment. Sequences in n must align start and end with the receiving alignment.
   156  // Additional sequence will be clipped.
   157  func (s *QSeq) Add(n ...seq.Sequence) error {
   158  	for i := s.Start(); i < s.End(); i++ {
   159  		s.Seq[i] = append(s.Seq[i], s.column(n, i)...)
   160  	}
   161  	for i := range n {
   162  		s.SubAnnotations = append(s.SubAnnotations, *n[i].CloneAnnotation())
   163  	}
   164  
   165  	return nil
   166  }
   167  
   168  func (s *QSeq) column(m []seq.Sequence, pos int) []alphabet.QLetter {
   169  	c := make([]alphabet.QLetter, 0, s.Rows())
   170  
   171  	for _, r := range m {
   172  		if a, ok := r.(seq.Aligned); ok {
   173  			if a.Start() <= pos && pos < a.End() {
   174  				c = append(c, a.ColumnQL(pos, true)...)
   175  			} else {
   176  				c = append(c, alphabet.QLetter{L: s.Alpha.Gap()}.Repeat(a.Rows())...)
   177  			}
   178  		} else {
   179  			if r.Start() <= pos && pos < r.End() {
   180  				c = append(c, r.At(pos))
   181  			} else {
   182  				c = append(c, alphabet.QLetter{L: s.Alpha.Gap()})
   183  			}
   184  		}
   185  	}
   186  
   187  	return c
   188  }
   189  
   190  // Delete removes the sequence represented at row i of the alignment. It panics if i is out of range.
   191  func (s *QSeq) Delete(i int) {
   192  	if i >= s.Rows() {
   193  		panic("alignment: index out of range")
   194  	}
   195  	cs := s.Seq
   196  	for j, c := range cs {
   197  		cs[j] = c[:i+copy(c[i:], c[i+1:])]
   198  	}
   199  	sa := s.SubAnnotations
   200  	s.SubAnnotations = sa[:i+copy(sa[i:], sa[i+1:])]
   201  }
   202  
   203  // Row returns the sequence represented at row i of the alignment. It panics is i is out of range.
   204  func (s *QSeq) Row(i int) seq.Sequence {
   205  	if i < 0 || i >= s.Rows() {
   206  		panic("alignment: index out of range")
   207  	}
   208  	return QRow{Align: s, Row: i}
   209  }
   210  
   211  // AppendColumns appends each Qletter of each element of a to the appropriate sequence in the receiver.
   212  func (s *QSeq) AppendColumns(a ...[]alphabet.QLetter) error {
   213  	for i, c := range a {
   214  		if len(c) != s.Rows() {
   215  			return fmt.Errorf("alignment: column %d does not match Rows(): %d != %d.", i, len(c), s.Rows())
   216  		}
   217  	}
   218  
   219  	s.Seq = append(s.Seq, a...)
   220  
   221  	return nil
   222  }
   223  
   224  // AppendEach appends each []alphabet.QLetter in a to the appropriate sequence in the receiver.
   225  func (s *QSeq) AppendEach(a [][]alphabet.QLetter) error {
   226  	if len(a) != s.Rows() {
   227  		return fmt.Errorf("alignment: number of sequences does not match Rows(): %d != %d.", len(a), s.Rows())
   228  	}
   229  	max := util.MinInt
   230  	for _, r := range a {
   231  		if l := len(r); l > max {
   232  			max = l
   233  		}
   234  	}
   235  	s.Seq = append(s.Seq, make([][]alphabet.QLetter, max)...)[:len(s.Seq)]
   236  	for i, b := 0, make([]alphabet.QLetter, 0, len(a)); i < max; i, b = i+1, b[:0] {
   237  		for _, r := range a {
   238  			if i < len(r) {
   239  				b = append(b, r[i])
   240  			} else {
   241  				b = append(b, alphabet.QLetter{L: s.Alpha.Gap()})
   242  			}
   243  		}
   244  		s.AppendColumns(b)
   245  	}
   246  
   247  	return nil
   248  }
   249  
   250  // Column returns a slice of letters reflecting the column at pos.
   251  func (s *QSeq) Column(pos int, _ bool) []alphabet.Letter {
   252  	c := make([]alphabet.Letter, s.Rows())
   253  	for i, l := range s.Seq[pos] {
   254  		if l.Q >= s.Threshold {
   255  			c[i] = l.L
   256  		} else {
   257  			c[i] = s.QFilter(s.Alpha, 255, alphabet.QLetter{})
   258  		}
   259  	}
   260  
   261  	return c
   262  }
   263  
   264  // ColumnQL returns a slice of quality letters reflecting the column at pos.
   265  func (s *QSeq) ColumnQL(pos int, _ bool) []alphabet.QLetter { return s.Seq[pos] }
   266  
   267  // Consensus returns a quality sequence reflecting the consensus of the receiver determined by the
   268  // ColumnConsense field.
   269  func (s *QSeq) Consensus(_ bool) *linear.QSeq {
   270  	cs := make([]alphabet.QLetter, 0, s.Len())
   271  	alpha := s.Alphabet()
   272  	for i := range s.Seq {
   273  		cs = append(cs, s.ColumnConsense(s, alpha, i, false))
   274  	}
   275  
   276  	qs := linear.NewQSeq("Consensus:"+s.ID, cs, s.Alpha, alphabet.Sanger)
   277  	qs.Strand = s.Strand
   278  	qs.SetOffset(s.Offset)
   279  	qs.Conform = s.Conform
   280  
   281  	return qs
   282  }
   283  
   284  // Format is a support routine for fmt.Formatter. It accepts the formats 'v' and 's'
   285  // (string), 'a' (fasta) and 'q' (fastq). String, fasta and fastq formats support
   286  // truncated output via the verb's precision. Fasta format supports sequence line
   287  // specification via the verb's width field. Fastq format supports optional inclusion
   288  // of the '+' line descriptor line with the '+' flag. The 'v' verb supports the '#'
   289  // flag for Go syntax output. The 's' and 'v' formats support the '-' flag for
   290  // omission of the sequence name.
   291  func (s *QSeq) Format(fs fmt.State, c rune) {
   292  	if s == nil {
   293  		fmt.Fprint(fs, "<nil>")
   294  		return
   295  	}
   296  	switch c {
   297  	case 'v':
   298  		if fs.Flag('#') {
   299  			fmt.Fprintf(fs, "&%#v", *s)
   300  			return
   301  		}
   302  		fallthrough
   303  	case 's', 'a', 'q':
   304  		r := QRow{Align: s}
   305  		for r.Row = 0; r.Row < s.Rows(); r.Row++ {
   306  			r.Format(fs, c)
   307  			if r.Row < s.Rows()-1 {
   308  				fmt.Fprintln(fs)
   309  			}
   310  		}
   311  	default:
   312  		fmt.Fprintf(fs, "%%!%c(*alignment.QSeq=%.10s)", c, s)
   313  	}
   314  }
   315  
   316  // A Row is a pointer into an alignment that satisfies the seq.Sequence and seq.Scorer interfaces.
   317  type QRow struct {
   318  	Align *QSeq
   319  	Row   int
   320  }
   321  
   322  // At returns the letter at position i.
   323  func (r QRow) At(i int) alphabet.QLetter {
   324  	return r.Align.Seq[i-r.Align.Offset][r.Row]
   325  }
   326  
   327  // Set sets the letter at position i to l.
   328  func (r QRow) Set(i int, l alphabet.QLetter) error {
   329  	r.Align.Seq[i-r.Align.Offset][r.Row] = l
   330  	return nil
   331  }
   332  
   333  // Len returns the length of the alignment.
   334  func (r QRow) Len() int { return len(r.Align.Seq) }
   335  
   336  // Start returns the start position of the sequence in coordinates relative to the
   337  // sequence location.
   338  func (r QRow) Start() int { return r.Align.SubAnnotations[r.Row].Offset }
   339  
   340  // End returns the end position of the sequence in coordinates relative to the
   341  // sequence location.
   342  func (r QRow) End() int { return r.Start() + r.Len() }
   343  
   344  // Location returns the feature containing the row's sequence.
   345  func (r QRow) Location() feat.Feature { return r.Align.SubAnnotations[r.Row].Loc }
   346  
   347  // SetE sets the quality at position i to e to reflect the given p(Error).
   348  func (r QRow) SetE(i int, e float64) error {
   349  	r.Align.Seq[i-r.Align.Offset][r.Row].Q = alphabet.Ephred(e)
   350  	return nil
   351  }
   352  
   353  // QEncode encodes the quality at position i to a letter based on the sequence encoding setting.
   354  func (r QRow) QEncode(i int) byte {
   355  	return r.Align.Seq[i-r.Align.Offset][r.Row].Q.Encode(r.Encoding())
   356  }
   357  
   358  // EAt returns the probability of a sequence error at position i.
   359  func (r QRow) EAt(i int) float64 {
   360  	return r.Align.Seq[i-r.Align.Offset][r.Row].Q.ProbE()
   361  }
   362  
   363  func (r QRow) Alphabet() alphabet.Alphabet           { return r.Align.Alpha }
   364  func (r QRow) Encoding() alphabet.Encoding           { return r.Align.Encoding() }
   365  func (r QRow) SetEncoding(e alphabet.Encoding) error { return r.Align.SetEncoding(e) }
   366  func (r QRow) Conformation() feat.Conformation       { return r.Align.SubAnnotations[r.Row].Conform }
   367  func (r QRow) SetConformation(c feat.Conformation) error {
   368  	r.Align.SubAnnotations[r.Row].Conform = c
   369  	return nil
   370  }
   371  func (r QRow) Name() string          { return r.Align.SubAnnotations[r.Row].ID }
   372  func (r QRow) Description() string   { return r.Align.SubAnnotations[r.Row].Desc }
   373  func (r QRow) SetOffset(o int) error { r.Align.SubAnnotations[r.Row].Offset = o; return nil }
   374  
   375  func (r QRow) RevComp() {
   376  	rs, comp := r.Align.Seq, r.Alphabet().(alphabet.Complementor).ComplementTable()
   377  	i, j := 0, len(rs)-1
   378  	for ; i < j; i, j = i+1, j-1 {
   379  		rs[i][r.Row].L, rs[j][r.Row].L = comp[rs[j][r.Row].L], comp[rs[i][r.Row].L]
   380  		rs[i][r.Row].Q, rs[j][r.Row].Q = rs[j][r.Row].Q, rs[i][r.Row].Q
   381  	}
   382  	if i == j {
   383  		rs[i][r.Row].L = comp[rs[i][r.Row].L]
   384  	}
   385  	r.Align.SubAnnotations[r.Row].Strand = -r.Align.SubAnnotations[r.Row].Strand
   386  }
   387  func (r QRow) Reverse() {
   388  	l := r.Align.Seq
   389  	for i, j := 0, len(l)-1; i < j; i, j = i+1, j-1 {
   390  		l[i][r.Row], l[j][r.Row] = l[j][r.Row], l[i][r.Row]
   391  	}
   392  	r.Align.SubAnnotations[r.Row].Strand = seq.None
   393  }
   394  func (r QRow) New() seq.Sequence {
   395  	return QRow{Align: &QSeq{Annotation: seq.Annotation{Alpha: r.Align.Alpha}}}
   396  }
   397  func (r QRow) Clone() seq.Sequence {
   398  	b := make([]alphabet.QLetter, r.Len())
   399  	for i, c := range r.Align.Seq {
   400  		b[i] = c[r.Row]
   401  	}
   402  	return linear.NewQSeq(r.Name(), b, r.Alphabet(), r.Align.Encoding())
   403  }
   404  func (r QRow) CloneAnnotation() *seq.Annotation {
   405  	return r.Align.SubAnnotations[r.Row].CloneAnnotation()
   406  }
   407  
   408  func min(a, b int) int {
   409  	if a < b {
   410  		return a
   411  	}
   412  	return b
   413  }
   414  
   415  // String returns a string representation of the sequence data only.
   416  func (r QRow) String() string { return fmt.Sprintf("%-s", r) }
   417  
   418  // Format is a support routine for fmt.Formatter. It accepts the formats 'v' and 's'
   419  // (string), 'a' (fasta) and 'q' (fastq). String, fasta and fastq formats support
   420  // truncated output via the verb's precision. Fasta format supports sequence line
   421  // specification via the verb's width field. Fastq format supports optional inclusion
   422  // of the '+' line descriptor line with the '+' flag. The 'v' verb supports the '#'
   423  // flag for Go syntax output. The 's' and 'v' formats support the '-' flag for
   424  // omission of the sequence name.
   425  func (r QRow) Format(fs fmt.State, c rune) {
   426  	var (
   427  		s      = r.Align
   428  		w, wOk = fs.Width()
   429  		p, pOk = fs.Precision()
   430  		buf    alphabet.QColumns
   431  	)
   432  	if s != nil {
   433  		if pOk {
   434  			buf = s.Seq[:min(p, len(s.Seq))]
   435  		} else {
   436  			buf = s.Seq
   437  		}
   438  	}
   439  
   440  	switch c {
   441  	case 'v':
   442  		if fs.Flag('#') {
   443  			type shadowQRow QRow
   444  			sr := fmt.Sprintf("%#v", shadowQRow(r))
   445  			fmt.Fprintf(fs, "%T%s", r, sr[strings.Index(sr, "{"):])
   446  			return
   447  		}
   448  		fallthrough
   449  	case 's':
   450  		if s == nil {
   451  			fmt.Fprint(fs, "<nil>")
   452  			return
   453  		}
   454  		if !fs.Flag('-') {
   455  			fmt.Fprintf(fs, "%q ", r.Name())
   456  		}
   457  		for _, qc := range buf {
   458  			fmt.Fprintf(fs, "%c", s.QFilter(s.Alpha, s.Threshold, qc[r.Row]))
   459  		}
   460  		if pOk && p < s.Len() {
   461  			fmt.Fprint(fs, "...")
   462  		}
   463  	case 'a':
   464  		if s == nil {
   465  			return
   466  		}
   467  		r.formatDescLineTo(fs, '>')
   468  		for i, qc := range buf {
   469  			fmt.Fprintf(fs, "%c", s.QFilter(s.Alpha, s.Threshold, qc[r.Row]))
   470  			if wOk && i < s.Len()-1 && i%w == w-1 {
   471  				fmt.Fprintln(fs)
   472  			}
   473  		}
   474  		if pOk && p < s.Len() {
   475  			fmt.Fprint(fs, "...")
   476  		}
   477  	case 'q':
   478  		if s == nil {
   479  			return
   480  		}
   481  		r.formatDescLineTo(fs, '@')
   482  		for _, qc := range buf {
   483  			fmt.Fprintf(fs, "%c", qc[r.Row].L)
   484  		}
   485  		if pOk && p < s.Len() {
   486  			fmt.Fprintln(fs, "...")
   487  		} else {
   488  			fmt.Fprintln(fs)
   489  		}
   490  		if fs.Flag('+') {
   491  			r.formatDescLineTo(fs, '+')
   492  		} else {
   493  			fmt.Fprintln(fs, "+")
   494  		}
   495  		for _, qc := range buf {
   496  			e := qc[r.Row].Q.Encode(s.Encode)
   497  			if e >= unicode.MaxASCII {
   498  				e = unicode.MaxASCII - 1
   499  			}
   500  			fmt.Fprintf(fs, "%c", e)
   501  		}
   502  		if pOk && p < s.Len() {
   503  			fmt.Fprint(fs, "...")
   504  		}
   505  	default:
   506  		fmt.Fprintf(fs, "%%!%c(alignment.QRow=%.10s)", c, s)
   507  	}
   508  }
   509  
   510  func (r QRow) formatDescLineTo(fs fmt.State, p rune) {
   511  	fmt.Fprintf(fs, "%c%s", p, r.Name())
   512  	if d := r.Description(); d != "" {
   513  		fmt.Fprintf(fs, " %s", d)
   514  	}
   515  	fmt.Fprintln(fs)
   516  }
   517  
   518  // SetSlice unconditionally panics.
   519  func (r QRow) SetSlice(_ alphabet.Slice) { panic("alignment: cannot alter row slice") }
   520  
   521  // Slice unconditionally panics.
   522  func (r QRow) Slice() alphabet.Slice { panic("alignment: cannot get row slice") }