github.com/biogo/biogo@v1.0.4/io/seqio/fastq/fastq.go (about)

     1  // Copyright ©2011-2013 The bíogo Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // Package fastq provides types to read and write FASTQ format files.
     6  package fastq
     7  
     8  import (
     9  	"github.com/biogo/biogo/alphabet"
    10  	"github.com/biogo/biogo/io/seqio"
    11  	"github.com/biogo/biogo/seq"
    12  
    13  	"bufio"
    14  	"bytes"
    15  	"errors"
    16  	"fmt"
    17  	"io"
    18  )
    19  
    20  var (
    21  	_ seqio.Reader = (*Reader)(nil)
    22  	_ seqio.Writer = (*Writer)(nil)
    23  )
    24  
    25  type Encoder interface {
    26  	Encoding() alphabet.Encoding
    27  }
    28  
    29  // Fastq sequence format reader type.
    30  type Reader struct {
    31  	r   *bufio.Reader
    32  	t   seqio.SequenceAppender
    33  	enc alphabet.Encoding
    34  }
    35  
    36  // Returns a new fastq format reader using r. Sequences returned by the Reader are copied
    37  // from the provided template.
    38  func NewReader(r io.Reader, template seqio.SequenceAppender) *Reader {
    39  	var enc alphabet.Encoding
    40  	if e, ok := template.(Encoder); ok {
    41  		enc = e.Encoding()
    42  	} else {
    43  		enc = alphabet.None
    44  	}
    45  
    46  	return &Reader{
    47  		r:   bufio.NewReader(r),
    48  		t:   template,
    49  		enc: enc,
    50  	}
    51  }
    52  
    53  // Read a single sequence and return it  and potentially an error. Note that
    54  // a non-nil returned error may be associated with a valid sequence, so it is
    55  // the responsibility of the caller to examine the error to determine whether
    56  // the read was successful.
    57  // Note that if the Reader's template type returns different non-nil error
    58  // values from calls to SetName and SetDescription, a new error string will be
    59  // returned on each call to Read. So to allow direct error comparison these
    60  // methods should return the same error.
    61  // TODO: Does not read multi-line fastq.
    62  func (r *Reader) Read() (seq.Sequence, error) {
    63  	const (
    64  		id1 = iota
    65  		letters
    66  		id2
    67  		quality
    68  	)
    69  
    70  	var (
    71  		buff, line, label []byte
    72  		isPrefix          bool
    73  
    74  		seqBuff []alphabet.QLetter
    75  		t       seqio.SequenceAppender
    76  
    77  		state int
    78  		err   error
    79  	)
    80  
    81  loop:
    82  	for {
    83  		buff, isPrefix, err = r.r.ReadLine()
    84  		if err != nil {
    85  			if t != nil && state == quality && err == io.EOF {
    86  				err = nil
    87  				break
    88  			}
    89  			return nil, err
    90  		}
    91  		line = append(line, buff...)
    92  		if isPrefix {
    93  			continue
    94  		}
    95  
    96  		line = bytes.TrimSpace(line)
    97  		switch {
    98  		case state == id1 && maybeID1(line):
    99  			state = letters
   100  			var _err error
   101  			t, _err = r.readHeader(line)
   102  			if err == nil && _err != nil {
   103  				err = _err
   104  			}
   105  			label = append([]byte(nil), line...)
   106  		case state == id2 && maybeID2(line):
   107  			state = quality
   108  			if len(label) == 0 {
   109  				return nil, errors.New("fastq: no header line parsed before +line in fastq format")
   110  			}
   111  			if len(line) != 1 && bytes.Compare(label[1:], line[1:]) != 0 {
   112  				return nil, errors.New("fastq: quality header does not match sequence header")
   113  			}
   114  		case state == letters && len(line) > 0:
   115  			if maybeID2(line) && (len(line) == 1 || bytes.Compare(label[1:], line[1:]) == 0) {
   116  				state = quality
   117  				break
   118  			}
   119  			state = id2
   120  			seqBuff = make([]alphabet.QLetter, len(line))
   121  			var i int
   122  			for _, l := range line {
   123  				if isSpace(l) {
   124  					continue
   125  				}
   126  				seqBuff[i].L = alphabet.Letter(l)
   127  				i++
   128  			}
   129  			seqBuff = seqBuff[:i]
   130  		case state == quality:
   131  			if len(line) == 0 && len(seqBuff) != 0 {
   132  				continue
   133  			}
   134  			break loop
   135  		}
   136  		line = line[:0]
   137  	}
   138  
   139  	line = bytes.Join(bytes.Fields(line), nil)
   140  	if len(line) != len(seqBuff) {
   141  		return nil, errors.New("fastq: sequence/quality length mismatch")
   142  	}
   143  	for i := range line {
   144  		seqBuff[i].Q = r.enc.DecodeToQphred(line[i])
   145  	}
   146  	t.AppendQLetters(seqBuff...)
   147  
   148  	return t, err
   149  }
   150  
   151  func maybeID1(l []byte) bool { return len(l) > 0 && l[0] == '@' }
   152  func maybeID2(l []byte) bool { return len(l) > 0 && l[0] == '+' }
   153  func isSpace(b byte) bool {
   154  	switch b {
   155  	case '\t', '\n', '\v', '\f', '\r', ' ', 0x85, 0xA0:
   156  		return true
   157  	}
   158  	return false
   159  }
   160  
   161  func (r *Reader) readHeader(line []byte) (seqio.SequenceAppender, error) {
   162  	s := r.t.Clone().(seqio.SequenceAppender)
   163  	fieldMark := bytes.IndexAny(line, " \t")
   164  	var err error
   165  	if fieldMark < 0 {
   166  		err = s.SetName(string(line[1:]))
   167  		return s, err
   168  	} else {
   169  		err = s.SetName(string(line[1:fieldMark]))
   170  		_err := s.SetDescription(string(line[fieldMark+1:]))
   171  		if err != nil || _err != nil {
   172  			switch {
   173  			case err == _err:
   174  				return s, err
   175  			case err != nil && _err != nil:
   176  				return s, fmt.Errorf("fastq: multiple errors: name: %s, desc:%s", err, _err)
   177  			case err != nil:
   178  				return s, err
   179  			case _err != nil:
   180  				return s, _err
   181  			}
   182  		}
   183  	}
   184  
   185  	return s, nil
   186  }
   187  
   188  // Fastq sequence format writer type.
   189  type Writer struct {
   190  	w   io.Writer
   191  	QID bool // Include ID on +lines
   192  }
   193  
   194  // Returns a new fastq format writer using w.
   195  func NewWriter(w io.Writer) *Writer {
   196  	return &Writer{
   197  		w: w,
   198  	}
   199  }
   200  
   201  // Write a single sequence and return the number of bytes written and any error.
   202  func (w *Writer) Write(s seq.Sequence) (n int, err error) {
   203  	var (
   204  		_n  int
   205  		enc alphabet.Encoding
   206  	)
   207  	if e, ok := s.(Encoder); ok {
   208  		enc = e.Encoding()
   209  	} else {
   210  		enc = alphabet.Sanger
   211  	}
   212  
   213  	n, err = w.writeHeader('@', s)
   214  	if err != nil {
   215  		return
   216  	}
   217  	for i := 0; i < s.Len(); i++ {
   218  		_n, err = w.w.Write([]byte{byte(s.At(i).L)})
   219  		if n += _n; err != nil {
   220  			return
   221  		}
   222  	}
   223  	_n, err = w.w.Write([]byte{'\n'})
   224  	if n += _n; err != nil {
   225  		return
   226  	}
   227  	if w.QID {
   228  		_n, err = w.writeHeader('+', s)
   229  		if n += _n; err != nil {
   230  			return
   231  		}
   232  	} else {
   233  		_n, err = w.w.Write([]byte("+\n"))
   234  		if n += _n; err != nil {
   235  			return
   236  		}
   237  	}
   238  	for i := 0; i < s.Len(); i++ {
   239  		_n, err = w.w.Write([]byte{s.At(i).Q.Encode(enc)})
   240  		if n += _n; err != nil {
   241  			return
   242  		}
   243  	}
   244  	_n, err = w.w.Write([]byte{'\n'})
   245  	if n += _n; err != nil {
   246  		return
   247  	}
   248  
   249  	return
   250  }
   251  
   252  func (w *Writer) writeHeader(prefix byte, s seq.Sequence) (n int, err error) {
   253  	var _n int
   254  	n, err = w.w.Write([]byte{prefix})
   255  	if err != nil {
   256  		return
   257  	}
   258  	_n, err = io.WriteString(w.w, s.Name())
   259  	if n += _n; err != nil {
   260  		return
   261  	}
   262  	if desc := s.Description(); len(desc) != 0 {
   263  		_n, err = w.w.Write([]byte{' '})
   264  		if n += _n; err != nil {
   265  			return
   266  		}
   267  		_n, err = io.WriteString(w.w, desc)
   268  		if n += _n; err != nil {
   269  			return
   270  		}
   271  	}
   272  	_n, err = w.w.Write([]byte("\n"))
   273  	n += _n
   274  	return
   275  }