github.com/biogo/biogo@v1.0.4/io/seqio/fastq/fastq.go (about) 1 // Copyright ©2011-2013 The bíogo Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 // Package fastq provides types to read and write FASTQ format files. 6 package fastq 7 8 import ( 9 "github.com/biogo/biogo/alphabet" 10 "github.com/biogo/biogo/io/seqio" 11 "github.com/biogo/biogo/seq" 12 13 "bufio" 14 "bytes" 15 "errors" 16 "fmt" 17 "io" 18 ) 19 20 var ( 21 _ seqio.Reader = (*Reader)(nil) 22 _ seqio.Writer = (*Writer)(nil) 23 ) 24 25 type Encoder interface { 26 Encoding() alphabet.Encoding 27 } 28 29 // Fastq sequence format reader type. 30 type Reader struct { 31 r *bufio.Reader 32 t seqio.SequenceAppender 33 enc alphabet.Encoding 34 } 35 36 // Returns a new fastq format reader using r. Sequences returned by the Reader are copied 37 // from the provided template. 38 func NewReader(r io.Reader, template seqio.SequenceAppender) *Reader { 39 var enc alphabet.Encoding 40 if e, ok := template.(Encoder); ok { 41 enc = e.Encoding() 42 } else { 43 enc = alphabet.None 44 } 45 46 return &Reader{ 47 r: bufio.NewReader(r), 48 t: template, 49 enc: enc, 50 } 51 } 52 53 // Read a single sequence and return it and potentially an error. Note that 54 // a non-nil returned error may be associated with a valid sequence, so it is 55 // the responsibility of the caller to examine the error to determine whether 56 // the read was successful. 57 // Note that if the Reader's template type returns different non-nil error 58 // values from calls to SetName and SetDescription, a new error string will be 59 // returned on each call to Read. So to allow direct error comparison these 60 // methods should return the same error. 61 // TODO: Does not read multi-line fastq. 62 func (r *Reader) Read() (seq.Sequence, error) { 63 const ( 64 id1 = iota 65 letters 66 id2 67 quality 68 ) 69 70 var ( 71 buff, line, label []byte 72 isPrefix bool 73 74 seqBuff []alphabet.QLetter 75 t seqio.SequenceAppender 76 77 state int 78 err error 79 ) 80 81 loop: 82 for { 83 buff, isPrefix, err = r.r.ReadLine() 84 if err != nil { 85 if t != nil && state == quality && err == io.EOF { 86 err = nil 87 break 88 } 89 return nil, err 90 } 91 line = append(line, buff...) 92 if isPrefix { 93 continue 94 } 95 96 line = bytes.TrimSpace(line) 97 switch { 98 case state == id1 && maybeID1(line): 99 state = letters 100 var _err error 101 t, _err = r.readHeader(line) 102 if err == nil && _err != nil { 103 err = _err 104 } 105 label = append([]byte(nil), line...) 106 case state == id2 && maybeID2(line): 107 state = quality 108 if len(label) == 0 { 109 return nil, errors.New("fastq: no header line parsed before +line in fastq format") 110 } 111 if len(line) != 1 && bytes.Compare(label[1:], line[1:]) != 0 { 112 return nil, errors.New("fastq: quality header does not match sequence header") 113 } 114 case state == letters && len(line) > 0: 115 if maybeID2(line) && (len(line) == 1 || bytes.Compare(label[1:], line[1:]) == 0) { 116 state = quality 117 break 118 } 119 state = id2 120 seqBuff = make([]alphabet.QLetter, len(line)) 121 var i int 122 for _, l := range line { 123 if isSpace(l) { 124 continue 125 } 126 seqBuff[i].L = alphabet.Letter(l) 127 i++ 128 } 129 seqBuff = seqBuff[:i] 130 case state == quality: 131 if len(line) == 0 && len(seqBuff) != 0 { 132 continue 133 } 134 break loop 135 } 136 line = line[:0] 137 } 138 139 line = bytes.Join(bytes.Fields(line), nil) 140 if len(line) != len(seqBuff) { 141 return nil, errors.New("fastq: sequence/quality length mismatch") 142 } 143 for i := range line { 144 seqBuff[i].Q = r.enc.DecodeToQphred(line[i]) 145 } 146 t.AppendQLetters(seqBuff...) 147 148 return t, err 149 } 150 151 func maybeID1(l []byte) bool { return len(l) > 0 && l[0] == '@' } 152 func maybeID2(l []byte) bool { return len(l) > 0 && l[0] == '+' } 153 func isSpace(b byte) bool { 154 switch b { 155 case '\t', '\n', '\v', '\f', '\r', ' ', 0x85, 0xA0: 156 return true 157 } 158 return false 159 } 160 161 func (r *Reader) readHeader(line []byte) (seqio.SequenceAppender, error) { 162 s := r.t.Clone().(seqio.SequenceAppender) 163 fieldMark := bytes.IndexAny(line, " \t") 164 var err error 165 if fieldMark < 0 { 166 err = s.SetName(string(line[1:])) 167 return s, err 168 } else { 169 err = s.SetName(string(line[1:fieldMark])) 170 _err := s.SetDescription(string(line[fieldMark+1:])) 171 if err != nil || _err != nil { 172 switch { 173 case err == _err: 174 return s, err 175 case err != nil && _err != nil: 176 return s, fmt.Errorf("fastq: multiple errors: name: %s, desc:%s", err, _err) 177 case err != nil: 178 return s, err 179 case _err != nil: 180 return s, _err 181 } 182 } 183 } 184 185 return s, nil 186 } 187 188 // Fastq sequence format writer type. 189 type Writer struct { 190 w io.Writer 191 QID bool // Include ID on +lines 192 } 193 194 // Returns a new fastq format writer using w. 195 func NewWriter(w io.Writer) *Writer { 196 return &Writer{ 197 w: w, 198 } 199 } 200 201 // Write a single sequence and return the number of bytes written and any error. 202 func (w *Writer) Write(s seq.Sequence) (n int, err error) { 203 var ( 204 _n int 205 enc alphabet.Encoding 206 ) 207 if e, ok := s.(Encoder); ok { 208 enc = e.Encoding() 209 } else { 210 enc = alphabet.Sanger 211 } 212 213 n, err = w.writeHeader('@', s) 214 if err != nil { 215 return 216 } 217 for i := 0; i < s.Len(); i++ { 218 _n, err = w.w.Write([]byte{byte(s.At(i).L)}) 219 if n += _n; err != nil { 220 return 221 } 222 } 223 _n, err = w.w.Write([]byte{'\n'}) 224 if n += _n; err != nil { 225 return 226 } 227 if w.QID { 228 _n, err = w.writeHeader('+', s) 229 if n += _n; err != nil { 230 return 231 } 232 } else { 233 _n, err = w.w.Write([]byte("+\n")) 234 if n += _n; err != nil { 235 return 236 } 237 } 238 for i := 0; i < s.Len(); i++ { 239 _n, err = w.w.Write([]byte{s.At(i).Q.Encode(enc)}) 240 if n += _n; err != nil { 241 return 242 } 243 } 244 _n, err = w.w.Write([]byte{'\n'}) 245 if n += _n; err != nil { 246 return 247 } 248 249 return 250 } 251 252 func (w *Writer) writeHeader(prefix byte, s seq.Sequence) (n int, err error) { 253 var _n int 254 n, err = w.w.Write([]byte{prefix}) 255 if err != nil { 256 return 257 } 258 _n, err = io.WriteString(w.w, s.Name()) 259 if n += _n; err != nil { 260 return 261 } 262 if desc := s.Description(); len(desc) != 0 { 263 _n, err = w.w.Write([]byte{' '}) 264 if n += _n; err != nil { 265 return 266 } 267 _n, err = io.WriteString(w.w, desc) 268 if n += _n; err != nil { 269 return 270 } 271 } 272 _n, err = w.w.Write([]byte("\n")) 273 n += _n 274 return 275 }