github.com/biogo/biogo@v1.0.4/io/featio/gff/gff.go (about) 1 // Copyright ©2011-2013 The bíogo Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 // Package gff provides types to read and write version 2 General Feature Format 6 // files according to the Sanger Institute specification. 7 // 8 // The specification can be found at http://www.sanger.ac.uk/resources/software/gff/spec.html. 9 package gff 10 11 import ( 12 "github.com/biogo/biogo/alphabet" 13 "github.com/biogo/biogo/feat" 14 "github.com/biogo/biogo/io/featio" 15 "github.com/biogo/biogo/io/seqio/fasta" 16 "github.com/biogo/biogo/seq" 17 "github.com/biogo/biogo/seq/linear" 18 19 "bufio" 20 "bytes" 21 "encoding/csv" 22 "fmt" 23 "io" 24 "math" 25 "runtime" 26 "strconv" 27 "time" 28 "unicode" 29 "unsafe" 30 ) 31 32 var ( 33 _ featio.Reader = (*Reader)(nil) 34 _ featio.Writer = (*Writer)(nil) 35 ) 36 37 // Version is the GFF version that is read and written. 38 const Version = 2 39 40 // "Astronomical" time format is the format specified in the GFF specification 41 const Astronomical = "2006-1-02" 42 43 type Error struct{ string } 44 45 func (e Error) Error() string { return e.string } 46 47 var ( 48 ErrBadFeature = Error{"gff: feature start not less than feature end"} 49 ErrBadStrandField = Error{"gff: bad strand field"} 50 ErrBadStrand = Error{"gff: invalid strand"} 51 ErrClosed = Error{"gff: writer closed"} 52 ErrBadTag = Error{"gff: invalid tag"} 53 ErrCannotHeader = Error{"gff: cannot write header: data written"} 54 ErrNotHandled = Error{"gff: type not handled"} 55 ErrFieldMissing = Error{"gff: missing fields"} 56 ErrBadMoltype = Error{"gff: invalid moltype"} 57 ErrEmptyMetaLine = Error{"gff: empty comment metaline"} 58 ErrBadMetaLine = Error{"gff: incomplete metaline"} 59 ErrBadSequence = Error{"gff: corrupt metasequence"} 60 ) 61 62 const ( 63 nameField = iota 64 sourceField 65 featureField 66 startField 67 endField 68 scoreField 69 strandField 70 frameField 71 attributeField 72 commentField 73 lastField 74 ) 75 76 // Frame holds feature frame information. 77 type Frame int8 78 79 func (f Frame) String() string { 80 if f <= NoFrame || f > Frame2 { 81 return "." 82 } 83 return [...]string{"0", "1", "2"}[f] 84 } 85 86 const ( 87 NoFrame Frame = iota - 1 88 Frame0 89 Frame1 90 Frame2 91 ) 92 93 // A Sequence is a feat.Feature 94 type Sequence struct { 95 SeqName string 96 Type feat.Moltype 97 } 98 99 func (s Sequence) Start() int { return 0 } 100 func (s Sequence) End() int { return 0 } 101 func (s Sequence) Len() int { return 0 } 102 func (s Sequence) Name() string { return string(s.SeqName) } 103 func (s Sequence) Description() string { return "GFF sequence" } 104 func (s Sequence) Location() feat.Feature { return nil } 105 func (s Sequence) MolType() feat.Moltype { return s.Type } 106 107 // A Region is a feat.Feature 108 type Region struct { 109 Sequence 110 RegionStart int 111 RegionEnd int 112 } 113 114 func (r *Region) Start() int { return r.RegionStart } 115 func (r *Region) End() int { return r.RegionEnd } 116 func (r *Region) Len() int { return r.RegionEnd - r.RegionStart } 117 func (r *Region) Description() string { return "GFF region" } 118 func (r *Region) Location() feat.Feature { return r.Sequence } 119 120 // An Attribute represents a GFF2 attribute field record. Attribute field records 121 // must have an tag value structure following the syntax used within objects in a 122 // .ace file, flattened onto one line by semicolon separators. 123 // Tags must be standard identifiers ([A-Za-z][A-Za-z0-9_]*). Free text values 124 // must be quoted with double quotes. 125 // 126 // Note: all non-printing characters in free text value strings (e.g. newlines, 127 // tabs, control characters, etc) must be explicitly represented by their C (UNIX) 128 // style backslash-escaped representation. 129 type Attribute struct { 130 Tag, Value string 131 } 132 133 type Attributes []Attribute 134 135 func (a Attributes) Get(tag string) string { 136 for _, tv := range a { 137 if tv.Tag == tag { 138 return tv.Value 139 } 140 } 141 return "" 142 } 143 144 func (a Attributes) Format(fs fmt.State, c rune) { 145 for i, tv := range a { 146 fmt.Fprintf(fs, "%s %s", tv.Tag, tv.Value) 147 if i < len(a)-1 { 148 fs.Write([]byte("; ")) 149 } 150 } 151 } 152 153 // A Feature represents a standard GFF2 feature. 154 type Feature struct { 155 // The name of the sequence. Having an explicit sequence name allows 156 // a feature file to be prepared for a data set of multiple sequences. 157 // Normally the seqname will be the identifier of the sequence in an 158 // accompanying fasta format file. An alternative is that SeqName is 159 // the identifier for a sequence in a public database, such as an 160 // EMBL/Genbank/DDBJ accession number. Which is the case, and which 161 // file or database to use, should be explained in accompanying 162 // information. 163 SeqName string 164 165 // The source of this feature. This field will normally be used to 166 // indicate the program making the prediction, or if it comes from 167 // public database annotation, or is experimentally verified, etc. 168 Source string 169 170 // The feature type name. 171 Feature string 172 173 // FeatStart must be less than FeatEnd and non-negative - GFF indexing 174 // is one-base and GFF features cannot have a zero length or a negative 175 // position. gff.Feature indexing is, to be consistent with the rest of 176 // the library zero-based half open. Translation between zero- and one- 177 // based indexing is handled by the gff package. 178 FeatStart, FeatEnd int 179 180 // A floating point value representing the score for the feature. A nil 181 // value indicates the score is not available. 182 FeatScore *float64 183 184 // The strand of the feature - one of seq.Plus, seq.Minus or seq.None. 185 // seq.None should be used when strand is not relevant, e.g. for 186 // dinucleotide repeats. This field should be set to seq.None for RNA 187 // and protein features. 188 FeatStrand seq.Strand 189 190 // FeatFrame indicates the frame of the feature. and takes the values 191 // Frame0, Frame1, Frame2 or NoFrame. Frame0 indicates that the 192 // specified region is in frame. Frame1 indicates that there is one 193 // extra base, and Frame2 means that the third base of the region 194 // is the first base of a codon. If the FeatStrand is seq.Minus, then 195 // the first base of the region is value of FeatEnd, because the 196 // corresponding coding region will run from FeatEnd to FeatStart on 197 // the reverse strand. As with FeatStrand, if the frame is not relevant 198 // then set FeatFrame to NoFram. This field should be set to seq.None 199 // for RNA and protein features. 200 FeatFrame Frame 201 202 // FeatAttributes represents a collection of GFF2 attributes. 203 FeatAttributes Attributes 204 205 // Free comments. 206 Comments string 207 } 208 209 func (g *Feature) Start() int { return g.FeatStart } 210 func (g *Feature) End() int { return g.FeatEnd } 211 func (g *Feature) Len() int { return g.FeatEnd - g.FeatStart } 212 func (g *Feature) Name() string { 213 return fmt.Sprintf("%s/%s:[%d,%d)", g.Feature, g.SeqName, g.FeatStart, g.FeatEnd) 214 } 215 func (g *Feature) Description() string { return fmt.Sprintf("%s/%s", g.Feature, g.Source) } 216 func (g *Feature) Location() feat.Feature { return Sequence{SeqName: g.SeqName} } 217 218 func handlePanic(f *feat.Feature, err *error) { 219 r := recover() 220 if r != nil { 221 e, ok := r.(error) 222 if !ok { 223 panic(r) 224 } 225 if _, ok = r.(runtime.Error); ok { 226 panic(r) 227 } 228 *err = e 229 *f = nil 230 } 231 } 232 233 // This function cannot be used to create strings that are expected to persist. 234 func unsafeString(b []byte) string { 235 return *(*string)(unsafe.Pointer(&b)) 236 } 237 238 func mustAtoi(f [][]byte, index, line int) int { 239 i, err := strconv.ParseInt(unsafeString(f[index]), 0, 0) 240 if err != nil { 241 panic(&csv.ParseError{Line: line, Column: index, Err: err}) 242 } 243 return int(i) 244 } 245 246 func mustAtofPtr(f [][]byte, index, line int) *float64 { 247 if len(f[index]) == 1 && f[index][0] == '.' { 248 return nil 249 } 250 i, err := strconv.ParseFloat(unsafeString(f[index]), 64) 251 if err != nil { 252 panic(&csv.ParseError{Line: line, Column: index, Err: err}) 253 } 254 return &i 255 } 256 257 func mustAtoFr(f [][]byte, index, line int) Frame { 258 if len(f[index]) == 1 && f[index][0] == '.' { 259 return NoFrame 260 } 261 b, err := strconv.ParseInt(unsafeString(f[index]), 0, 8) 262 if err != nil { 263 panic(&csv.ParseError{Line: line, Column: index, Err: err}) 264 } 265 return Frame(b) 266 } 267 268 var charToStrand = func() [256]seq.Strand { 269 var t [256]seq.Strand 270 for i := range t { 271 t[i] = 0x7f 272 } 273 t['+'] = seq.Plus 274 t['.'] = seq.None 275 t['-'] = seq.Minus 276 return t 277 }() 278 279 func mustAtos(f [][]byte, index, line int) seq.Strand { 280 if len(f[index]) != 1 { 281 panic(&csv.ParseError{Line: line, Column: index, Err: ErrBadStrandField}) 282 } 283 s := charToStrand[f[index][0]] 284 if s == 0x7f { 285 panic(&csv.ParseError{Line: line, Column: index, Err: ErrBadStrand}) 286 } 287 return s 288 } 289 290 var alphaNum = func() [256]bool { 291 var t [256]bool 292 for i := 'a'; i <= 'z'; i++ { 293 t[i] = true 294 } 295 for i := 'A'; i <= 'Z'; i++ { 296 t[i] = true 297 } 298 t['_'] = true 299 return t 300 }() 301 302 func splitAnnot(f []byte, column, line int) (tag, value []byte) { 303 var ( 304 i int 305 b byte 306 split bool 307 ) 308 for i, b = range f { 309 space := unicode.IsSpace(rune(b)) 310 if !split { 311 if !space && !alphaNum[b] { 312 panic(&csv.ParseError{Line: line, Column: column, Err: ErrBadTag}) 313 } 314 if space { 315 split = true 316 tag = f[:i] 317 } 318 } else if !space { 319 break 320 } 321 } 322 if !split { 323 return f, nil 324 } 325 return tag, f[i:] 326 } 327 328 func mustAtoa(f [][]byte, index, line int) []Attribute { 329 c := bytes.Split(f[index], []byte{';'}) 330 a := make([]Attribute, 0, len(c)) 331 for _, f := range c { 332 f = bytes.TrimSpace(f) 333 if len(f) == 0 { 334 continue 335 } 336 tag, value := splitAnnot(f, index, line) 337 if len(tag) == 0 { 338 panic(&csv.ParseError{Line: line, Column: index, Err: ErrBadTag}) 339 } else { 340 a = append(a, Attribute{Tag: string(tag), Value: string(value)}) 341 } 342 } 343 return a 344 } 345 346 type Metadata struct { 347 Name string 348 Date time.Time 349 Version int 350 SourceVersion string 351 Type feat.Moltype 352 } 353 354 // A Reader can parse GFFv2 formatted io.Reader and return feat.Features. 355 type Reader struct { 356 r *bufio.Reader 357 line int 358 359 TimeFormat string // Required for parsing date fields. Defaults to astronomical format. 360 361 Metadata 362 } 363 364 // NewReader returns a new GFFv2 format reader that reads from r. 365 func NewReader(r io.Reader) *Reader { 366 return &Reader{ 367 r: bufio.NewReader(r), 368 TimeFormat: Astronomical, 369 Metadata: Metadata{Type: feat.Undefined}, 370 } 371 } 372 373 func (r *Reader) commentMetaline(line []byte) (f feat.Feature, err error) { 374 fields := bytes.Split(line, []byte{' '}) 375 if len(fields) < 1 { 376 return nil, &csv.ParseError{Line: r.line, Err: ErrEmptyMetaLine} 377 } 378 switch unsafeString(fields[0]) { 379 case "gff-version": 380 v := mustAtoi(fields, 1, r.line) 381 if v > Version { 382 return nil, &csv.ParseError{Line: r.line, Err: ErrNotHandled} 383 } 384 r.Version = Version 385 return r.Read() 386 case "source-version": 387 if len(fields) <= 1 { 388 return nil, &csv.ParseError{Line: r.line, Err: ErrBadMetaLine} 389 } 390 r.SourceVersion = string(bytes.Join(fields[1:], []byte{' '})) 391 return r.Read() 392 case "date": 393 if len(fields) <= 1 { 394 return nil, &csv.ParseError{Line: r.line, Err: ErrBadMetaLine} 395 } 396 if len(r.TimeFormat) > 0 { 397 r.Date, err = time.Parse(r.TimeFormat, unsafeString(bytes.Join(fields[1:], []byte{' '}))) 398 if err != nil { 399 return nil, err 400 } 401 } 402 return r.Read() 403 case "Type", "type": 404 if len(fields) <= 1 { 405 return nil, &csv.ParseError{Line: r.line, Err: ErrBadMetaLine} 406 } 407 r.Type = feat.ParseMoltype(unsafeString(fields[1])) 408 if len(fields) > 2 { 409 r.Name = string(fields[2]) 410 } 411 return r.Read() 412 case "sequence-region": 413 if len(fields) <= 3 { 414 return nil, &csv.ParseError{Line: r.line, Err: ErrBadMetaLine} 415 } 416 return &Region{ 417 Sequence: Sequence{SeqName: string(fields[1]), Type: r.Type}, 418 RegionStart: feat.OneToZero(mustAtoi(fields, 2, r.line)), 419 RegionEnd: mustAtoi(fields, 3, r.line), 420 }, nil 421 case "DNA", "RNA", "Protein", "dna", "rna", "protein": 422 if len(fields) <= 1 { 423 return nil, &csv.ParseError{Line: r.line, Err: ErrBadMetaLine} 424 } 425 return r.metaSeq(fields[0], fields[1]) 426 default: 427 return nil, &csv.ParseError{Line: r.line, Err: ErrNotHandled} 428 } 429 } 430 431 func (r *Reader) metaSeq(moltype, id []byte) (seq.Sequence, error) { 432 var line, body []byte 433 434 var err error 435 for { 436 line, err = r.r.ReadBytes('\n') 437 if err != nil { 438 if err == io.EOF { 439 return nil, err 440 } 441 return nil, &csv.ParseError{Line: r.line, Err: err} 442 } 443 r.line++ 444 line = bytes.TrimSpace(line) 445 if len(line) == 0 { 446 continue 447 } 448 if len(line) < 2 || !bytes.HasPrefix(line, []byte("##")) { 449 return nil, &csv.ParseError{Line: r.line, Err: ErrBadSequence} 450 } 451 line = bytes.TrimSpace(line[2:]) 452 if unsafeString(line) == "end-"+unsafeString(moltype) { 453 break 454 } else { 455 line = bytes.Join(bytes.Fields(line), nil) 456 body = append(body, line...) 457 } 458 } 459 460 var alpha alphabet.Alphabet 461 switch feat.ParseMoltype(unsafeString(moltype)) { 462 case feat.DNA: 463 alpha = alphabet.DNA 464 case feat.RNA: 465 alpha = alphabet.RNA 466 case feat.Protein: 467 alpha = alphabet.Protein 468 default: 469 return nil, ErrBadMoltype 470 } 471 s := linear.NewSeq(string(id), alphabet.BytesToLetters(body), alpha) 472 473 return s, err 474 } 475 476 // Read reads a single feature or part and return it or an error. A call to read may 477 // have side effects on the Reader's Metadata field. 478 func (r *Reader) Read() (f feat.Feature, err error) { 479 defer handlePanic(&f, &err) 480 481 var line []byte 482 for { 483 line, err = r.r.ReadBytes('\n') 484 if err != nil { 485 if err == io.EOF { 486 return f, err 487 } 488 return nil, &csv.ParseError{Line: r.line, Err: err} 489 } 490 r.line++ 491 line = bytes.TrimSpace(line) 492 if len(line) == 0 { // ignore blank lines 493 continue 494 } else if bytes.HasPrefix(line, []byte("##")) { 495 f, err = r.commentMetaline(line[2:]) 496 return 497 } else if line[0] != '#' { // ignore comments 498 break 499 } 500 } 501 502 fields := bytes.SplitN(line, []byte{'\t'}, lastField) 503 if len(fields) < frameField { 504 return nil, &csv.ParseError{Line: r.line, Column: len(fields), Err: ErrFieldMissing} 505 } 506 507 gff := &Feature{ 508 SeqName: string(fields[nameField]), 509 Source: string(fields[sourceField]), 510 Feature: string(fields[featureField]), 511 FeatStart: feat.OneToZero(mustAtoi(fields, startField, r.line)), 512 FeatEnd: mustAtoi(fields, endField, r.line), 513 FeatScore: mustAtofPtr(fields, scoreField, r.line), 514 FeatStrand: mustAtos(fields, strandField, r.line), 515 FeatFrame: mustAtoFr(fields, frameField, r.line), 516 } 517 518 if len(fields) <= attributeField { 519 return gff, nil 520 } 521 gff.FeatAttributes = mustAtoa(fields, attributeField, r.line) 522 if len(fields) <= commentField { 523 return gff, nil 524 } 525 gff.Comments = string(fields[commentField]) 526 527 if gff.FeatStart >= gff.FeatEnd { 528 err = ErrBadFeature 529 } 530 return gff, nil 531 } 532 533 // A Writer outputs features and sequences into GFFv2 format. 534 type Writer struct { 535 w io.Writer 536 TimeFormat string 537 Precision int 538 Width int 539 header bool 540 } 541 542 // Returns a new GFF format writer using w. When header is true, 543 // a version header will be written to the GFF. 544 func NewWriter(w io.Writer, width int, header bool) *Writer { 545 gw := &Writer{ 546 w: w, 547 Width: width, 548 TimeFormat: Astronomical, 549 Precision: -1, 550 } 551 552 if header { 553 gw.WriteMetaData(Version) 554 } 555 556 return gw 557 } 558 559 // Write writes a single feature and return the number of bytes written and any error. 560 // gff.Features are written as a canonical GFF line, seq.Sequences are written as inline 561 // sequence in GFF format (note that only sequences of feat.Moltype DNA, RNA and Protein 562 // are supported). gff.Sequences are not handled as they have a zero length. All other 563 // feat.Feature are written as sequence region metadata lines. 564 func (w *Writer) Write(f feat.Feature) (n int, err error) { 565 if f.Start() >= f.End() { 566 return 0, ErrBadFeature 567 } 568 w.header = true 569 switch f := f.(type) { 570 case *Feature: 571 defer func() { 572 if err != nil { 573 return 574 } 575 _, err = w.w.Write([]byte{'\n'}) 576 if err != nil { 577 return 578 } 579 n++ 580 }() 581 n, err = fmt.Fprintf(w.w, "%s\t%s\t%s\t%d\t%d\t", 582 f.SeqName, 583 f.Source, 584 f.Feature, 585 feat.ZeroToOne(f.FeatStart), 586 f.FeatEnd, 587 ) 588 if err != nil { 589 return n, err 590 } 591 var _n int 592 if f.FeatScore != nil && !math.IsNaN(*f.FeatScore) { 593 if w.Precision < 0 { 594 _n, err = fmt.Fprintf(w.w, "%v", *f.FeatScore) 595 } else { 596 _n, err = fmt.Fprintf(w.w, "%.*f", w.Precision, *f.FeatScore) 597 } 598 if err != nil { 599 return n, err 600 } 601 n += _n 602 } else { 603 _, err = w.w.Write([]byte{'.'}) 604 if err != nil { 605 return n, err 606 } 607 n++ 608 } 609 _n, err = fmt.Fprintf(w.w, "\t%s\t%s", 610 f.FeatStrand, 611 f.FeatFrame, 612 ) 613 n += _n 614 if err != nil { 615 return n, err 616 } 617 if f.FeatAttributes != nil { 618 _n, err = fmt.Fprintf(w.w, "\t%v", f.FeatAttributes) 619 if err != nil { 620 return n, err 621 } 622 n += _n 623 } else if f.Comments != "" { 624 _, err = w.w.Write([]byte{'\t'}) 625 if err != nil { 626 return 627 } 628 n++ 629 } 630 if f.Comments != "" { 631 _n, err = fmt.Fprintf(w.w, "\t%s", f.Comments) 632 n += _n 633 } 634 return n, err 635 case seq.Sequence: 636 sw := fasta.NewWriter(w.w, w.Width) 637 moltype := f.Alphabet().Moltype() 638 if moltype < feat.DNA || moltype > feat.Protein { 639 return 0, ErrNotHandled 640 } 641 sw.IDPrefix = [...][]byte{ 642 feat.DNA: []byte("##DNA "), 643 feat.RNA: []byte("##RNA "), 644 feat.Protein: []byte("##Protein "), 645 }[moltype] 646 sw.SeqPrefix = []byte("##") 647 n, err = sw.Write(f) 648 if err != nil { 649 return n, err 650 } 651 var _n int 652 _n, err = w.w.Write([...][]byte{ 653 feat.DNA: []byte("##end-DNA\n"), 654 feat.RNA: []byte("##end-RNA\n"), 655 feat.Protein: []byte("##end-Protein\n"), 656 }[moltype]) 657 return n + _n, err 658 case Sequence: 659 return 0, ErrNotHandled 660 case *Region: 661 return fmt.Fprintf(w.w, "##sequence-region %s %d %d\n", f.SeqName, feat.ZeroToOne(f.RegionStart), f.RegionEnd) 662 default: 663 return fmt.Fprintf(w.w, "##sequence-region %s %d %d\n", f.Name(), feat.ZeroToOne(f.Start()), f.End()) 664 } 665 } 666 667 // WriteMetaData writes a meta data line to a GFF file. The type of metadata line 668 // depends on the type of d: strings and byte slices are written verbatim, an int is 669 // interpreted as a version number and can only be written before any other data, 670 // feat.Moltype and gff.Sequence types are written as sequence type lines, gff.Features 671 // and gff.Regions are written as sequence regions, sequences are written _n GFF 672 // format and time.Time values are written as date line. All other type return an 673 // ErrNotHandled. 674 func (w *Writer) WriteMetaData(d interface{}) (n int, err error) { 675 defer func() { w.header = true }() 676 switch d := d.(type) { 677 case string: 678 return fmt.Fprintf(w.w, "##%s\n", d) 679 case []byte: 680 return fmt.Fprintf(w.w, "##%s\n", d) 681 case int: 682 if w.header { 683 return 0, ErrCannotHeader 684 } 685 return fmt.Fprintf(w.w, "##gff-version %d\n", d) 686 case feat.Moltype: 687 return fmt.Fprintf(w.w, "##Type %s\n", d) 688 case Sequence: 689 return fmt.Fprintf(w.w, "##Type %s %s\n", d.Type, d.SeqName) 690 case *Feature: 691 return fmt.Fprintf(w.w, "##sequence-region %s %d %d\n", d.SeqName, feat.ZeroToOne(d.FeatStart), d.FeatEnd) 692 case feat.Feature: 693 return w.Write(d) 694 case time.Time: 695 return fmt.Fprintf(w.w, "##date %s\n", d.Format(w.TimeFormat)) 696 } 697 return 0, ErrNotHandled 698 } 699 700 // WriteComment writes a comment line to a GFF file. 701 func (w *Writer) WriteComment(c string) (n int, err error) { 702 return fmt.Fprintf(w.w, "# %s\n", c) 703 }