github.com/biogo/biogo@v1.0.4/io/featio/bed/bed.go (about)

     1  // Copyright ©2011-2013 The bíogo Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // Package bed provides types to read and write BED format files according to
     6  // the UCSC specification.
     7  //
     8  // The specification can be found at http://genome.ucsc.edu/FAQ/FAQformat.html#format1.
     9  package bed
    10  
    11  import (
    12  	"github.com/biogo/biogo/feat"
    13  	"github.com/biogo/biogo/io/featio"
    14  	"github.com/biogo/biogo/seq"
    15  
    16  	"bufio"
    17  	"bytes"
    18  	"encoding/csv"
    19  	"errors"
    20  	"fmt"
    21  	"image/color"
    22  	"io"
    23  	"reflect"
    24  	"runtime"
    25  	"strconv"
    26  	"unsafe"
    27  )
    28  
    29  var (
    30  	ErrBadBedType         = errors.New("bed: bad bed type")
    31  	ErrBadStrandField     = errors.New("bad strand field")
    32  	ErrBadStrand          = errors.New("invalid strand")
    33  	ErrBadColorField      = errors.New("bad color field")
    34  	ErrMissingBlockValues = errors.New("missing block values")
    35  	ErrNoChromField       = errors.New("no chrom field available")
    36  )
    37  
    38  const (
    39  	chromField = iota
    40  	startField
    41  	endField
    42  	nameField
    43  	scoreField
    44  	strandField
    45  	thickStartField
    46  	thickEndField
    47  	rgbField
    48  	blockCountField
    49  	blockSizesField
    50  	blockStartsField
    51  )
    52  
    53  var (
    54  	_ featio.Reader = (*Reader)(nil)
    55  	_ featio.Writer = (*Writer)(nil)
    56  
    57  	_ feat.Feature = (*Bed3)(nil)
    58  	_ feat.Feature = (*Bed4)(nil)
    59  	_ feat.Feature = (*Bed5)(nil)
    60  	_ feat.Feature = (*Bed6)(nil)
    61  	_ feat.Feature = (*Bed12)(nil)
    62  
    63  	_ Bed = (*Bed3)(nil)
    64  	_ Bed = (*Bed4)(nil)
    65  	_ Bed = (*Bed5)(nil)
    66  	_ Bed = (*Bed6)(nil)
    67  	_ Bed = (*Bed12)(nil)
    68  
    69  	_ feat.Orienter = (*Bed6)(nil)
    70  	_ feat.Orienter = (*Bed12)(nil)
    71  )
    72  
    73  type Bed interface {
    74  	feat.Feature
    75  	canBed(int) bool
    76  }
    77  
    78  func handlePanic(f feat.Feature, err *error) {
    79  	r := recover()
    80  	if r != nil {
    81  		e, ok := r.(error)
    82  		if !ok {
    83  			panic(r)
    84  		}
    85  		if _, ok = r.(runtime.Error); ok {
    86  			panic(r)
    87  		}
    88  		*err = e
    89  	}
    90  }
    91  
    92  // This function cannot be used to create strings that are expected to persist.
    93  func unsafeString(b []byte) string {
    94  	return *(*string)(unsafe.Pointer(&b))
    95  }
    96  
    97  func mustAtoi(f []byte, column int) int {
    98  	i, err := strconv.ParseInt(unsafeString(f), 0, 0)
    99  	if err != nil {
   100  		panic(&csv.ParseError{Column: column, Err: err})
   101  	}
   102  	return int(i)
   103  }
   104  
   105  func mustAtob(f []byte, column int) byte {
   106  	b, err := strconv.ParseUint(unsafeString(f), 0, 8)
   107  	if err != nil {
   108  		panic(&csv.ParseError{Column: column, Err: err})
   109  	}
   110  	return byte(b)
   111  }
   112  
   113  var charToStrand = func() [256]seq.Strand {
   114  	var t [256]seq.Strand
   115  	for i := range t {
   116  		t[i] = 0x7f
   117  	}
   118  	t['+'] = seq.Plus
   119  	t['.'] = seq.None
   120  	t['-'] = seq.Minus
   121  	return t
   122  }()
   123  
   124  func mustAtos(f []byte, index int) seq.Strand {
   125  	if len(f) != 1 {
   126  		panic(&csv.ParseError{Column: index, Err: ErrBadStrandField})
   127  	}
   128  	s := charToStrand[f[0]]
   129  	if s == 0x7f {
   130  		panic(&csv.ParseError{Column: index, Err: ErrBadStrand})
   131  	}
   132  	return s
   133  }
   134  
   135  func mustAtoRgb(f []byte, index int) color.RGBA {
   136  	c := bytes.SplitN(f, []byte{','}, 4)
   137  	l := len(c)
   138  	if l == 0 || (l == 1 && mustAtoi(c[0], index) == 0) {
   139  		return color.RGBA{}
   140  	}
   141  	if l < 3 {
   142  		panic(&csv.ParseError{Column: index, Err: ErrBadColorField})
   143  	}
   144  	return color.RGBA{
   145  		R: mustAtob(c[0], index),
   146  		G: mustAtob(c[1], index),
   147  		B: mustAtob(c[2], index),
   148  		A: 0xff,
   149  	}
   150  }
   151  
   152  func mustAtoa(f []byte, index int) []int {
   153  	c := bytes.Split(f, []byte{','})
   154  	a := make([]int, len(c))
   155  	for i, f := range c {
   156  		if len(f) == 0 {
   157  			return a[:i]
   158  		}
   159  		a[i] = mustAtoi(f, index)
   160  	}
   161  	return a
   162  }
   163  
   164  type Chrom string
   165  
   166  func (c Chrom) Start() int             { return 0 }
   167  func (c Chrom) End() int               { return 0 }
   168  func (c Chrom) Len() int               { return 0 }
   169  func (c Chrom) Name() string           { return string(c) }
   170  func (c Chrom) Description() string    { return "bed chrom" }
   171  func (c Chrom) Location() feat.Feature { return nil }
   172  
   173  type Bed3 struct {
   174  	Chrom      string
   175  	ChromStart int
   176  	ChromEnd   int
   177  }
   178  
   179  func parseBed3(line []byte) (b *Bed3, err error) {
   180  	const n = 3
   181  	defer handlePanic(b, &err)
   182  	f := bytes.SplitN(line, []byte{'\t'}, n+1)
   183  	if len(f) < n {
   184  		return nil, ErrBadBedType
   185  	}
   186  	b = &Bed3{
   187  		Chrom:      string(f[chromField]),
   188  		ChromStart: mustAtoi(f[startField], startField),
   189  		ChromEnd:   mustAtoi(f[endField], endField),
   190  	}
   191  	return
   192  }
   193  
   194  func (b *Bed3) Start() int                  { return b.ChromStart }
   195  func (b *Bed3) End() int                    { return b.ChromEnd }
   196  func (b *Bed3) Len() int                    { return b.ChromEnd - b.ChromStart }
   197  func (b *Bed3) Name() string                { return fmt.Sprintf("%s:[%d,%d)", b.Chrom, b.ChromStart, b.ChromEnd) }
   198  func (b *Bed3) Description() string         { return "bed3 feature" }
   199  func (b *Bed3) Location() feat.Feature      { return Chrom(b.Chrom) }
   200  func (b *Bed3) canBed(i int) bool           { return i <= 3 }
   201  func (b *Bed3) Format(fs fmt.State, c rune) { format(b, fs, c) }
   202  
   203  type Bed4 struct {
   204  	Chrom      string
   205  	ChromStart int
   206  	ChromEnd   int
   207  	FeatName   string
   208  }
   209  
   210  func parseBed4(line []byte) (b *Bed4, err error) {
   211  	const n = 4
   212  	defer handlePanic(b, &err)
   213  	f := bytes.SplitN(line, []byte{'\t'}, n+1)
   214  	if len(f) < n {
   215  		return nil, ErrBadBedType
   216  	}
   217  	b = &Bed4{
   218  		Chrom:      string(f[chromField]),
   219  		ChromStart: mustAtoi(f[startField], startField),
   220  		ChromEnd:   mustAtoi(f[endField], endField),
   221  		FeatName:   string(f[nameField]),
   222  	}
   223  	return
   224  }
   225  
   226  func (b *Bed4) Start() int                  { return b.ChromStart }
   227  func (b *Bed4) End() int                    { return b.ChromEnd }
   228  func (b *Bed4) Len() int                    { return b.ChromEnd - b.ChromStart }
   229  func (b *Bed4) Name() string                { return b.FeatName }
   230  func (b *Bed4) Description() string         { return "bed4 feature" }
   231  func (b *Bed4) Location() feat.Feature      { return Chrom(b.Chrom) }
   232  func (b *Bed4) canBed(i int) bool           { return i <= 4 }
   233  func (b *Bed4) Format(fs fmt.State, c rune) { format(b, fs, c) }
   234  
   235  type Bed5 struct {
   236  	Chrom      string
   237  	ChromStart int
   238  	ChromEnd   int
   239  	FeatName   string
   240  	FeatScore  int
   241  }
   242  
   243  func parseBed5(line []byte) (b *Bed5, err error) {
   244  	const n = 5
   245  	defer handlePanic(b, &err)
   246  	f := bytes.SplitN(line, []byte{'\t'}, n+1)
   247  	if len(f) < n {
   248  		return nil, ErrBadBedType
   249  	}
   250  	b = &Bed5{
   251  		Chrom:      string(f[chromField]),
   252  		ChromStart: mustAtoi(f[startField], startField),
   253  		ChromEnd:   mustAtoi(f[endField], endField),
   254  		FeatName:   string(f[nameField]),
   255  		FeatScore:  mustAtoi(f[scoreField], scoreField),
   256  	}
   257  	return
   258  }
   259  
   260  func (b *Bed5) Start() int                  { return b.ChromStart }
   261  func (b *Bed5) End() int                    { return b.ChromEnd }
   262  func (b *Bed5) Len() int                    { return b.ChromEnd - b.ChromStart }
   263  func (b *Bed5) Name() string                { return b.FeatName }
   264  func (b *Bed5) Description() string         { return "bed5 feature" }
   265  func (b *Bed5) Location() feat.Feature      { return Chrom(b.Chrom) }
   266  func (b *Bed5) canBed(i int) bool           { return i <= 5 }
   267  func (b *Bed5) Format(fs fmt.State, c rune) { format(b, fs, c) }
   268  
   269  type Bed6 struct {
   270  	Chrom      string
   271  	ChromStart int
   272  	ChromEnd   int
   273  	FeatName   string
   274  	FeatScore  int
   275  	FeatStrand seq.Strand
   276  }
   277  
   278  func parseBed6(line []byte) (b *Bed6, err error) {
   279  	const n = 6
   280  	defer handlePanic(b, &err)
   281  	f := bytes.SplitN(line, []byte{'\t'}, n+1)
   282  	if len(f) < n {
   283  		return nil, ErrBadBedType
   284  	}
   285  	b = &Bed6{
   286  		Chrom:      string(f[chromField]),
   287  		ChromStart: mustAtoi(f[startField], startField),
   288  		ChromEnd:   mustAtoi(f[endField], endField),
   289  		FeatName:   string(f[nameField]),
   290  		FeatScore:  mustAtoi(f[scoreField], scoreField),
   291  		FeatStrand: mustAtos(f[strandField], strandField),
   292  	}
   293  	return
   294  }
   295  
   296  func (b *Bed6) Start() int                    { return b.ChromStart }
   297  func (b *Bed6) End() int                      { return b.ChromEnd }
   298  func (b *Bed6) Len() int                      { return b.ChromEnd - b.ChromStart }
   299  func (b *Bed6) Name() string                  { return b.FeatName }
   300  func (b *Bed6) Description() string           { return "bed6 feature" }
   301  func (b *Bed6) Location() feat.Feature        { return Chrom(b.Chrom) }
   302  func (b *Bed6) Orientation() feat.Orientation { return feat.Orientation(b.FeatStrand) }
   303  func (b *Bed6) canBed(i int) bool             { return i <= 6 }
   304  func (b *Bed6) Format(fs fmt.State, c rune)   { format(b, fs, c) }
   305  
   306  type Bed12 struct {
   307  	Chrom       string
   308  	ChromStart  int
   309  	ChromEnd    int
   310  	FeatName    string
   311  	FeatScore   int
   312  	FeatStrand  seq.Strand
   313  	ThickStart  int
   314  	ThickEnd    int
   315  	Rgb         color.RGBA
   316  	BlockCount  int
   317  	BlockSizes  []int
   318  	BlockStarts []int
   319  }
   320  
   321  func parseBed12(line []byte) (b *Bed12, err error) {
   322  	const n = 12
   323  	defer handlePanic(b, &err)
   324  	f := bytes.SplitN(line, []byte{'\t'}, n+1)
   325  	if len(f) < n {
   326  		return nil, ErrBadBedType
   327  	}
   328  	b = &Bed12{
   329  		Chrom:       string(f[chromField]),
   330  		ChromStart:  mustAtoi(f[startField], startField),
   331  		ChromEnd:    mustAtoi(f[endField], endField),
   332  		FeatName:    string(f[nameField]),
   333  		FeatScore:   mustAtoi(f[scoreField], scoreField),
   334  		FeatStrand:  mustAtos(f[strandField], strandField),
   335  		ThickStart:  mustAtoi(f[thickStartField], thickStartField),
   336  		ThickEnd:    mustAtoi(f[thickEndField], thickEndField),
   337  		Rgb:         mustAtoRgb(f[rgbField], rgbField),
   338  		BlockCount:  mustAtoi(f[blockCountField], blockCountField),
   339  		BlockSizes:  mustAtoa(f[blockSizesField], blockSizesField),
   340  		BlockStarts: mustAtoa(f[blockStartsField], blockStartsField),
   341  	}
   342  	if b.BlockCount != len(b.BlockSizes) || b.BlockCount != len(b.BlockStarts) {
   343  		return nil, ErrMissingBlockValues
   344  	}
   345  	return
   346  }
   347  
   348  func (b *Bed12) Start() int                    { return b.ChromStart }
   349  func (b *Bed12) End() int                      { return b.ChromEnd }
   350  func (b *Bed12) Len() int                      { return b.ChromEnd - b.ChromStart }
   351  func (b *Bed12) Name() string                  { return b.FeatName }
   352  func (b *Bed12) Description() string           { return "bed12 feature" }
   353  func (b *Bed12) Location() feat.Feature        { return Chrom(b.Chrom) }
   354  func (b *Bed12) Orientation() feat.Orientation { return feat.Orientation(b.FeatStrand) }
   355  func (b *Bed12) canBed(i int) bool             { return i <= 12 }
   356  func (b *Bed12) Format(fs fmt.State, c rune)   { format(b, fs, c) }
   357  
   358  // BED format reader type.
   359  type Reader struct {
   360  	r       *bufio.Reader
   361  	BedType int
   362  	line    int
   363  }
   364  
   365  // Returns a new BED format reader using r.
   366  func NewReader(r io.Reader, b int) (*Reader, error) {
   367  	switch b {
   368  	case 3, 4, 5, 6, 12:
   369  	default:
   370  		return nil, ErrBadBedType
   371  	}
   372  	return &Reader{
   373  		r:       bufio.NewReader(r),
   374  		BedType: b,
   375  	}, nil
   376  }
   377  
   378  // Read a single feature and return it or an error.
   379  func (r *Reader) Read() (f feat.Feature, err error) {
   380  	line, err := r.r.ReadBytes('\n')
   381  	if err != nil {
   382  		return
   383  	}
   384  	r.line++
   385  	line = bytes.TrimSpace(line)
   386  
   387  	switch r.BedType {
   388  	case 3:
   389  		f, err = parseBed3(line)
   390  	case 4:
   391  		f, err = parseBed4(line)
   392  	case 5:
   393  		f, err = parseBed5(line)
   394  	case 6:
   395  		f, err = parseBed6(line)
   396  	case 12:
   397  		f, err = parseBed12(line)
   398  	default:
   399  		return nil, ErrBadBedType
   400  	}
   401  	if err != nil {
   402  		if err, ok := err.(*csv.ParseError); ok {
   403  			err.Line = r.line
   404  			return nil, err
   405  		}
   406  		return nil, fmt.Errorf("%v at line %d", err, r.line)
   407  	}
   408  
   409  	return
   410  }
   411  
   412  // Return the current line number
   413  func (r *Reader) Line() int { return r.line }
   414  
   415  func format(b Bed, fs fmt.State, c rune) {
   416  	bv := reflect.ValueOf(b)
   417  	if bv.IsNil() {
   418  		fmt.Fprint(fs, "<nil>")
   419  		return
   420  	}
   421  	bv = bv.Elem()
   422  	switch c {
   423  	case 'v':
   424  		if fs.Flag('#') {
   425  			fmt.Fprintf(fs, "&%#v", bv.Interface())
   426  			return
   427  		}
   428  		fallthrough
   429  	case 's':
   430  		width, _ := fs.Width()
   431  		if !b.canBed(width) {
   432  			fmt.Fprintf(fs, "%%!(BADWIDTH)%T", b)
   433  			return
   434  		}
   435  		if width == 0 {
   436  			width = bv.NumField()
   437  		}
   438  		for i := 0; i < width; i++ {
   439  			f := bv.Field(i).Interface()
   440  			if i >= rgbField {
   441  				switch i {
   442  				case rgbField:
   443  					rv := reflect.ValueOf(f)
   444  					if reflect.DeepEqual(rv.Interface(), color.RGBA{}) {
   445  						fs.Write([]byte{'0'})
   446  					} else {
   447  						fmt.Fprintf(fs, "%d,%d,%d",
   448  							rv.Field(0).Interface(), rv.Field(1).Interface(), rv.Field(2).Interface())
   449  					}
   450  				case blockCountField:
   451  					fmt.Fprint(fs, f)
   452  				case blockSizesField, blockStartsField:
   453  					av := reflect.ValueOf(f)
   454  					l := av.Len()
   455  					for j := 0; j < l; j++ {
   456  						fmt.Fprint(fs, av.Index(j).Interface())
   457  						if j < l-1 {
   458  							fs.Write([]byte{','})
   459  						}
   460  					}
   461  				}
   462  			} else {
   463  				fmt.Fprint(fs, f)
   464  			}
   465  			if i < width-1 {
   466  				fs.Write([]byte{'\t'})
   467  			}
   468  		}
   469  	default:
   470  		fmt.Fprintf(fs, "%%!%c(%T=%3s)", c, b, b)
   471  	}
   472  }
   473  
   474  // BED format writer type.
   475  type Writer struct {
   476  	w       io.Writer
   477  	BedType int
   478  }
   479  
   480  // Returns a new BED format writer using w.
   481  func NewWriter(w io.Writer, b int) (*Writer, error) {
   482  	switch b {
   483  	case 3, 4, 5, 6, 12:
   484  	default:
   485  		return nil, ErrBadBedType
   486  	}
   487  	return &Writer{
   488  		w:       w,
   489  		BedType: b,
   490  	}, nil
   491  }
   492  
   493  type Scorer interface {
   494  	Score() int
   495  }
   496  
   497  // Write a single feature and return the number of bytes written and any error.
   498  func (w *Writer) Write(f feat.Feature) (n int, err error) {
   499  	defer func() {
   500  		if err != nil {
   501  			return
   502  		}
   503  		_, err = w.w.Write([]byte{'\n'})
   504  		if err != nil {
   505  			return
   506  		}
   507  		n++
   508  	}()
   509  
   510  	// Handle Bed types.
   511  	if f, ok := f.(Bed); ok {
   512  		if !f.canBed(w.BedType) {
   513  			return 0, ErrBadBedType
   514  		}
   515  		return fmt.Fprintf(w.w, "%*s", w.BedType, f)
   516  	}
   517  
   518  	// Handle other feature types.
   519  	if f.Location() == nil {
   520  		return 0, ErrNoChromField
   521  	}
   522  
   523  	// Bed3
   524  	n, err = fmt.Fprintf(w.w, "%s\t%d\t%d", f.Location(), f.Start(), f.End())
   525  	if w.BedType == 3 {
   526  		return n, err
   527  	}
   528  
   529  	// Bed4
   530  	_n, err := fmt.Fprintf(w.w, "\t%s", f.Name())
   531  	n += _n
   532  	if w.BedType == 4 || err != nil {
   533  		return n, err
   534  	}
   535  
   536  	// Bed5
   537  	if f, ok := f.(Scorer); ok {
   538  		_n, err := fmt.Fprintf(w.w, "\t%d", f.Score())
   539  		n += _n
   540  		if err != nil {
   541  			return n, err
   542  		}
   543  	} else {
   544  		_n, err := fmt.Fprintf(w.w, "\t0")
   545  		n += _n
   546  		if err != nil {
   547  			return n, err
   548  		}
   549  	}
   550  	if w.BedType == 5 {
   551  		return
   552  	}
   553  
   554  	// Bed6
   555  	if f, ok := f.(feat.Orienter); ok {
   556  		_n, err := fmt.Fprintf(w.w, "\t%s", seq.Strand(f.Orientation()))
   557  		n += _n
   558  		if err != nil {
   559  			return n, err
   560  		}
   561  	} else {
   562  		_n, err := fmt.Fprintf(w.w, "\t.")
   563  		n += _n
   564  		if err != nil {
   565  			return n, err
   566  		}
   567  	}
   568  	if w.BedType == 6 || w.BedType == 0 {
   569  		return
   570  	}
   571  
   572  	// Don't handle Bed12.
   573  	_n, err = w.w.Write([]byte{'\n'})
   574  	n += _n
   575  	return n, ErrBadBedType
   576  }