github.com/biogo/biogo@v1.0.4/feat/gene/gene.go (about)

     1  // Copyright ©2015 The bíogo Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // Package gene contains the types and methods to handle the definition of a
     6  // gene. A gene is a union of genomic sequences encoding a coherent set of
     7  // potentially overlapping functional products. Since the package is located
     8  // under the feat namespace, we define gene to correspond to a specific
     9  // genomic region (has genomic coordinates).
    10  //
    11  // The package also contains types to describe gene transcripts. Transcripts
    12  // can be coding and non-coding. Coding transcripts have functional regions
    13  // (5'UTR, CDS and 3'UTR) and consist of exons.
    14  package gene
    15  
    16  import (
    17  	"github.com/biogo/biogo/feat"
    18  
    19  	"errors"
    20  	"sort"
    21  )
    22  
// maxInt is the largest value that fits in an int on the current platform. It
// is computed by setting every bit of a uint and shifting the top bit out.
const maxInt = int(^uint(0) >> 1)
    24  
    25  // Interface defines the gene interface.
    26  type Interface interface {
    27  	feat.Feature
    28  	feat.Orienter
    29  	feat.Set
    30  	SetFeatures(...feat.Feature) error
    31  }
    32  
    33  // Transcript is the interface for a gene transcript.
    34  type Transcript interface {
    35  	feat.Feature
    36  	feat.Orienter
    37  	Exons() Exons
    38  	Introns() Introns
    39  	SetExons(...Exon) error
    40  }
    41  
    42  // TranscriptsOf scans a feat.Set and returns any Transcripts that it finds.
    43  func TranscriptsOf(s feat.Set) []Transcript {
    44  	var ts []Transcript
    45  	for _, f := range s.Features() {
    46  		if t, ok := f.(Transcript); ok {
    47  			ts = append(ts, t)
    48  		}
    49  	}
    50  	return ts
    51  }
    52  
    53  // A Gene occupies a specific region on the genome and may have 0 or more
    54  // features, including transcripts, associated with it. The gene is tightly
    55  // coupled with its features in the sense that the gene boundaries are defined
    56  // by the features. By definition one of the features must always start at
    57  // position 0 relative to the gene and this or another one has to end at the
    58  // end of the gene. The former is asserted when features are set and the
    59  // latter is guaranteed by setting the gene end at the largest end of the
    60  // features.
    61  type Gene struct {
    62  	ID     string
    63  	Chrom  feat.Feature
    64  	Offset int
    65  	Orient feat.Orientation
    66  	Desc   string
    67  	length int
    68  	feats  []feat.Feature
    69  }
    70  
    71  // Start returns the gene start on the chromosome.
    72  func (g *Gene) Start() int { return g.Offset }
    73  
    74  // End returns the gene end on the chromosome.
    75  func (g *Gene) End() int { return g.Offset + g.Len() }
    76  
    77  // Len returns the length of the gene.
    78  func (g *Gene) Len() int { return g.length }
    79  
    80  // Name returns the gene name. Currently the same as the id.
    81  func (g *Gene) Name() string { return g.ID }
    82  
    83  // Description returns a description for the gene.
    84  func (g *Gene) Description() string { return g.Desc }
    85  
    86  // Location returns the location of the gene. Namely the chromosome.
    87  func (g *Gene) Location() feat.Feature { return g.Chrom }
    88  
    89  // Orientation returns the orientation of the gene relative to the chromosome.
    90  func (g *Gene) Orientation() feat.Orientation { return g.Orient }
    91  
    92  // Features returns all features added to the gene.
    93  func (g *Gene) Features() []feat.Feature { return g.feats }
    94  
    95  // SetFeatures sets the gene features. Internally, it verifies that their
    96  // Location is the gene and that one of them has zero Start. If an error
    97  // occurs it is returned and the features are not set.
    98  func (g *Gene) SetFeatures(feats ...feat.Feature) error {
    99  	pos := maxInt
   100  	end := 0
   101  	for _, f := range feats {
   102  		if f.Location() != g {
   103  			return errors.New("transcript location does not match the gene")
   104  		}
   105  		if f.Start() < pos {
   106  			pos = f.Start()
   107  		}
   108  		if f.End() > end {
   109  			end = f.End()
   110  		}
   111  	}
   112  	if pos != 0 {
   113  		return errors.New("no transcript with 0 start on gene")
   114  	}
   115  	g.length = end - pos
   116  	g.feats = feats
   117  	return nil
   118  }
   119  
   120  // A NonCodingTranscript is a gene transcript that has no coding potential. It
   121  // can be located on any feat.Feature such as a gene or a chromosome. The
   122  // concept of exons is tightly coupled with the NonCodingTranscript in the
   123  // sense that the transcript borders are basically defined by the contained
   124  // exons. By definition one of the exons must always start at position 0
   125  // relative to the transcript and this or another one must end at the end of
   126  // transcript. The former is asserted when exons are set and the latter is
   127  // guaranteed by setting the transcript end at the end of the last exon.
   128  type NonCodingTranscript struct {
   129  	ID     string
   130  	Loc    feat.Feature
   131  	Offset int
   132  	Orient feat.Orientation
   133  	Desc   string
   134  	exons  Exons
   135  }
   136  
   137  // Start returns the transcript start relative to Location.
   138  func (t *NonCodingTranscript) Start() int { return t.Offset }
   139  
   140  // End returns the transcript end relative to Location.
   141  func (t *NonCodingTranscript) End() int { return t.Offset + t.exons.End() }
   142  
   143  // Len returns the length of the transcript.
   144  func (t *NonCodingTranscript) Len() int { return t.End() - t.Start() }
   145  
   146  // Name returns the transcript name. Currently the same as the id.
   147  func (t *NonCodingTranscript) Name() string { return t.ID }
   148  
   149  // Description returns a description for the transcript.
   150  func (t *NonCodingTranscript) Description() string { return t.Desc }
   151  
   152  // Location returns the location of the transcript. Can be any feat.Feature
   153  // such as a gene or a chromosome.
   154  func (t *NonCodingTranscript) Location() feat.Feature { return t.Loc }
   155  
   156  // Orientation returns the orientation of the transcript relative to Location.
   157  func (t *NonCodingTranscript) Orientation() feat.Orientation { return t.Orient }
   158  
   159  // Exons returns a typed slice with the transcript exons.
   160  func (t *NonCodingTranscript) Exons() Exons { return t.exons }
   161  
   162  // Introns returns a typed slice with the transcript introns.
   163  func (t *NonCodingTranscript) Introns() Introns { return t.exons.Introns() }
   164  
   165  // SetExons sets the transcript exons. Internally, it sorts exons by Start,
   166  // verifies that their Location is the transcript, that they are not
   167  // overlapping and that one has zero Start. If an error occurs it is returned
   168  // and the exons are not set.
   169  func (t *NonCodingTranscript) SetExons(exons ...Exon) error {
   170  	exons, err := buildExonsFor(t, exons...)
   171  	if err != nil {
   172  		return err
   173  	}
   174  	t.exons = exons
   175  	return nil
   176  }
   177  
   178  // A CodingTranscript is a gene transcript that has coding potential. It can
   179  // be located on any feat.Feature such as a gene or a chromosome. The concept
   180  // of exons is tightly coupled with the CodingTranscript in the sense that
   181  // the transcript borders are basically defined by the contained exons. By
   182  // definition one of the exons must always start at position 0 relative to the
   183  // transcript and this or another one must end at the transcript end. The
   184  // former is asserted when exons are set and the latter is guaranteed by
   185  // setting the transcript end at the end of the last exon.
   186  type CodingTranscript struct {
   187  	ID       string
   188  	Loc      feat.Feature
   189  	Offset   int
   190  	Orient   feat.Orientation
   191  	Desc     string
   192  	CDSstart int
   193  	CDSend   int
   194  	exons    Exons
   195  }
   196  
   197  // Start returns the transcript start relative to Location.
   198  func (t *CodingTranscript) Start() int { return t.Offset }
   199  
   200  // End returns the transcript end relative to Location.
   201  func (t *CodingTranscript) End() int { return t.Offset + t.exons.End() }
   202  
   203  // Len returns the length of the transcript.
   204  func (t *CodingTranscript) Len() int { return t.End() - t.Start() }
   205  
   206  // Name returns the transcript name. Currently the same as the id.
   207  func (t *CodingTranscript) Name() string { return t.ID }
   208  
   209  // Description returns a description for the transcript.
   210  func (t *CodingTranscript) Description() string { return t.Desc }
   211  
   212  // Location returns the location of the transcript. Can be any feat.Feature
   213  // such as a gene or a chromosome.
   214  func (t *CodingTranscript) Location() feat.Feature { return t.Loc }
   215  
   216  // Orientation returns the orientation of the transcript relative to Location.
   217  func (t *CodingTranscript) Orientation() feat.Orientation {
   218  	return t.Orient
   219  }
   220  
   221  // UTR5 returns a feat.Feature that corresponds to the 5'UTR of the
   222  // transcript.
   223  func (t *CodingTranscript) UTR5() feat.Feature {
   224  	tf := &TranscriptFeature{Transcript: t, Orient: feat.Forward}
   225  	ori, _ := feat.BaseOrientationOf(t)
   226  	switch ori {
   227  	case feat.Forward:
   228  		tf.Offset = 0
   229  		tf.Length = t.CDSstart
   230  	case feat.Reverse:
   231  		tf.Offset = t.CDSend
   232  		tf.Length = t.Len() - t.CDSend
   233  	default:
   234  		panic("gene: invalid base orientation for transcript")
   235  	}
   236  	return tf
   237  }
   238  
   239  // CDS returns a feat.Feature that corresponds to the coding region of the
   240  // transcript.
   241  func (t *CodingTranscript) CDS() feat.Feature {
   242  	return &TranscriptFeature{
   243  		Transcript: t,
   244  		Offset:     t.CDSstart,
   245  		Length:     t.CDSend - t.CDSstart,
   246  		Orient:     feat.Forward,
   247  	}
   248  }
   249  
   250  // UTR3 returns a feat.Feature that corresponds to the 3'UTR of the
   251  // transcript.
   252  func (t *CodingTranscript) UTR3() feat.Feature {
   253  	tf := &TranscriptFeature{Transcript: t, Orient: feat.Forward}
   254  	ori, _ := feat.BaseOrientationOf(t)
   255  	switch ori {
   256  	case feat.Forward:
   257  		tf.Offset = t.CDSend
   258  		tf.Length = t.Len() - t.CDSend
   259  	case feat.Reverse:
   260  		tf.Offset = 0
   261  		tf.Length = t.CDSstart
   262  	default:
   263  		panic("gene: invalid base orientation for transcript")
   264  	}
   265  	return tf
   266  }
   267  
   268  // UTR5start returns the start of the 5'UTR relative to the transcript.
   269  // UTR5start is shorthand for t.UTR5().Start().
   270  func (t *CodingTranscript) UTR5start() int {
   271  	return t.UTR5().Start()
   272  }
   273  
   274  // UTR5end returns the end of the 5'UTR relative to the transcript.
   275  // UTR5end is shorthand for t.UTR5().End().
   276  func (t *CodingTranscript) UTR5end() int {
   277  	return t.UTR5().End()
   278  }
   279  
   280  // UTR3start returns the start of the 3'UTR relative to the transcript.
   281  // UTR3start is shorthand for t.UTR3().Start().
   282  func (t *CodingTranscript) UTR3start() int {
   283  	return t.UTR3().Start()
   284  }
   285  
   286  // UTR3end returns the end of the 3'UTR relative to the transcript.
   287  // UTR3end is shorthand for t.UTR3().End().
   288  func (t *CodingTranscript) UTR3end() int {
   289  	return t.UTR3().End()
   290  }
   291  
   292  // Exons returns a typed slice with the transcript exons.
   293  func (t *CodingTranscript) Exons() Exons { return t.exons }
   294  
   295  // Introns returns a typed slice with the transcript introns.
   296  func (t *CodingTranscript) Introns() Introns { return t.exons.Introns() }
   297  
   298  // SetExons sets the transcript exons. Internally, it sorts exons by Start,
   299  // verifies that their Location is the transcript, that they are not
   300  // overlapping and that one has zero Start. If an error occurs it is returned
   301  // and the exons are not set.
   302  func (t *CodingTranscript) SetExons(exons ...Exon) error {
   303  	newExons, err := buildExonsFor(t, exons...)
   304  	if err != nil {
   305  		return err
   306  	}
   307  	t.exons = newExons
   308  	return nil
   309  }
   310  
   311  // TranscriptFeature defines a feature on a transcript.
   312  type TranscriptFeature struct {
   313  	Transcript Transcript       // Transcript is the transcript that the feature is located.
   314  	Offset     int              // Offset is the position of the feature relative to Transcript.
   315  	Length     int              // Length is the feature length.
   316  	Orient     feat.Orientation // Orientation is the feature orientation relative to Transcript.
   317  	FeatName   string           // FeatName is the name of the feature.
   318  	Desc       string           // Desc is the description of the feature.
   319  }
   320  
   321  // Start returns the feature start relative to Transcript.
   322  func (t *TranscriptFeature) Start() int { return t.Offset }
   323  
   324  // End returns the feature end relative to TranscriptLocation.
   325  func (t *TranscriptFeature) End() int { return t.Offset + t.Length }
   326  
   327  // Len returns the length of the feature.
   328  func (t *TranscriptFeature) Len() int { return t.Length }
   329  
   330  // Name returns an empty string.
   331  func (t *TranscriptFeature) Name() string { return t.FeatName }
   332  
   333  // Description returns the feature description.
   334  func (t *TranscriptFeature) Description() string { return t.Desc }
   335  
   336  // Location returns the Transcript.
   337  func (t *TranscriptFeature) Location() feat.Feature { return t.Transcript }
   338  
   339  // Orientation returns the orientation of the feature relative to Transcript.
   340  func (t *TranscriptFeature) Orientation() feat.Orientation {
   341  	return t.Orient
   342  }
   343  
   344  // Exons is a typed slice of Exon. It guarantees that exons are always sorted
   345  // by Start, are all located on the same feature and are non overlapping.
   346  type Exons []Exon
   347  
   348  // SplicedLen returns the total length of the exons.
   349  func (s Exons) SplicedLen() int {
   350  	length := 0
   351  	for _, e := range s {
   352  		length += e.Len()
   353  	}
   354  	return length
   355  }
   356  
   357  // Add adds exons to the slice and safeguards the types contracts. It returns
   358  // a new slice with the added exons. It checks for sorting, overlap, and
   359  // location match.  If and error occurs it returns the old slice (without the
   360  // new exons) and the error.
   361  func (s Exons) Add(exons ...Exon) (Exons, error) {
   362  	newSlice := append(s, exons...)
   363  	sort.Sort(newSlice)
   364  	for i, e := range newSlice {
   365  		if i != 0 && e.Start() < newSlice[i-1].End() {
   366  			return s, errors.New("exons overlap")
   367  		}
   368  		if i != 0 && e.Location() != newSlice[i-1].Location() {
   369  			return s, errors.New("exons location differ")
   370  		}
   371  
   372  	}
   373  	if s.Location() != nil && s.Location() != newSlice.Location() {
   374  		return s, errors.New("new exons locations differ from old ones")
   375  	}
   376  	return newSlice, nil
   377  }
   378  
   379  // Location returns the common location of all the exons.
   380  func (s Exons) Location() feat.Feature {
   381  	if len(s) == 0 {
   382  		return nil
   383  	}
   384  	return s[0].Location()
   385  }
   386  
   387  // Len returns the number of exons in the slice.
   388  func (s Exons) Len() int {
   389  	return len(s)
   390  }
   391  
   392  // Less returns whether the exon with index i should sort before
   393  // the exon with index j.
   394  func (s Exons) Less(i, j int) bool {
   395  	return s[i].Start() < s[j].Start()
   396  }
   397  
   398  // Swap swaps the exons with indexes i and j.
   399  func (s Exons) Swap(i, j int) {
   400  	s[i], s[j] = s[j], s[i]
   401  }
   402  
   403  // End returns the maximum End of all exons. Since exons are sorted and non
   404  // overlapping this matches the End of the last exon in the slice.
   405  func (s Exons) End() int {
   406  	if len(s) == 0 {
   407  		return 0
   408  	}
   409  	return s[len(s)-1].End()
   410  }
   411  
   412  // Start returns the minimum Start of all exons. Since exons are sorted and
   413  // non overlapping this matches the Start of the first exon in the slice.
   414  func (s Exons) Start() int {
   415  	if len(s) == 0 {
   416  		return 0
   417  	}
   418  	return s[0].Start()
   419  }
   420  
   421  // Introns returns a typed slice of Introns. Introns are built dynamically.
   422  func (s Exons) Introns() Introns {
   423  	var introns Introns
   424  	if s.Len() < 2 {
   425  		return introns
   426  	}
   427  	for i := 1; i < s.Len(); i++ {
   428  		intron := Intron{
   429  			Transcript: s[i].Transcript,
   430  			Offset:     s[i-1].End(),
   431  			Length:     s[i].Start() - s[i-1].End(),
   432  		}
   433  		introns = append(introns, intron)
   434  	}
   435  	return introns
   436  }
   437  
   438  // An Exon is the part of a transcript that remains present in the final
   439  // mature RNA product after splicing.
   440  type Exon struct {
   441  	Transcript Transcript
   442  	Offset     int
   443  	Length     int
   444  	Desc       string
   445  }
   446  
   447  // Start returns the start position of the exon relative to Transcript.
   448  func (e Exon) Start() int { return e.Offset }
   449  
   450  // End returns the end position of the exon relative to Transcript.
   451  func (e Exon) End() int { return e.Offset + e.Length }
   452  
   453  // Len returns the length of the exon.
   454  func (e Exon) Len() int { return e.Length }
   455  
   456  // Location returns the location of the exon - the transcript.
   457  func (e Exon) Location() feat.Feature { return e.Transcript }
   458  
   459  // Name returns an empty string.
   460  func (e Exon) Name() string { return "" }
   461  
   462  // Description returns a description for the exon.
   463  func (e Exon) Description() string { return e.Desc }
   464  
   465  // Orientation always returns Forward.
   466  func (e Exon) Orientation() feat.Orientation {
   467  	return feat.Forward
   468  }
   469  
   470  // Introns corresponds to a collection of introns.
   471  type Introns []Intron
   472  
   473  // An Intron is the part of a transcript that is removed during splicing
   474  // and is not part of the final mature RNA product.
   475  type Intron struct {
   476  	Transcript Transcript
   477  	Offset     int
   478  	Length     int
   479  	Desc       string
   480  }
   481  
   482  // Start returns the start position of the intron relative to Transcript.
   483  func (i Intron) Start() int { return i.Offset }
   484  
   485  // End returns the end position of the intron relative to Transcript.
   486  func (i Intron) End() int { return i.Offset + i.Length }
   487  
   488  // Len returns the length of the intron.
   489  func (i Intron) Len() int { return i.Length }
   490  
   491  // Location returns the location of the intron - the transcript.
   492  func (i Intron) Location() feat.Feature { return i.Transcript }
   493  
   494  // Name returns an empty string.
   495  func (i Intron) Name() string { return "" }
   496  
   497  // Description returns a description for the intron.
   498  func (i Intron) Description() string { return i.Desc }
   499  
   500  // Orientation always returns Forward.
   501  func (i Intron) Orientation() feat.Orientation {
   502  	return feat.Forward
   503  }
   504  
   505  // buildExonsFor is a helper function that will check if exons are compatible
   506  // with a transcript and return a typed slice of exons. If it encounters an
   507  // error or the exons are not compatible with the transcript it will return
   508  // the error and a possibly partially filled slice. It is not safe to use the
   509  // slice if the error is not nil.
   510  func buildExonsFor(t Transcript, exons ...Exon) (Exons, error) {
   511  	var newExons Exons
   512  	newExons, err := newExons.Add(exons...)
   513  	if err != nil {
   514  		return newExons, err
   515  	}
   516  	if newExons.Location() != t {
   517  		return newExons, errors.New("exon location is not the transcript")
   518  	}
   519  	if newExons.Start() != 0 {
   520  		return newExons, errors.New("no exon with a zero start")
   521  	}
   522  	return newExons, nil
   523  }