github.com/biogo/biogo@v1.0.4/feat/gene/gene.go (about) 1 // Copyright ©2015 The bíogo Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 // Package gene contains the types and methods to handle the definition of a 6 // gene. A gene is a union of genomic sequences encoding a coherent set of 7 // potentially overlapping functional products. Since the package is located 8 // under the feat namespace, we define gene to correspond to a specific 9 // genomic region (has genomic coordinates). 10 // 11 // The package also contain types to describe gene transcripts. Transcripts 12 // can be coding and non-coding. Coding transcripts have functional regions 13 // (5'UTR, CDS and 3'UTR) and consist of exons. 14 package gene 15 16 import ( 17 "github.com/biogo/biogo/feat" 18 19 "errors" 20 "sort" 21 ) 22 23 const maxInt = int(^uint(0) >> 1) // The maximum int value. 24 25 // Interface defines the gene interface. 26 type Interface interface { 27 feat.Feature 28 feat.Orienter 29 feat.Set 30 SetFeatures(...feat.Feature) error 31 } 32 33 // Transcript is the interface for a gene transcript. 34 type Transcript interface { 35 feat.Feature 36 feat.Orienter 37 Exons() Exons 38 Introns() Introns 39 SetExons(...Exon) error 40 } 41 42 // TranscriptsOf scans a feat.Set and returns any Transcripts that it finds. 43 func TranscriptsOf(s feat.Set) []Transcript { 44 var ts []Transcript 45 for _, f := range s.Features() { 46 if t, ok := f.(Transcript); ok { 47 ts = append(ts, t) 48 } 49 } 50 return ts 51 } 52 53 // A Gene occupies a specific region on the genome and may have 0 or more 54 // features, including transcripts, associated with it. The gene is tightly 55 // coupled with its features in the sense that the gene boundaries are defined 56 // by the features. By definition one of the features must always start at 57 // position 0 relative to the gene and this or another one has to end at the 58 // end of the gene. The former is asserted when features are set and the 59 // latter is guaranteed by setting the gene end at the largest end of the 60 // features. 61 type Gene struct { 62 ID string 63 Chrom feat.Feature 64 Offset int 65 Orient feat.Orientation 66 Desc string 67 length int 68 feats []feat.Feature 69 } 70 71 // Start returns the gene start on the chromosome. 72 func (g *Gene) Start() int { return g.Offset } 73 74 // End returns the gene end on the chromosome. 75 func (g *Gene) End() int { return g.Offset + g.Len() } 76 77 // Len returns the length of the gene. 78 func (g *Gene) Len() int { return g.length } 79 80 // Name returns the gene name. Currently the same as the id. 81 func (g *Gene) Name() string { return g.ID } 82 83 // Description returns a description for the gene. 84 func (g *Gene) Description() string { return g.Desc } 85 86 // Location returns the location of the gene. Namely the chromosome. 87 func (g *Gene) Location() feat.Feature { return g.Chrom } 88 89 // Orientation returns the orientation of the gene relative to the chromosome. 90 func (g *Gene) Orientation() feat.Orientation { return g.Orient } 91 92 // Features returns all features added to the gene. 93 func (g *Gene) Features() []feat.Feature { return g.feats } 94 95 // SetFeatures sets the gene features. Internally, it verifies that their 96 // Location is the gene and that one of them has zero Start. If an error 97 // occurs it is returned and the features are not set. 98 func (g *Gene) SetFeatures(feats ...feat.Feature) error { 99 pos := maxInt 100 end := 0 101 for _, f := range feats { 102 if f.Location() != g { 103 return errors.New("transcript location does not match the gene") 104 } 105 if f.Start() < pos { 106 pos = f.Start() 107 } 108 if f.End() > end { 109 end = f.End() 110 } 111 } 112 if pos != 0 { 113 return errors.New("no transcript with 0 start on gene") 114 } 115 g.length = end - pos 116 g.feats = feats 117 return nil 118 } 119 120 // A NonCodingTranscript is a gene transcript that has no coding potential. It 121 // can be located on any feat.Feature such as a gene or a chromosome. The 122 // concept of exons is tightly coupled with the NonCodingTranscript in the 123 // sense that the transcript borders are basically defined by the contained 124 // exons. By definition one of the exons must always start at position 0 125 // relative to the transcript and this or another one must end at the end of 126 // transcript. The former is asserted when exons are set and the latter is 127 // guaranteed by setting the transcript end at the end of the last exon. 128 type NonCodingTranscript struct { 129 ID string 130 Loc feat.Feature 131 Offset int 132 Orient feat.Orientation 133 Desc string 134 exons Exons 135 } 136 137 // Start returns the transcript start relative to Location. 138 func (t *NonCodingTranscript) Start() int { return t.Offset } 139 140 // End returns the transcript end relative to Location. 141 func (t *NonCodingTranscript) End() int { return t.Offset + t.exons.End() } 142 143 // Len returns the length of the transcript. 144 func (t *NonCodingTranscript) Len() int { return t.End() - t.Start() } 145 146 // Name returns the transcript name. Currently the same as the id. 147 func (t *NonCodingTranscript) Name() string { return t.ID } 148 149 // Description returns a description for the transcript. 150 func (t *NonCodingTranscript) Description() string { return t.Desc } 151 152 // Location returns the location of the transcript. Can be any feat.Feature 153 // such as a gene or a chromosome. 154 func (t *NonCodingTranscript) Location() feat.Feature { return t.Loc } 155 156 // Orientation returns the orientation of the transcript relative to Location. 157 func (t *NonCodingTranscript) Orientation() feat.Orientation { return t.Orient } 158 159 // Exons returns a typed slice with the transcript exons. 160 func (t *NonCodingTranscript) Exons() Exons { return t.exons } 161 162 // Introns returns a typed slice with the transcript introns. 163 func (t *NonCodingTranscript) Introns() Introns { return t.exons.Introns() } 164 165 // SetExons sets the transcript exons. Internally, it sorts exons by Start, 166 // verifies that their Location is the transcript, that they are not 167 // overlapping and that one has zero Start. If an error occurs it is returned 168 // and the exons are not set. 169 func (t *NonCodingTranscript) SetExons(exons ...Exon) error { 170 exons, err := buildExonsFor(t, exons...) 171 if err != nil { 172 return err 173 } 174 t.exons = exons 175 return nil 176 } 177 178 // A CodingTranscript is a gene transcript that has coding potential. It can 179 // be located on any feat.Feature such as a gene or a chromosome. The concept 180 // of exons is tightly coupled with the CodingTranscript in the sense that 181 // the transcript borders are basically defined by the contained exons. By 182 // definition one of the exons must always start at position 0 relative to the 183 // transcript and this or another one must end at the transcript end. The 184 // former is asserted when exons are set and the latter is guaranteed by 185 // setting the transcript end at the end of the last exon. 186 type CodingTranscript struct { 187 ID string 188 Loc feat.Feature 189 Offset int 190 Orient feat.Orientation 191 Desc string 192 CDSstart int 193 CDSend int 194 exons Exons 195 } 196 197 // Start returns the transcript start relative to Location. 198 func (t *CodingTranscript) Start() int { return t.Offset } 199 200 // End returns the transcript end relative to Location. 201 func (t *CodingTranscript) End() int { return t.Offset + t.exons.End() } 202 203 // Len returns the length of the transcript. 204 func (t *CodingTranscript) Len() int { return t.End() - t.Start() } 205 206 // Name returns the transcript name. Currently the same as the id. 207 func (t *CodingTranscript) Name() string { return t.ID } 208 209 // Description returns a description for the transcript. 210 func (t *CodingTranscript) Description() string { return t.Desc } 211 212 // Location returns the location of the transcript. Can be any feat.Feature 213 // such as a gene or a chromosome. 214 func (t *CodingTranscript) Location() feat.Feature { return t.Loc } 215 216 // Orientation returns the orientation of the transcript relative to Location. 217 func (t *CodingTranscript) Orientation() feat.Orientation { 218 return t.Orient 219 } 220 221 // UTR5 returns a feat.Feature that corresponds to the 5'UTR of the 222 // transcript. 223 func (t *CodingTranscript) UTR5() feat.Feature { 224 tf := &TranscriptFeature{Transcript: t, Orient: feat.Forward} 225 ori, _ := feat.BaseOrientationOf(t) 226 switch ori { 227 case feat.Forward: 228 tf.Offset = 0 229 tf.Length = t.CDSstart 230 case feat.Reverse: 231 tf.Offset = t.CDSend 232 tf.Length = t.Len() - t.CDSend 233 default: 234 panic("gene: invalid base orientation for transcript") 235 } 236 return tf 237 } 238 239 // CDS returns a feat.Feature that corresponds to the coding region of the 240 // transcript. 241 func (t *CodingTranscript) CDS() feat.Feature { 242 return &TranscriptFeature{ 243 Transcript: t, 244 Offset: t.CDSstart, 245 Length: t.CDSend - t.CDSstart, 246 Orient: feat.Forward, 247 } 248 } 249 250 // UTR3 returns a feat.Feature that corresponds to the 3'UTR of the 251 // transcript. 252 func (t *CodingTranscript) UTR3() feat.Feature { 253 tf := &TranscriptFeature{Transcript: t, Orient: feat.Forward} 254 ori, _ := feat.BaseOrientationOf(t) 255 switch ori { 256 case feat.Forward: 257 tf.Offset = t.CDSend 258 tf.Length = t.Len() - t.CDSend 259 case feat.Reverse: 260 tf.Offset = 0 261 tf.Length = t.CDSstart 262 default: 263 panic("gene: invalid base orientation for transcript") 264 } 265 return tf 266 } 267 268 // UTR5start returns the start of the 5'UTR relative to the transcript. 269 // UTR5start is shorthand for t.UTR5().Start(). 270 func (t *CodingTranscript) UTR5start() int { 271 return t.UTR5().Start() 272 } 273 274 // UTR5end returns the end of the 5'UTR relative to the transcript. 275 // UTR5end is shorthand for t.UTR5().End(). 276 func (t *CodingTranscript) UTR5end() int { 277 return t.UTR5().End() 278 } 279 280 // UTR3start returns the start of the 3'UTR relative to the transcript. 281 // UTR3start is shorthand for t.UTR3().Start(). 282 func (t *CodingTranscript) UTR3start() int { 283 return t.UTR3().Start() 284 } 285 286 // UTR3end returns the end of the 3'UTR relative to the transcript. 287 // UTR3end is shorthand for t.UTR3().End(). 288 func (t *CodingTranscript) UTR3end() int { 289 return t.UTR3().End() 290 } 291 292 // Exons returns a typed slice with the transcript exons. 293 func (t *CodingTranscript) Exons() Exons { return t.exons } 294 295 // Introns returns a typed slice with the transcript introns. 296 func (t *CodingTranscript) Introns() Introns { return t.exons.Introns() } 297 298 // SetExons sets the transcript exons. Internally, it sorts exons by Start, 299 // verifies that their Location is the transcript, that they are not 300 // overlapping and that one has zero Start. If an error occurs it is returned 301 // and the exons are not set. 302 func (t *CodingTranscript) SetExons(exons ...Exon) error { 303 newExons, err := buildExonsFor(t, exons...) 304 if err != nil { 305 return err 306 } 307 t.exons = newExons 308 return nil 309 } 310 311 // TranscriptFeature defines a feature on a transcript. 312 type TranscriptFeature struct { 313 Transcript Transcript // Transcript is the transcript that the feature is located. 314 Offset int // Offset is the position of the feature relative to Transcript. 315 Length int // Length is the feature length. 316 Orient feat.Orientation // Orientation is the feature orientation relative to Transcript. 317 FeatName string // FeatName is the name of the feature. 318 Desc string // Desc is the description of the feature. 319 } 320 321 // Start returns the feature start relative to Transcript. 322 func (t *TranscriptFeature) Start() int { return t.Offset } 323 324 // End returns the feature end relative to TranscriptLocation. 325 func (t *TranscriptFeature) End() int { return t.Offset + t.Length } 326 327 // Len returns the length of the feature. 328 func (t *TranscriptFeature) Len() int { return t.Length } 329 330 // Name returns an empty string. 331 func (t *TranscriptFeature) Name() string { return t.FeatName } 332 333 // Description returns the feature description. 334 func (t *TranscriptFeature) Description() string { return t.Desc } 335 336 // Location returns the Transcript. 337 func (t *TranscriptFeature) Location() feat.Feature { return t.Transcript } 338 339 // Orientation returns the orientation of the feature relative to Transcript. 340 func (t *TranscriptFeature) Orientation() feat.Orientation { 341 return t.Orient 342 } 343 344 // Exons is a typed slice of Exon. It guarantees that exons are always sorted 345 // by Start, are all located on the same feature and are non overlapping. 346 type Exons []Exon 347 348 // SplicedLen returns the total length of the exons. 349 func (s Exons) SplicedLen() int { 350 length := 0 351 for _, e := range s { 352 length += e.Len() 353 } 354 return length 355 } 356 357 // Add adds exons to the slice and safeguards the types contracts. It returns 358 // a new slice with the added exons. It checks for sorting, overlap, and 359 // location match. If and error occurs it returns the old slice (without the 360 // new exons) and the error. 361 func (s Exons) Add(exons ...Exon) (Exons, error) { 362 newSlice := append(s, exons...) 363 sort.Sort(newSlice) 364 for i, e := range newSlice { 365 if i != 0 && e.Start() < newSlice[i-1].End() { 366 return s, errors.New("exons overlap") 367 } 368 if i != 0 && e.Location() != newSlice[i-1].Location() { 369 return s, errors.New("exons location differ") 370 } 371 372 } 373 if s.Location() != nil && s.Location() != newSlice.Location() { 374 return s, errors.New("new exons locations differ from old ones") 375 } 376 return newSlice, nil 377 } 378 379 // Location returns the common location of all the exons. 380 func (s Exons) Location() feat.Feature { 381 if len(s) == 0 { 382 return nil 383 } 384 return s[0].Location() 385 } 386 387 // Len returns the number of exons in the slice. 388 func (s Exons) Len() int { 389 return len(s) 390 } 391 392 // Less returns whether the exon with index i should sort before 393 // the exon with index j. 394 func (s Exons) Less(i, j int) bool { 395 return s[i].Start() < s[j].Start() 396 } 397 398 // Swap swaps the exons with indexes i and j. 399 func (s Exons) Swap(i, j int) { 400 s[i], s[j] = s[j], s[i] 401 } 402 403 // End returns the maximum End of all exons. Since exons are sorted and non 404 // overlapping this matches the End of the last exon in the slice. 405 func (s Exons) End() int { 406 if len(s) == 0 { 407 return 0 408 } 409 return s[len(s)-1].End() 410 } 411 412 // Start returns the minimum Start of all exons. Since exons are sorted and 413 // non overlapping this matches the Start of the first exon in the slice. 414 func (s Exons) Start() int { 415 if len(s) == 0 { 416 return 0 417 } 418 return s[0].Start() 419 } 420 421 // Introns returns a typed slice of Introns. Introns are built dynamically. 422 func (s Exons) Introns() Introns { 423 var introns Introns 424 if s.Len() < 2 { 425 return introns 426 } 427 for i := 1; i < s.Len(); i++ { 428 intron := Intron{ 429 Transcript: s[i].Transcript, 430 Offset: s[i-1].End(), 431 Length: s[i].Start() - s[i-1].End(), 432 } 433 introns = append(introns, intron) 434 } 435 return introns 436 } 437 438 // An Exon is the part of a transcript that remains present in the final 439 // mature RNA product after splicing. 440 type Exon struct { 441 Transcript Transcript 442 Offset int 443 Length int 444 Desc string 445 } 446 447 // Start returns the start position of the exon relative to Transcript. 448 func (e Exon) Start() int { return e.Offset } 449 450 // End returns the end position of the exon relative to Transcript. 451 func (e Exon) End() int { return e.Offset + e.Length } 452 453 // Len returns the length of the exon. 454 func (e Exon) Len() int { return e.Length } 455 456 // Location returns the location of the exon - the transcript. 457 func (e Exon) Location() feat.Feature { return e.Transcript } 458 459 // Name returns an empty string. 460 func (e Exon) Name() string { return "" } 461 462 // Description returns a description for the exon. 463 func (e Exon) Description() string { return e.Desc } 464 465 // Orientation always returns Forward. 466 func (e Exon) Orientation() feat.Orientation { 467 return feat.Forward 468 } 469 470 // Introns corresponds to a collection of introns. 471 type Introns []Intron 472 473 // An Intron is the part of a transcript that is removed during splicing 474 // and is not part of the final mature RNA product. 475 type Intron struct { 476 Transcript Transcript 477 Offset int 478 Length int 479 Desc string 480 } 481 482 // Start returns the start position of the intron relative to Transcript. 483 func (i Intron) Start() int { return i.Offset } 484 485 // End returns the end position of the intron relative to Transcript. 486 func (i Intron) End() int { return i.Offset + i.Length } 487 488 // Len returns the length of the intron. 489 func (i Intron) Len() int { return i.Length } 490 491 // Location returns the location of the intron - the transcript. 492 func (i Intron) Location() feat.Feature { return i.Transcript } 493 494 // Name returns an empty string. 495 func (i Intron) Name() string { return "" } 496 497 // Description returns a description for the intron. 498 func (i Intron) Description() string { return i.Desc } 499 500 // Orientation always returns Forward. 501 func (i Intron) Orientation() feat.Orientation { 502 return feat.Forward 503 } 504 505 // buildExonsFor is a helper function that will check if exons are compatible 506 // with a transcript and return a typed slice of exons. If it encounters an 507 // error or the exons are not compatible with the transcript it will return 508 // the error and a possibly partially filled slice. It is not safe to use the 509 // slice if the error is not nil. 510 func buildExonsFor(t Transcript, exons ...Exon) (Exons, error) { 511 var newExons Exons 512 newExons, err := newExons.Add(exons...) 513 if err != nil { 514 return newExons, err 515 } 516 if newExons.Location() != t { 517 return newExons, errors.New("exon location is not the transcript") 518 } 519 if newExons.Start() != 0 { 520 return newExons, errors.New("no exon with a zero start") 521 } 522 return newExons, nil 523 }