github.com/biogo/biogo@v1.0.4/seq/alignment/alignment.go (about) 1 // Copyright ©2011-2013 The bíogo Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 // Package alignment handles aligned sequences stored as columns. 6 package alignment 7 8 import ( 9 "github.com/biogo/biogo/alphabet" 10 "github.com/biogo/biogo/feat" 11 "github.com/biogo/biogo/seq" 12 "github.com/biogo/biogo/seq/linear" 13 "github.com/biogo/biogo/util" 14 15 "errors" 16 "fmt" 17 "strings" 18 "unicode" 19 ) 20 21 // A Seq is an aligned sequence. 22 type Seq struct { 23 seq.Annotation 24 SubAnnotations []seq.Annotation 25 Seq alphabet.Columns 26 ColumnConsense seq.ConsenseFunc 27 } 28 29 // NewSeq creates a new Seq with the given id, letter sequence and alphabet. 30 func NewSeq(id string, subids []string, b [][]alphabet.Letter, alpha alphabet.Alphabet, cons seq.ConsenseFunc) (*Seq, error) { 31 var ( 32 lids, lseq = len(subids), len(b) 33 subann []seq.Annotation 34 ) 35 switch { 36 case lids == 0 && len(b) == 0: 37 case lseq != 0 && lids == len(b[0]): 38 if lids == 0 { 39 subann = make([]seq.Annotation, len(b[0])) 40 for i := range subids { 41 subann[i].ID = fmt.Sprintf("%s:%d", id, i) 42 } 43 } else { 44 subann = make([]seq.Annotation, lids) 45 for i, sid := range subids { 46 subann[i].ID = sid 47 } 48 } 49 default: 50 return nil, errors.New("alignment: id/seq number mismatch") 51 } 52 53 return &Seq{ 54 Annotation: seq.Annotation{ 55 ID: id, 56 Alpha: alpha, 57 }, 58 SubAnnotations: subann, 59 Seq: append([][]alphabet.Letter(nil), b...), 60 ColumnConsense: cons, 61 }, nil 62 } 63 64 // Interface guarantees 65 var ( 66 _ feat.Feature = (*Seq)(nil) 67 _ feat.Feature = Row{} 68 _ seq.Sequence = Row{} 69 ) 70 71 // Slice returns the sequence data as a alphabet.Slice. 72 func (s *Seq) Slice() alphabet.Slice { return s.Seq } 73 74 // SetSlice sets the sequence data represented by the Seq. SetSlice will panic if sl 75 // is not a Columns. 76 func (s *Seq) SetSlice(sl alphabet.Slice) { s.Seq = sl.(alphabet.Columns) } 77 78 // Len returns the length of the alignment. 79 func (s *Seq) Len() int { return len(s.Seq) } 80 81 // Rows returns the number of rows in the alignment. 82 func (s *Seq) Rows() int { return s.Seq.Rows() } 83 84 // Start returns the start position of the sequence in coordinates relative to the 85 // sequence location. 86 func (s *Seq) Start() int { return s.Offset } 87 88 // End returns the end position of the sequence in coordinates relative to the 89 // sequence location. 90 func (s *Seq) End() int { return s.Offset + s.Len() } 91 92 // Clone returns a copy of the sequence. 93 func (s *Seq) Clone() seq.Rower { 94 c := *s 95 c.Seq = make(alphabet.Columns, len(s.Seq)) 96 for i, cs := range s.Seq { 97 c.Seq[i] = append([]alphabet.Letter(nil), cs...) 98 } 99 100 return &c 101 } 102 103 // New returns an empty *Seq sequence with the same alphabet. 104 func (s *Seq) New() *Seq { 105 return &Seq{Annotation: seq.Annotation{Alpha: s.Alpha}} 106 } 107 108 // RevComp reverse complements the sequence. RevComp will panic if the alphabet used by 109 // the receiver is not a Complementor. 110 func (s *Seq) RevComp() { 111 rs, comp := s.Seq, s.Alpha.(alphabet.Complementor).ComplementTable() 112 i, j := 0, len(rs)-1 113 for ; i < j; i, j = i+1, j-1 { 114 for r := range rs[i] { 115 rs[i][r], rs[j][r] = comp[rs[j][r]], comp[rs[i][r]] 116 } 117 } 118 if i == j { 119 for r := range rs[i] { 120 rs[i][r] = comp[rs[i][r]] 121 } 122 } 123 s.Strand = -s.Strand 124 } 125 126 // Reverse reverses the order of letters in the the sequence without complementing them. 127 func (s *Seq) Reverse() { 128 l := s.Seq 129 for i, j := 0, len(l)-1; i < j; i, j = i+1, j-1 { 130 l[i], l[j] = l[j], l[i] 131 } 132 s.Strand = seq.None 133 } 134 135 func (s *Seq) String() string { 136 return s.Consensus(false).String() 137 } 138 139 // Add adds the sequences n to Seq. Sequences in n should align start and end with the receiving alignment. 140 // Additional sequence will be clipped and missing sequence will be filled with the gap letter. 141 func (s *Seq) Add(n ...seq.Sequence) error { 142 for i := s.Start(); i < s.End(); i++ { 143 s.Seq[i] = append(s.Seq[i], s.column(n, i)...) 144 } 145 for i := range n { 146 s.SubAnnotations = append(s.SubAnnotations, *n[i].CloneAnnotation()) 147 } 148 149 return nil 150 } 151 152 func (s *Seq) column(m []seq.Sequence, pos int) []alphabet.Letter { 153 c := make([]alphabet.Letter, 0, s.Rows()) 154 155 for _, ss := range m { 156 if a, ok := ss.(seq.Aligned); ok { 157 if a.Start() <= pos && pos < a.End() { 158 c = append(c, a.Column(pos, true)...) 159 } else { 160 c = append(c, s.Alpha.Gap().Repeat(a.Rows())...) 161 } 162 } else { 163 if ss.Start() <= pos && pos < ss.End() { 164 c = append(c, ss.At(pos).L) 165 } else { 166 c = append(c, s.Alpha.Gap()) 167 } 168 } 169 } 170 171 return c 172 } 173 174 // Delete removes the sequence represented at row i of the alignment. It panics if i is out of range. 175 func (s *Seq) Delete(i int) { 176 if i >= s.Rows() { 177 panic("alignment: index out of range") 178 } 179 cs := s.Seq 180 for j, c := range cs { 181 cs[j] = c[:i+copy(c[i:], c[i+1:])] 182 } 183 sa := s.SubAnnotations 184 s.SubAnnotations = sa[:i+copy(sa[i:], sa[i+1:])] 185 } 186 187 // Row returns the sequence represented at row i of the alignment. It panics is i is out of range. 188 func (s *Seq) Row(i int) seq.Sequence { 189 if i < 0 || i >= s.Rows() { 190 panic("alignment: index out of range") 191 } 192 return Row{Align: s, Row: i} 193 } 194 195 // AppendColumns appends each Qletter of each element of a to the appropriate sequence in the receiver. 196 func (s *Seq) AppendColumns(a ...[]alphabet.QLetter) error { 197 for i, r := range a { 198 if len(r) != s.Rows() { 199 return fmt.Errorf("alignment: column %d does not match Rows(): %d != %d.", i, len(r), s.Rows()) 200 } 201 } 202 203 s.Seq = append(s.Seq, make([][]alphabet.Letter, len(a))...)[:len(s.Seq)] 204 for _, r := range a { 205 c := make([]alphabet.Letter, len(r)) 206 for i := range r { 207 c[i] = r[i].L 208 } 209 s.Seq = append(s.Seq, c) 210 } 211 212 return nil 213 } 214 215 // AppendEach appends each []alphabet.QLetter in a to the appropriate sequence in the receiver. 216 func (s *Seq) AppendEach(a [][]alphabet.QLetter) error { 217 if len(a) != s.Rows() { 218 return fmt.Errorf("alignment: number of sequences does not match Rows(): %d != %d.", len(a), s.Rows()) 219 } 220 max := util.MinInt 221 for _, ss := range a { 222 if l := len(ss); l > max { 223 max = l 224 } 225 } 226 s.Seq = append(s.Seq, make([][]alphabet.Letter, max)...)[:len(s.Seq)] 227 for i, b := 0, make([]alphabet.QLetter, 0, len(a)); i < max; i, b = i+1, b[:0] { 228 for _, ss := range a { 229 if i < len(ss) { 230 b = append(b, ss[i]) 231 } else { 232 b = append(b, alphabet.QLetter{L: s.Alpha.Gap()}) 233 } 234 } 235 s.AppendColumns(b) 236 } 237 238 return nil 239 } 240 241 // Column returns a slice of letters reflecting the column at pos. 242 func (s *Seq) Column(pos int, _ bool) []alphabet.Letter { 243 return s.Seq[pos] 244 } 245 246 // ColumnQL returns a slice of quality letters reflecting the column at pos. 247 func (s *Seq) ColumnQL(pos int, _ bool) []alphabet.QLetter { 248 c := make([]alphabet.QLetter, s.Rows()) 249 for i, l := range s.Seq[pos] { 250 c[i] = alphabet.QLetter{ 251 L: l, 252 Q: seq.DefaultQphred, 253 } 254 } 255 256 return c 257 } 258 259 // Consensus returns a quality sequence reflecting the consensus of the receiver determined by the 260 // ColumnConsense field. 261 func (s *Seq) Consensus(_ bool) *linear.QSeq { 262 cs := make([]alphabet.QLetter, 0, s.Len()) 263 alpha := s.Alphabet() 264 for i := range s.Seq { 265 cs = append(cs, s.ColumnConsense(s, alpha, i, false)) 266 } 267 268 qs := linear.NewQSeq("Consensus:"+s.ID, cs, s.Alpha, alphabet.Sanger) 269 qs.Strand = s.Strand 270 qs.SetOffset(s.Offset) 271 qs.Conform = s.Conform 272 273 return qs 274 } 275 276 // Format is a support routine for fmt.Formatter. It accepts the formats 'v' and 's' 277 // (string), 'a' (fasta) and 'q' (fastq). String, fasta and fastq formats support 278 // truncated output via the verb's precision. Fasta format supports sequence line 279 // specification via the verb's width field. Fastq format supports optional inclusion 280 // of the '+' line descriptor line with the '+' flag. The 'v' verb supports the '#' 281 // flag for Go syntax output. The 's' and 'v' formats support the '-' flag for 282 // omission of the sequence name. 283 func (s *Seq) Format(fs fmt.State, c rune) { 284 if s == nil { 285 fmt.Fprint(fs, "<nil>") 286 return 287 } 288 switch c { 289 case 'v': 290 if fs.Flag('#') { 291 fmt.Fprintf(fs, "&%#v", *s) 292 return 293 } 294 fallthrough 295 case 's', 'a', 'q': 296 r := Row{Align: s} 297 for r.Row = 0; r.Row < s.Rows(); r.Row++ { 298 r.Format(fs, c) 299 if r.Row < s.Rows()-1 { 300 fmt.Fprintln(fs) 301 } 302 } 303 default: 304 fmt.Fprintf(fs, "%%!%c(*alignment.Seq=%.10s)", c, s) 305 } 306 } 307 308 // A Row is a pointer into an alignment that satisfies the seq.Sequence interface. 309 type Row struct { 310 Align *Seq 311 Row int 312 } 313 314 // At returns the letter at position i. 315 func (r Row) At(i int) alphabet.QLetter { 316 return alphabet.QLetter{ 317 L: r.Align.Seq[i-r.Align.Offset][r.Row], 318 Q: seq.DefaultQphred, 319 } 320 } 321 322 // Set sets the letter at position i to l. 323 func (r Row) Set(i int, l alphabet.QLetter) error { 324 r.Align.Seq[i-r.Align.Offset][r.Row] = l.L 325 return nil 326 } 327 328 // Len returns the length of the row. 329 func (r Row) Len() int { return len(r.Align.Seq) } 330 331 // Start returns the start position of the sequence in coordinates relative to the 332 // sequence location. 333 func (r Row) Start() int { return r.Align.SubAnnotations[r.Row].Offset } 334 335 // End returns the end position of the sequence in coordinates relative to the 336 // sequence location. 337 func (r Row) End() int { return r.Start() + r.Len() } 338 339 // Location returns the feature containing the row's sequence. 340 func (r Row) Location() feat.Feature { return r.Align.SubAnnotations[r.Row].Loc } 341 342 func (r Row) Alphabet() alphabet.Alphabet { return r.Align.Alpha } 343 func (r Row) Conformation() feat.Conformation { return r.Align.Conform } 344 func (r Row) SetConformation(c feat.Conformation) error { 345 r.Align.SubAnnotations[r.Row].Conform = c 346 return nil 347 } 348 func (r Row) Name() string { 349 return r.Align.SubAnnotations[r.Row].ID 350 } 351 func (r Row) Description() string { return r.Align.SubAnnotations[r.Row].Desc } 352 func (r Row) SetOffset(o int) error { r.Align.SubAnnotations[r.Row].Offset = o; return nil } 353 354 func (r Row) RevComp() { 355 rs, comp := r.Align.Seq, r.Alphabet().(alphabet.Complementor).ComplementTable() 356 i, j := 0, len(rs)-1 357 for ; i < j; i, j = i+1, j-1 { 358 rs[i][r.Row], rs[j][r.Row] = comp[rs[j][r.Row]], comp[rs[i][r.Row]] 359 } 360 if i == j { 361 rs[i][r.Row] = comp[rs[i][r.Row]] 362 } 363 r.Align.SubAnnotations[r.Row].Strand = -r.Align.SubAnnotations[r.Row].Strand 364 } 365 func (r Row) Reverse() { 366 l := r.Align.Seq 367 for i, j := 0, len(l)-1; i < j; i, j = i+1, j-1 { 368 l[i][r.Row], l[j][r.Row] = l[j][r.Row], l[i][r.Row] 369 } 370 r.Align.SubAnnotations[r.Row].Strand = seq.None 371 } 372 func (r Row) New() seq.Sequence { 373 return Row{Align: &Seq{Annotation: seq.Annotation{Alpha: r.Align.Alpha}}} 374 } 375 func (r Row) Clone() seq.Sequence { 376 b := make([]alphabet.Letter, r.Len()) 377 for i, c := range r.Align.Seq { 378 b[i] = c[r.Row] 379 } 380 switch { 381 case r.Row < 0: 382 panic("under") 383 case r.Row >= r.Align.Rows(): 384 panic("bang over Rows()") 385 case r.Row >= len(r.Align.SubAnnotations): 386 387 panic(fmt.Sprintf("bang over len(SubAnns): %d %d", r.Row, len(r.Align.SubAnnotations))) 388 } 389 return linear.NewSeq(r.Name(), b, r.Alphabet()) 390 } 391 func (r Row) CloneAnnotation() *seq.Annotation { 392 return r.Align.SubAnnotations[r.Row].CloneAnnotation() 393 } 394 395 // String returns a string representation of the sequence data only. 396 func (r Row) String() string { return fmt.Sprintf("%-s", r) } 397 398 // Format is a support routine for fmt.Formatter. It accepts the formats 'v' and 's' 399 // (string), 'a' (fasta) and 'q' (fastq). String, fasta and fastq formats support 400 // truncated output via the verb's precision. Fasta format supports sequence line 401 // specification via the verb's width field. Fastq format supports optional inclusion 402 // of the '+' line descriptor line with the '+' flag. The 'v' verb supports the '#' 403 // flag for Go syntax output. The 's' and 'v' formats support the '-' flag for 404 // omission of the sequence name. 405 func (r Row) Format(fs fmt.State, c rune) { 406 var ( 407 s = r.Align 408 w, wOk = fs.Width() 409 p, pOk = fs.Precision() 410 buf alphabet.Columns 411 ) 412 if s != nil { 413 if pOk { 414 buf = s.Seq[:min(p, len(s.Seq))] 415 } else { 416 buf = s.Seq 417 } 418 } 419 420 switch c { 421 case 'v': 422 if fs.Flag('#') { 423 type shadowRow Row 424 sr := fmt.Sprintf("%#v", shadowRow(r)) 425 fmt.Fprintf(fs, "%T%s", r, sr[strings.Index(sr, "{"):]) 426 return 427 } 428 fallthrough 429 case 's': 430 if s == nil { 431 fmt.Fprint(fs, "<nil>") 432 return 433 } 434 if !fs.Flag('-') { 435 fmt.Fprintf(fs, "%q ", r.Name()) 436 } 437 for _, lc := range buf { 438 fmt.Fprintf(fs, "%c", lc[r.Row]) 439 } 440 if pOk && s != nil && p < s.Len() { 441 fmt.Fprint(fs, "...") 442 } 443 case 'a': 444 if s == nil { 445 return 446 } 447 r.formatDescLineTo(fs, '>') 448 for i, lc := range buf { 449 fmt.Fprintf(fs, "%c", lc[r.Row]) 450 if wOk && i < s.Len()-1 && i%w == w-1 { 451 fmt.Fprintln(fs) 452 } 453 } 454 if pOk && p < s.Len() { 455 fmt.Fprint(fs, "...") 456 } 457 case 'q': 458 if s == nil { 459 return 460 } 461 r.formatDescLineTo(fs, '@') 462 for _, lc := range buf { 463 fmt.Fprintf(fs, "%c", lc[r.Row]) 464 } 465 if pOk && p < s.Len() { 466 fmt.Fprintln(fs, "...") 467 } else { 468 fmt.Fprintln(fs) 469 } 470 if fs.Flag('+') { 471 r.formatDescLineTo(fs, '+') 472 } else { 473 fmt.Fprintln(fs, "+") 474 } 475 e := seq.DefaultQphred.Encode(seq.DefaultEncoding) 476 if e >= unicode.MaxASCII { 477 e = unicode.MaxASCII - 1 478 } 479 for range buf { 480 fmt.Fprintf(fs, "%c", e) 481 } 482 if pOk && p < s.Len() { 483 fmt.Fprint(fs, "...") 484 } 485 default: 486 fmt.Fprintf(fs, "%%!%c(alignment.Row=%.10s)", c, s) 487 } 488 } 489 490 func (r Row) formatDescLineTo(fs fmt.State, p rune) { 491 fmt.Fprintf(fs, "%c%s", p, r.Name()) 492 if d := r.Description(); d != "" { 493 fmt.Fprintf(fs, " %s", d) 494 } 495 fmt.Fprintln(fs) 496 } 497 498 // SetSlice unconditionally panics. 499 func (r Row) SetSlice(_ alphabet.Slice) { panic("alignment: cannot alter row slice") } 500 501 // Slice unconditionally panics. 502 func (r Row) Slice() alphabet.Slice { panic("alignment: cannot get row slice") }