github.com/biogo/biogo@v1.0.4/align/pals/packseqs.go (about)

     1  // Copyright ©2011-2012 The bíogo Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package pals
     6  
     7  import (
     8  	"errors"
     9  	"fmt"
    10  
    11  	"github.com/biogo/biogo/alphabet"
    12  	"github.com/biogo/biogo/seq"
    13  	"github.com/biogo/biogo/seq/linear"
    14  	"github.com/biogo/biogo/util"
    15  )
    16  
    17  var (
    18  	binSize    int = 1 << 10
    19  	minPadding int = 50
    20  )
    21  
    22  // A Packer collects a set of sequence into a Packed sequence.
    23  type Packer struct {
    24  	packed  *Packed
    25  	lastPad int
    26  	length  int
    27  }
    28  
    29  type Packed struct {
    30  	*linear.Seq
    31  	seqMap
    32  }
    33  
    34  // Convert coordinates in a packed sequence into a feat.Feature.
    35  func (pa *Packed) feature(from, to int, comp bool) (*Feature, error) {
    36  	if comp {
    37  		from, to = pa.Len()-to, pa.Len()-from
    38  	}
    39  	if from >= to {
    40  		return nil, errors.New("pals: from > to")
    41  	}
    42  
    43  	// DPHit coordinates sometimes over/underflow.
    44  	// This is a lazy hack to work around it, should really figure
    45  	// out what is going on.
    46  	if from < 0 {
    47  		from = 0
    48  	}
    49  	if to > pa.Len() {
    50  		to = pa.Len()
    51  	}
    52  
    53  	// Take midpoint of segment -- lazy hack again, endpoints
    54  	// sometimes under / overflow
    55  	bin := (from + to) / (2 * binSize)
    56  	binCount := (pa.Len() + binSize - 1) / binSize
    57  
    58  	if bin < 0 || bin >= binCount {
    59  		return nil, fmt.Errorf("pals: bin %d out of range 0..%d", bin, binCount-1)
    60  	}
    61  
    62  	contigIndex := pa.seqMap.binMap[bin]
    63  
    64  	if contigIndex < 0 || contigIndex >= len(pa.seqMap.contigs) {
    65  		return nil, fmt.Errorf("pals: contig %s index %d out of range 0..%d", pa.ID, contigIndex, len(pa.seqMap.contigs))
    66  	}
    67  
    68  	length := to - from
    69  
    70  	if length < 0 {
    71  		return nil, errors.New("pals: length < 0")
    72  	}
    73  
    74  	contig := pa.seqMap.contigs[contigIndex]
    75  	contigFrom := from - contig.from
    76  	contigTo := contigFrom + length
    77  
    78  	if contigFrom < 0 {
    79  		contigFrom = 0
    80  	}
    81  
    82  	if contigTo > contig.Len() {
    83  		contigTo = contig.Len()
    84  	}
    85  
    86  	return &Feature{
    87  		ID:   contig.ID,
    88  		From: contigFrom,
    89  		To:   contigTo,
    90  		Loc:  Contig(contig.ID),
    91  	}, nil
    92  }
    93  
    94  // Create a new Packer.
    95  func NewPacker(id string) *Packer {
    96  	return &Packer{
    97  		packed: &Packed{
    98  			Seq:    &linear.Seq{Annotation: seq.Annotation{ID: id}},
    99  			seqMap: seqMap{},
   100  		},
   101  	}
   102  }
   103  
   104  // Pack a sequence into the Packed sequence. Returns a string giving diagnostic information.
   105  func (pa *Packer) Pack(seq *linear.Seq) (string, error) {
   106  	if pa.packed.Alpha == nil {
   107  		pa.packed.Alpha = seq.Alpha
   108  	} else if pa.packed.Alpha != seq.Alpha {
   109  		return "", errors.New("pals: alphabet mismatch")
   110  	}
   111  
   112  	c := contig{Seq: seq}
   113  
   114  	padding := binSize - seq.Len()%binSize
   115  	if padding < minPadding {
   116  		padding += binSize
   117  	}
   118  
   119  	pa.length += pa.lastPad
   120  	c.from = pa.length
   121  	pa.length += seq.Len()
   122  	pa.lastPad = padding
   123  
   124  	m := &pa.packed.seqMap
   125  	bins := make([]int, (padding+seq.Len())/binSize)
   126  	for i := 0; i < len(bins); i++ {
   127  		bins[i] = len(m.contigs)
   128  	}
   129  	m.binMap = append(m.binMap, bins...)
   130  	m.contigs = append(m.contigs, c)
   131  
   132  	return fmt.Sprintf("%20s\t%10d\t%7d-%-d", seq.ID[:util.Min(20, len(seq.ID))], seq.Len(), len(m.binMap)-len(bins), len(m.binMap)-1), nil
   133  }
   134  
   135  // Finalise the sequence packing.
   136  func (pa *Packer) FinalisePack() *Packed {
   137  	lastPad := 0
   138  	seq := make(alphabet.Letters, 0, pa.length)
   139  	for _, c := range pa.packed.seqMap.contigs {
   140  		padding := binSize - c.Len()%binSize
   141  		if padding < minPadding {
   142  			padding += binSize
   143  		}
   144  		seq = append(seq, alphabet.Letter('N').Repeat(lastPad)...)
   145  		seq = append(seq, c.Seq.Seq...)
   146  		lastPad = padding
   147  	}
   148  	pa.packed.Seq.Seq = seq
   149  
   150  	return pa.packed
   151  }
   152  
   153  // A contig holds a sequence within a SeqMap.
   154  type contig struct {
   155  	*linear.Seq
   156  	from int
   157  }
   158  
   159  // A seqMap is a collection of sequences mapped to a Packed sequence.
   160  type seqMap struct {
   161  	contigs []contig
   162  	binMap  []int
   163  }