github.com/biogo/biogo@v1.0.4/align/pals/piler.go (about)

     1  // Copyright ©2012 The bíogo Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package pals
     6  
     7  import (
     8  	"fmt"
     9  	"log"
    10  
    11  	"github.com/biogo/biogo/feat"
    12  	"github.com/biogo/store/interval"
    13  )
    14  
    15  var duplicatePair = fmt.Errorf("pals: attempt to add duplicate feature pair to pile")
    16  
    17  // Note Location must be comparable according to http://golang.org/ref/spec#Comparison_operators.
    18  type pileInterval struct {
    19  	id         uintptr
    20  	start, end int
    21  	pile       *Pile
    22  	location   feat.Feature
    23  	images     []*Feature
    24  	overlap    int
    25  }
    26  
    27  func (i *pileInterval) Overlap(b interval.IntRange) bool {
    28  	return i.end-i.overlap >= b.Start && i.start <= b.End-i.overlap
    29  }
    30  func (i *pileInterval) ID() uintptr { return i.id }
    31  func (i *pileInterval) Range() interval.IntRange {
    32  	return interval.IntRange{Start: i.start + i.overlap, End: i.end - i.overlap}
    33  }
    34  
    35  // A Piler performs the aggregation of feature pairs according to the description in section 2.3
    36  // of Edgar and Myers (2005) using an interval tree, giving O(nlogn) time but better space complexity
    37  // and flexibility with feature overlap.
    38  type Piler struct {
    39  	// Logger logs pile construction during
    40  	// Piles calls if non-nil.
    41  	Logger *log.Logger
    42  	// LogFreq specifies how frequently
    43  	// log lines are witten if not zero.
    44  	LogFreq int
    45  
    46  	intervals map[feat.Feature]*interval.IntTree
    47  	seen      map[[2]sf]struct{}
    48  	overlap   int
    49  	piled     bool
    50  
    51  	// next provides the next ID
    52  	// for merged intervals. IDs
    53  	// are unique across all intervals.
    54  	next uintptr
    55  }
    56  
    57  type sf struct {
    58  	loc  feat.Feature
    59  	s, e int
    60  }
    61  
    62  // NewPiler creates a Piler object ready for piling feature pairs.
    63  func NewPiler(overlap int) *Piler {
    64  	return &Piler{
    65  		intervals: make(map[feat.Feature]*interval.IntTree),
    66  		seen:      make(map[[2]sf]struct{}),
    67  		overlap:   overlap,
    68  	}
    69  }
    70  
    71  // Add adds a feature pair to the piler incorporating the features into piles where appropriate.
    72  func (p *Piler) Add(fp *Pair) error {
    73  	a := sf{loc: fp.A.Location(), s: fp.A.Start(), e: fp.A.End()}
    74  	b := sf{loc: fp.B.Location(), s: fp.B.Start(), e: fp.B.End()}
    75  	ab, ba := [2]sf{a, b}, [2]sf{b, a}
    76  
    77  	if _, ok := p.seen[ab]; ok {
    78  		return duplicatePair
    79  	}
    80  	if _, ok := p.seen[ba]; ok {
    81  		return duplicatePair
    82  	}
    83  
    84  	p.merge(&pileInterval{id: p.nextID(), start: fp.A.Start(), end: fp.A.End(), location: fp.A.Location(), images: []*Feature{fp.A}, overlap: p.overlap})
    85  	p.merge(&pileInterval{id: p.nextID(), start: fp.B.Start(), end: fp.B.End(), location: fp.B.Location(), images: []*Feature{fp.B}, overlap: p.overlap})
    86  	p.seen[ab] = struct{}{}
    87  
    88  	return nil
    89  }
    90  
    91  func (p *Piler) nextID() uintptr {
    92  	id := p.next
    93  	p.next++
    94  	return id
    95  }
    96  
    97  func min(a, b int) int {
    98  	if a < b {
    99  		return a
   100  	}
   101  	return b
   102  }
   103  
   104  func max(a, b int) int {
   105  	if a > b {
   106  		return a
   107  	}
   108  	return b
   109  }
   110  
   111  // merge merges an interval into the tree moving location meta data from the replaced intervals
   112  // into the new interval.
   113  func (p *Piler) merge(pi *pileInterval) {
   114  	var (
   115  		f  = true
   116  		r  []interval.IntInterface
   117  		qi = &pileInterval{start: pi.start, end: pi.end}
   118  	)
   119  	t, ok := p.intervals[pi.location]
   120  	if !ok {
   121  		t = &interval.IntTree{}
   122  		p.intervals[pi.location] = t
   123  	}
   124  	t.DoMatching(
   125  		func(e interval.IntInterface) (done bool) {
   126  			r = append(r, e)
   127  			iv := e.(*pileInterval)
   128  			pi.images = append(pi.images, iv.images...)
   129  			if f {
   130  				pi.start = min(iv.start, pi.start)
   131  				f = false
   132  			}
   133  			pi.end = max(iv.end, pi.end)
   134  			return
   135  		},
   136  		qi,
   137  	)
   138  	for _, d := range r {
   139  		t.Delete(d, false)
   140  	}
   141  	t.Insert(pi, false)
   142  }
   143  
   144  // A PairFilter is used to determine whether a Pair's images are included in a Pile.
   145  type PairFilter func(*Pair) bool
   146  
   147  // Piles returns a slice of piles determined by application of the filter function f to
   148  // the feature pairs that have been added to the piler. Piles may be called more than once,
   149  // but the piles returned in earlier invocations will be altered by subsequent calls.
   150  func (p *Piler) Piles(f PairFilter) []*Pile {
   151  	var n int
   152  	if !p.piled {
   153  		for _, t := range p.intervals {
   154  			t.Do(func(e interval.IntInterface) (done bool) {
   155  				pa := e.(*pileInterval)
   156  				if pa.pile == nil {
   157  					pa.pile = &Pile{Loc: pa.location, From: pa.start, To: pa.end}
   158  				}
   159  				for _, im := range pa.images {
   160  					if checkSanity {
   161  						assertPileSanity(t, im, pa.pile)
   162  					}
   163  					im.Loc = pa.pile
   164  				}
   165  				return
   166  			})
   167  			n++
   168  			if p.Logger != nil && p.LogFreq != 0 && n%p.LogFreq == 0 {
   169  				p.Logger.Printf("piled %d intervals of %d", n, len(p.intervals))
   170  			}
   171  		}
   172  		p.piled = true
   173  	}
   174  
   175  	n = 0
   176  	var piles []*Pile
   177  	for _, t := range p.intervals {
   178  		t.Do(func(e interval.IntInterface) (done bool) {
   179  			pa := e.(*pileInterval)
   180  			pa.pile.Images = pa.pile.Images[:0]
   181  			for _, im := range pa.images {
   182  				if f != nil && !f(im.Pair) {
   183  					continue
   184  				}
   185  				if checkSanity {
   186  					assertPairSanity(im)
   187  				}
   188  				pa.pile.Images = append(pa.pile.Images, im)
   189  			}
   190  			piles = append(piles, pa.pile)
   191  			return
   192  		})
   193  		n++
   194  		if p.Logger != nil && p.LogFreq != 0 && n%p.LogFreq == 0 {
   195  			p.Logger.Printf("filtered %d intervals of %d", n, len(p.intervals))
   196  		}
   197  	}
   198  
   199  	return piles
   200  }
   201  
   202  const checkSanity = false
   203  
   204  func assertPileSanity(t *interval.IntTree, im *Feature, pi *Pile) {
   205  	if im.Start() < pi.Start() || im.End() > pi.End() {
   206  		panic(fmt.Sprintf("image extends beyond pile: %#v", im))
   207  	}
   208  	if foundPiles := t.Get(&pileInterval{start: im.Start(), end: im.End()}); len(foundPiles) > 1 {
   209  		var containing int
   210  		for _, pile := range foundPiles {
   211  			r := pile.Range()
   212  			if (r.Start <= im.Start() && r.End > im.End()) || (r.Start < im.Start() && r.End >= im.End()) {
   213  				containing++
   214  			}
   215  		}
   216  		if containing > 1 {
   217  			panic(fmt.Sprintf("found too many piles for %#v", im))
   218  		}
   219  	}
   220  }
   221  
   222  func assertPairSanity(im *Feature) {
   223  	if _, ok := im.Loc.(*Pile); !ok {
   224  		panic(fmt.Sprintf("image not allocated to pile %#v", im))
   225  	}
   226  	if _, ok := im.Mate().Loc.(*Pile); !ok {
   227  		panic(fmt.Sprintf("image mate not allocated to pile %#v", im.Mate()))
   228  	}
   229  }