github.com/biogo/biogo@v1.0.4/align/pals/filter/filter.go (about)

     1  // Copyright ©2011-2013 The bíogo Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // Package providing PALS sequence hit filter routines based on
     6  // 'Efficient q-gram filters for finding all 𝛜-matches over a given length.'
     7  //   Kim R. Rasmussen, Jens Stoye, and Eugene W. Myers. J. of Computational Biology 13:296–308 (2006).
     8  package filter
     9  
    10  import (
    11  	"github.com/biogo/biogo/index/kmerindex"
    12  	"github.com/biogo/biogo/morass"
    13  	"github.com/biogo/biogo/seq/linear"
    14  
    15  	"errors"
    16  )
    17  
    18  // Ukonnen's Lemma: U(n, q, 𝛜) := (n + 1) - q(⌊𝛜n⌋ + 1)
    19  func MinWordsPerFilterHit(hitLength, wordLength, maxErrors int) int {
    20  	return hitLength + 1 - wordLength*(maxErrors+1)
    21  }
    22  
    23  // Type for passing filter parameters.
    24  type Params struct {
    25  	WordSize   int
    26  	MinMatch   int
    27  	MaxError   int
    28  	TubeOffset int
    29  }
    30  
    31  // Filter implements a q-gram filter similar to that described in Rassmussen 2005.
    32  // This implementation is a translation of the C++ code written by Edgar and Myers.
    33  type Filter struct {
    34  	target         *linear.Seq
    35  	ki             *kmerindex.Index
    36  	tubes          []tubeState
    37  	morass         *morass.Morass
    38  	k              int
    39  	minMatch       int
    40  	maxError       int
    41  	maxKmerDist    int
    42  	minKmersPerHit int
    43  	tubeOffset     int
    44  	selfAlign      bool
    45  	complement     bool
    46  }
    47  
    48  // Return a new Filter using ki as the target, and filter parameters in params.
    49  func New(ki *kmerindex.Index, params *Params) (f *Filter) {
    50  	f = &Filter{
    51  		ki:         ki,
    52  		target:     ki.Seq(),
    53  		k:          ki.K(),
    54  		minMatch:   params.MinMatch,
    55  		maxError:   params.MaxError,
    56  		tubeOffset: params.TubeOffset,
    57  	}
    58  
    59  	return
    60  }
    61  
    62  // Filter a query sequence against the stored index. If query and the target are the same sequence,
    63  // selfAlign can be used to avoid double searching - behavior is undefined if the the sequences are not the same.
    64  // A morass is used to store and sort individual filter hits.
    65  func (f *Filter) Filter(query *linear.Seq, selfAlign, complement bool, morass *morass.Morass) error {
    66  	f.selfAlign = selfAlign
    67  	f.complement = complement
    68  	f.morass = morass
    69  	f.k = f.ki.K()
    70  
    71  	// Ukonnen's Lemma
    72  	f.minKmersPerHit = MinWordsPerFilterHit(f.minMatch, f.k, f.maxError)
    73  
    74  	// Maximum distance between SeqQ positions of two k-mers in a match
    75  	// (More stringent bounds may be possible, but not a big problem
    76  	// if two adjacent matches get merged).
    77  	f.maxKmerDist = f.minMatch - f.k
    78  
    79  	tubeWidth := f.tubeOffset + f.maxError
    80  
    81  	if f.tubeOffset < f.maxError {
    82  		return errors.New("filter: TubeOffset < MaxError")
    83  	}
    84  
    85  	maxActiveTubes := (f.target.Len()+tubeWidth-1)/f.tubeOffset + 1
    86  	f.tubes = make([]tubeState, maxActiveTubes)
    87  
    88  	// Ticker tracks cycling of circular list of active tubes.
    89  	ticker := tubeWidth
    90  
    91  	var err error
    92  	err = f.ki.ForEachKmerOf(query, 0, query.Len(), func(ki *kmerindex.Index, position, kmer int) {
    93  		from := 0
    94  		if kmer > 0 {
    95  			from = ki.FingerAt(kmer - 1)
    96  		}
    97  		to := ki.FingerAt(kmer)
    98  		for i := from; i < to; i++ {
    99  			f.commonKmer(ki.PosAt(i), position)
   100  		}
   101  
   102  		if ticker--; ticker == 0 {
   103  			if e := f.tubeEnd(position); e != nil {
   104  				panic(e) // Caught by fastkmerindex.ForEachKmerOf and returned
   105  			}
   106  			ticker = f.tubeOffset
   107  		}
   108  	})
   109  	if err != nil {
   110  		return err
   111  	}
   112  
   113  	err = f.tubeEnd(query.Len() - 1)
   114  	if err != nil {
   115  		return err
   116  	}
   117  
   118  	diagFrom := f.diagIndex(f.target.Len()-1, query.Len()-1) - tubeWidth
   119  	diagTo := f.diagIndex(0, query.Len()-1) + tubeWidth
   120  
   121  	tubeFrom := f.tubeIndex(diagFrom)
   122  	if tubeFrom < 0 {
   123  		tubeFrom = 0
   124  	}
   125  
   126  	tubeTo := f.tubeIndex(diagTo)
   127  
   128  	for tubeIndex := tubeFrom; tubeIndex <= tubeTo; tubeIndex++ {
   129  		err = f.tubeFlush(tubeIndex)
   130  		if err != nil {
   131  			return err
   132  		}
   133  	}
   134  
   135  	f.tubes = nil
   136  
   137  	return f.morass.Finalise()
   138  }
   139  
   140  // A tubeState stores active filter bin states.
   141  // tubeState is repeated in the pals package to allow memory calculation without exporting tubeState from filter package.
   142  type tubeState struct {
   143  	QLo   int
   144  	QHi   int
   145  	Count int
   146  }
   147  
   148  // Called when q=Qlen - 1 to flush any hits in each tube.
   149  func (f *Filter) tubeFlush(tubeIndex int) error {
   150  	tube := &f.tubes[tubeIndex%cap(f.tubes)]
   151  
   152  	if tube.Count < f.minKmersPerHit {
   153  		return nil
   154  	}
   155  
   156  	err := f.addHit(tubeIndex, tube.QLo, tube.QHi)
   157  	if err != nil {
   158  		return err
   159  	}
   160  	tube.Count = 0
   161  
   162  	return nil
   163  }
   164  
   165  func (f *Filter) diagIndex(t, q int) int {
   166  	return f.target.Len() - t + q
   167  }
   168  
   169  func (f *Filter) tubeIndex(d int) int {
   170  	return d / f.tubeOffset
   171  }
   172  
   173  // Found a common k-mer SeqT[t] and SeqQ[q].
   174  func (f *Filter) commonKmer(t, q int) error {
   175  	if f.selfAlign && ((f.complement && (q < f.target.Len()-t)) || (!f.complement && (q <= t))) {
   176  		return nil
   177  	}
   178  
   179  	diagIndex := f.diagIndex(t, q)
   180  	tubeIndex := f.tubeIndex(diagIndex)
   181  
   182  	err := f.hitTube(tubeIndex, q)
   183  	if err != nil {
   184  		return err
   185  	}
   186  
   187  	// Hit in overlapping tube preceding this one?
   188  	if diagIndex%f.tubeOffset < f.maxError {
   189  		if tubeIndex == 0 {
   190  			tubeIndex = cap(f.tubes) - 1
   191  		} else {
   192  			tubeIndex--
   193  		}
   194  		err = f.hitTube(tubeIndex, q)
   195  	}
   196  
   197  	return err
   198  }
   199  
   200  func (f *Filter) hitTube(tubeIndex, q int) error {
   201  	tube := &f.tubes[tubeIndex%cap(f.tubes)]
   202  
   203  	if tube.Count == 0 {
   204  		tube.Count = 1
   205  		tube.QLo = q
   206  		tube.QHi = q
   207  		return nil
   208  	}
   209  
   210  	if q-tube.QHi > f.maxKmerDist {
   211  		if tube.Count >= f.minKmersPerHit {
   212  			err := f.addHit(tubeIndex, tube.QLo, tube.QHi)
   213  			if err != nil {
   214  				return err
   215  			}
   216  		}
   217  
   218  		tube.Count = 1
   219  		tube.QLo = q
   220  		tube.QHi = q
   221  		return nil
   222  	}
   223  
   224  	tube.Count++
   225  	tube.QHi = q
   226  
   227  	return nil
   228  }
   229  
   230  // Called when end of a tube is reached
   231  // A point in the tube -- the point with maximal q -- is (Tlen-1,q-1).
   232  func (f *Filter) tubeEnd(q int) error {
   233  	diagIndex := f.diagIndex(f.target.Len()-1, q-1)
   234  	tubeIndex := f.tubeIndex(diagIndex)
   235  	tube := &f.tubes[tubeIndex%cap(f.tubes)]
   236  
   237  	if tube.Count >= f.minKmersPerHit {
   238  		err := f.addHit(tubeIndex, tube.QLo, tube.QHi)
   239  		if err != nil {
   240  			return err
   241  		}
   242  	}
   243  
   244  	tube.Count = 0
   245  
   246  	return nil
   247  }
   248  
   249  func (f *Filter) addHit(tubeIndex, QLo, QHi int) error {
   250  	fh := Hit{
   251  		From:     QLo,
   252  		To:       QHi + f.k,
   253  		Diagonal: f.target.Len() - tubeIndex*f.tubeOffset,
   254  	}
   255  
   256  	return f.morass.Push(fh)
   257  }