github.com/biogo/biogo@v1.0.4/align/pals/filter/filter.go (about) 1 // Copyright ©2011-2013 The bíogo Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 // Package providing PALS sequence hit filter routines based on 6 // 'Efficient q-gram filters for finding all 𝛜-matches over a given length.' 7 // Kim R. Rasmussen, Jens Stoye, and Eugene W. Myers. J. of Computational Biology 13:296–308 (2006). 8 package filter 9 10 import ( 11 "github.com/biogo/biogo/index/kmerindex" 12 "github.com/biogo/biogo/morass" 13 "github.com/biogo/biogo/seq/linear" 14 15 "errors" 16 ) 17 18 // Ukonnen's Lemma: U(n, q, 𝛜) := (n + 1) - q(⌊𝛜n⌋ + 1) 19 func MinWordsPerFilterHit(hitLength, wordLength, maxErrors int) int { 20 return hitLength + 1 - wordLength*(maxErrors+1) 21 } 22 23 // Type for passing filter parameters. 24 type Params struct { 25 WordSize int 26 MinMatch int 27 MaxError int 28 TubeOffset int 29 } 30 31 // Filter implements a q-gram filter similar to that described in Rassmussen 2005. 32 // This implementation is a translation of the C++ code written by Edgar and Myers. 33 type Filter struct { 34 target *linear.Seq 35 ki *kmerindex.Index 36 tubes []tubeState 37 morass *morass.Morass 38 k int 39 minMatch int 40 maxError int 41 maxKmerDist int 42 minKmersPerHit int 43 tubeOffset int 44 selfAlign bool 45 complement bool 46 } 47 48 // Return a new Filter using ki as the target, and filter parameters in params. 49 func New(ki *kmerindex.Index, params *Params) (f *Filter) { 50 f = &Filter{ 51 ki: ki, 52 target: ki.Seq(), 53 k: ki.K(), 54 minMatch: params.MinMatch, 55 maxError: params.MaxError, 56 tubeOffset: params.TubeOffset, 57 } 58 59 return 60 } 61 62 // Filter a query sequence against the stored index. If query and the target are the same sequence, 63 // selfAlign can be used to avoid double searching - behavior is undefined if the the sequences are not the same. 64 // A morass is used to store and sort individual filter hits. 65 func (f *Filter) Filter(query *linear.Seq, selfAlign, complement bool, morass *morass.Morass) error { 66 f.selfAlign = selfAlign 67 f.complement = complement 68 f.morass = morass 69 f.k = f.ki.K() 70 71 // Ukonnen's Lemma 72 f.minKmersPerHit = MinWordsPerFilterHit(f.minMatch, f.k, f.maxError) 73 74 // Maximum distance between SeqQ positions of two k-mers in a match 75 // (More stringent bounds may be possible, but not a big problem 76 // if two adjacent matches get merged). 77 f.maxKmerDist = f.minMatch - f.k 78 79 tubeWidth := f.tubeOffset + f.maxError 80 81 if f.tubeOffset < f.maxError { 82 return errors.New("filter: TubeOffset < MaxError") 83 } 84 85 maxActiveTubes := (f.target.Len()+tubeWidth-1)/f.tubeOffset + 1 86 f.tubes = make([]tubeState, maxActiveTubes) 87 88 // Ticker tracks cycling of circular list of active tubes. 89 ticker := tubeWidth 90 91 var err error 92 err = f.ki.ForEachKmerOf(query, 0, query.Len(), func(ki *kmerindex.Index, position, kmer int) { 93 from := 0 94 if kmer > 0 { 95 from = ki.FingerAt(kmer - 1) 96 } 97 to := ki.FingerAt(kmer) 98 for i := from; i < to; i++ { 99 f.commonKmer(ki.PosAt(i), position) 100 } 101 102 if ticker--; ticker == 0 { 103 if e := f.tubeEnd(position); e != nil { 104 panic(e) // Caught by fastkmerindex.ForEachKmerOf and returned 105 } 106 ticker = f.tubeOffset 107 } 108 }) 109 if err != nil { 110 return err 111 } 112 113 err = f.tubeEnd(query.Len() - 1) 114 if err != nil { 115 return err 116 } 117 118 diagFrom := f.diagIndex(f.target.Len()-1, query.Len()-1) - tubeWidth 119 diagTo := f.diagIndex(0, query.Len()-1) + tubeWidth 120 121 tubeFrom := f.tubeIndex(diagFrom) 122 if tubeFrom < 0 { 123 tubeFrom = 0 124 } 125 126 tubeTo := f.tubeIndex(diagTo) 127 128 for tubeIndex := tubeFrom; tubeIndex <= tubeTo; tubeIndex++ { 129 err = f.tubeFlush(tubeIndex) 130 if err != nil { 131 return err 132 } 133 } 134 135 f.tubes = nil 136 137 return f.morass.Finalise() 138 } 139 140 // A tubeState stores active filter bin states. 141 // tubeState is repeated in the pals package to allow memory calculation without exporting tubeState from filter package. 142 type tubeState struct { 143 QLo int 144 QHi int 145 Count int 146 } 147 148 // Called when q=Qlen - 1 to flush any hits in each tube. 149 func (f *Filter) tubeFlush(tubeIndex int) error { 150 tube := &f.tubes[tubeIndex%cap(f.tubes)] 151 152 if tube.Count < f.minKmersPerHit { 153 return nil 154 } 155 156 err := f.addHit(tubeIndex, tube.QLo, tube.QHi) 157 if err != nil { 158 return err 159 } 160 tube.Count = 0 161 162 return nil 163 } 164 165 func (f *Filter) diagIndex(t, q int) int { 166 return f.target.Len() - t + q 167 } 168 169 func (f *Filter) tubeIndex(d int) int { 170 return d / f.tubeOffset 171 } 172 173 // Found a common k-mer SeqT[t] and SeqQ[q]. 174 func (f *Filter) commonKmer(t, q int) error { 175 if f.selfAlign && ((f.complement && (q < f.target.Len()-t)) || (!f.complement && (q <= t))) { 176 return nil 177 } 178 179 diagIndex := f.diagIndex(t, q) 180 tubeIndex := f.tubeIndex(diagIndex) 181 182 err := f.hitTube(tubeIndex, q) 183 if err != nil { 184 return err 185 } 186 187 // Hit in overlapping tube preceding this one? 188 if diagIndex%f.tubeOffset < f.maxError { 189 if tubeIndex == 0 { 190 tubeIndex = cap(f.tubes) - 1 191 } else { 192 tubeIndex-- 193 } 194 err = f.hitTube(tubeIndex, q) 195 } 196 197 return err 198 } 199 200 func (f *Filter) hitTube(tubeIndex, q int) error { 201 tube := &f.tubes[tubeIndex%cap(f.tubes)] 202 203 if tube.Count == 0 { 204 tube.Count = 1 205 tube.QLo = q 206 tube.QHi = q 207 return nil 208 } 209 210 if q-tube.QHi > f.maxKmerDist { 211 if tube.Count >= f.minKmersPerHit { 212 err := f.addHit(tubeIndex, tube.QLo, tube.QHi) 213 if err != nil { 214 return err 215 } 216 } 217 218 tube.Count = 1 219 tube.QLo = q 220 tube.QHi = q 221 return nil 222 } 223 224 tube.Count++ 225 tube.QHi = q 226 227 return nil 228 } 229 230 // Called when end of a tube is reached 231 // A point in the tube -- the point with maximal q -- is (Tlen-1,q-1). 232 func (f *Filter) tubeEnd(q int) error { 233 diagIndex := f.diagIndex(f.target.Len()-1, q-1) 234 tubeIndex := f.tubeIndex(diagIndex) 235 tube := &f.tubes[tubeIndex%cap(f.tubes)] 236 237 if tube.Count >= f.minKmersPerHit { 238 err := f.addHit(tubeIndex, tube.QLo, tube.QHi) 239 if err != nil { 240 return err 241 } 242 } 243 244 tube.Count = 0 245 246 return nil 247 } 248 249 func (f *Filter) addHit(tubeIndex, QLo, QHi int) error { 250 fh := Hit{ 251 From: QLo, 252 To: QHi + f.k, 253 Diagonal: f.target.Len() - tubeIndex*f.tubeOffset, 254 } 255 256 return f.morass.Push(fh) 257 }