github.com/biogo/biogo@v1.0.4/align/pals/pals.go (about) 1 // Copyright ©2011-2013 The bíogo Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 // Package pals implements functions and methods required for PALS sequence alignment. 6 package pals 7 8 import ( 9 "github.com/biogo/biogo/align/pals/dp" 10 "github.com/biogo/biogo/align/pals/filter" 11 "github.com/biogo/biogo/index/kmerindex" 12 "github.com/biogo/biogo/morass" 13 "github.com/biogo/biogo/seq/linear" 14 "github.com/biogo/biogo/util" 15 16 "errors" 17 "io" 18 "unsafe" 19 ) 20 21 // Default thresholds for filter and alignment. 22 var ( 23 DefaultLength = 400 24 DefaultMinIdentity = 0.94 25 MaxAvgIndexListLen = 15.0 26 TubeOffsetDelta = 32 27 ) 28 29 // Default filter and dynamic programming Cost values. 30 const ( 31 MaxIGap = 5 32 DiffCost = 3 33 SameCost = 1 34 MatchCost = DiffCost + SameCost 35 BlockCost = DiffCost * MaxIGap 36 RMatchCost = float64(DiffCost) + 1 37 ) 38 39 // A dp.Costs based on the default cost values. 40 var defaultCosts = dp.Costs{ 41 MaxIGap: MaxIGap, 42 DiffCost: DiffCost, 43 SameCost: SameCost, 44 MatchCost: MatchCost, 45 BlockCost: BlockCost, 46 RMatchCost: RMatchCost, 47 } 48 49 // Default word characteristics. 50 var ( 51 MinWordLength = 4 // For minimum word length, choose k=4 arbitrarily. 52 MaxKmerLen = 15 // Currently limited to 15 due to 32 bit int limit for indexing slices 53 ) 54 55 // PALS is a type that can perform pairwise alignments of large sequences based on the papers: 56 // PILER: identification and classification of genomic repeats. 57 // Robert C. Edgar and Eugene W. Myers. Bioinformatics Suppl. 1:i152-i158 (2005) 58 // Efficient q-gram filters for finding all 𝛜-matches over a given length. 59 // Kim R. Rasmussen, Jens Stoye, and Eugene W. Myers. J. of Computational Biology 13:296–308 (2006). 60 type PALS struct { 61 target, query *linear.Seq 62 selfCompare bool 63 index *kmerindex.Index 64 FilterParams *filter.Params 65 DPParams *dp.Params 66 dp.Costs 67 68 log Logger 69 timer *util.Timer 70 tubeOffset int 71 maxMem *uintptr 72 hitFilter *filter.Filter 73 morass *morass.Morass 74 trapezoids filter.Trapezoids 75 err error 76 threads int 77 } 78 79 // Return a new PALS aligner. Requires 80 func New(target, query *linear.Seq, selfComp bool, m *morass.Morass, tubeOffset int, mem *uintptr, log Logger) *PALS { 81 return &PALS{ 82 target: target, 83 query: query, 84 selfCompare: selfComp, 85 log: log, 86 tubeOffset: tubeOffset, 87 Costs: defaultCosts, 88 maxMem: mem, 89 morass: m, 90 } 91 } 92 93 // Optimise the PALS parameters for given memory, kmer length, hit length and sequence identity. 94 // An error is returned if no satisfactory parameters can be found. 95 func (p *PALS) Optimise(minHitLen int, minId float64) error { 96 if minId < 0 || minId > 1.0 { 97 return errors.New("pals: minimum identity out of range") 98 } 99 if minHitLen <= MinWordLength { 100 return errors.New("pals: minimum hit length too short") 101 } 102 103 if p.log != nil { 104 p.log.Print("Optimising filter parameters") 105 } 106 107 filterParams := &filter.Params{} 108 109 // Lower bound on word length k by requiring manageable index. 110 // Given kmer occurs once every 4^k positions. 111 // Hence average number of index entries is i = N/(4^k) for random 112 // string of length N. 113 // Require i <= I, then k > log_4(N/i). 114 minWordSize := int(util.Log4(float64(p.target.Len())) - util.Log4(MaxAvgIndexListLen) + 0.5) 115 116 // First choice is that filter criteria are same as DP criteria, 117 // but this may not be possible. 118 seedLength := minHitLen 119 seedDiffs := int(float64(minHitLen) * (1 - minId)) 120 121 // Find filter valid filter parameters, starting from preferred case. 122 for { 123 minWords := -1 124 if MaxKmerLen < minWordSize { 125 if p.log != nil { 126 p.log.Printf("Word size too small: %d < %d\n", MaxKmerLen, minWordSize) 127 } 128 } 129 for wordSize := MaxKmerLen; wordSize >= minWordSize; wordSize-- { 130 filterParams.WordSize = wordSize 131 filterParams.MinMatch = seedLength 132 filterParams.MaxError = seedDiffs 133 if p.tubeOffset > 0 { 134 filterParams.TubeOffset = p.tubeOffset 135 } else { 136 filterParams.TubeOffset = filterParams.MaxError + TubeOffsetDelta 137 } 138 139 mem := p.MemRequired(filterParams) 140 if p.maxMem != nil && mem > *p.maxMem { 141 if p.log != nil { 142 p.log.Printf("Parameters n=%d k=%d e=%d, mem=%d MB > maxmem=%d MB\n", 143 filterParams.MinMatch, 144 filterParams.WordSize, 145 filterParams.MaxError, 146 mem/1e6, 147 *p.maxMem/1e6) 148 } 149 minWords = -1 150 continue 151 } 152 153 minWords = filter.MinWordsPerFilterHit(seedLength, wordSize, seedDiffs) 154 if minWords <= 0 { 155 if p.log != nil { 156 p.log.Printf("Parameters n=%d k=%d e=%d, B=%d\n", 157 filterParams.MinMatch, 158 filterParams.WordSize, 159 filterParams.MaxError, 160 minWords) 161 } 162 minWords = -1 163 continue 164 } 165 166 length := p.AvgIndexListLength(filterParams) 167 if length > MaxAvgIndexListLen { 168 if p.log != nil { 169 p.log.Printf("Parameters n=%d k=%d e=%d, B=%d avgixlen=%.2f > max = %.2f\n", 170 filterParams.MinMatch, 171 filterParams.WordSize, 172 filterParams.MaxError, 173 minWords, 174 length, 175 MaxAvgIndexListLen) 176 } 177 minWords = -1 178 continue 179 } 180 break 181 } 182 if minWords > 0 { 183 break 184 } 185 186 // Failed to find filter parameters, try 187 // fewer errors and shorter seed. 188 if seedLength >= minHitLen/4 { 189 seedLength /= 2 190 continue 191 } 192 if seedDiffs > 0 { 193 seedDiffs-- 194 continue 195 } 196 197 return errors.New("pals: failed to find filter parameters") 198 } 199 200 p.FilterParams = filterParams 201 202 p.DPParams = &dp.Params{ 203 MinHitLength: minHitLen, 204 MinId: minId, 205 } 206 207 return nil 208 } 209 210 // Return an estimate of the average number of hits for any given kmer. 211 func (p *PALS) AvgIndexListLength(filterParams *filter.Params) float64 { 212 d := int(1) << (uint(filterParams.WordSize) * 2) 213 return float64(p.target.Len()) / float64(d) 214 } 215 216 // Return an estimate of the amount of memory required for the filter. 217 func (p *PALS) filterMemRequired(filterParams *filter.Params) uintptr { 218 words := util.Pow4(filterParams.WordSize) 219 tubeWidth := filterParams.TubeOffset + filterParams.MaxError 220 maxActiveTubes := (p.target.Len()+tubeWidth-1)/filterParams.TubeOffset + 1 221 tubes := uintptr(maxActiveTubes) * unsafe.Sizeof(tubeState{}) 222 finger := unsafe.Sizeof(uint32(0)) * uintptr(words) 223 pos := unsafe.Sizeof(0) * uintptr(p.target.Len()) 224 225 return finger + pos + tubes 226 } 227 228 // filter.tubeState is repeated here to allow memory calculation without 229 // exporting tubeState from filter package. 230 type tubeState struct { 231 QLo int 232 QHi int 233 Count int 234 } 235 236 // Return an estimate of the total amount of memory required. 237 func (p *PALS) MemRequired(filterParams *filter.Params) uintptr { 238 filter := p.filterMemRequired(filterParams) 239 sequence := uintptr(p.target.Len()) + unsafe.Sizeof(p.target) 240 if p.target != p.query { 241 sequence += uintptr(p.query.Len()) + unsafe.Sizeof(p.query) 242 } 243 244 return filter + sequence 245 } 246 247 // Build the kmerindex for filtering. 248 func (p *PALS) BuildIndex() error { 249 p.notify("Indexing") 250 ki, err := kmerindex.New(p.FilterParams.WordSize, p.target) 251 if err != nil { 252 return err 253 } else { 254 ki.Build() 255 p.notify("Indexed") 256 } 257 p.index = ki 258 p.hitFilter = filter.New(p.index, p.FilterParams) 259 260 return nil 261 } 262 263 // Share allows the receiver to use the index and parameters of m. 264 func (p *PALS) Share(m *PALS) { 265 p.index = m.index 266 p.FilterParams = m.FilterParams 267 p.DPParams = m.DPParams 268 p.hitFilter = filter.New(p.index, p.FilterParams) 269 } 270 271 // Align performs filtering and alignment for one strand of query. 272 func (p *PALS) Align(complement bool) (dp.Hits, error) { 273 if p.err != nil { 274 return nil, p.err 275 } 276 var ( 277 working *linear.Seq 278 err error 279 ) 280 if complement { 281 p.notify("Complementing query") 282 working = p.query.Clone().(*linear.Seq) 283 working.RevComp() 284 p.notify("Complemented query") 285 } else { 286 working = p.query 287 } 288 289 p.notify("Filtering") 290 err = p.hitFilter.Filter(working, p.selfCompare, complement, p.morass) 291 if err != nil { 292 return nil, err 293 } 294 p.notifyf("Identified %d filter hits", p.morass.Len()) 295 296 p.notify("Merging") 297 merger := filter.NewMerger(p.index, working, p.FilterParams, p.MaxIGap, p.selfCompare) 298 var h filter.Hit 299 for { 300 if err = p.morass.Pull(&h); err != nil { 301 break 302 } 303 merger.MergeFilterHit(&h) 304 } 305 if err != nil && err != io.EOF { 306 return nil, err 307 } 308 p.err = p.morass.Clear() 309 p.trapezoids = merger.FinaliseMerge() 310 lt, lq := p.trapezoids.Sum() 311 p.notifyf("Merged %d trapezoids covering %d x %d", len(p.trapezoids), lt, lq) 312 313 p.notify("Aligning") 314 aligner := dp.NewAligner( 315 p.target, working, 316 p.FilterParams.WordSize, p.DPParams.MinHitLength, p.DPParams.MinId, 317 ) 318 aligner.Costs = &p.Costs 319 hits := aligner.AlignTraps(p.trapezoids) 320 hitCoverageA, hitCoverageB, err := hits.Sum() 321 if err != nil { 322 return nil, err 323 } 324 p.notifyf("Aligned %d hits covering %d x %d", len(hits), hitCoverageA, hitCoverageB) 325 326 return hits, nil 327 } 328 329 // Trapezoids returns the filter trapezoids identified during a call to Align. 330 func (p *PALS) Trapezoids() filter.Trapezoids { return p.trapezoids } 331 332 // AlignFrom performs filtering and alignment for one strand of query using the 333 // provided filter trapezoids as seeds. 334 func (p *PALS) AlignFrom(traps filter.Trapezoids, complement bool) (dp.Hits, error) { 335 if p.err != nil { 336 return nil, p.err 337 } 338 var ( 339 working *linear.Seq 340 err error 341 ) 342 if complement { 343 p.notify("Complementing query") 344 working = p.query.Clone().(*linear.Seq) 345 working.RevComp() 346 p.notify("Complemented query") 347 } else { 348 working = p.query 349 } 350 351 p.notify("Aligning") 352 aligner := dp.NewAligner( 353 p.target, working, 354 p.FilterParams.WordSize, p.DPParams.MinHitLength, p.DPParams.MinId, 355 ) 356 aligner.Costs = &p.Costs 357 hits := aligner.AlignTraps(traps) 358 hitCoverageA, hitCoverageB, err := hits.Sum() 359 if err != nil { 360 return nil, err 361 } 362 p.notifyf("Aligned %d hits covering %d x %d", len(hits), hitCoverageA, hitCoverageB) 363 364 return hits, nil 365 } 366 367 // Remove file system components of filter. This should be called after 368 // the last use of the aligner. 369 func (p *PALS) CleanUp() error { return p.morass.CleanUp() } 370 371 // Interface for logger used by PALS. 372 type Logger interface { 373 Print(v ...interface{}) 374 Printf(format string, v ...interface{}) 375 } 376 377 func (p *PALS) notify(n string) { 378 if p.log != nil { 379 p.log.Print(n) 380 } 381 } 382 383 func (p *PALS) notifyf(f string, n ...interface{}) { 384 if p.log != nil { 385 p.log.Printf(f, n...) 386 } 387 }