github.com/biogo/biogo@v1.0.4/align/pals/pals.go (about)

     1  // Copyright ©2011-2013 The bíogo Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // Package pals implements functions and methods required for PALS sequence alignment.
     6  package pals
     7  
     8  import (
     9  	"github.com/biogo/biogo/align/pals/dp"
    10  	"github.com/biogo/biogo/align/pals/filter"
    11  	"github.com/biogo/biogo/index/kmerindex"
    12  	"github.com/biogo/biogo/morass"
    13  	"github.com/biogo/biogo/seq/linear"
    14  	"github.com/biogo/biogo/util"
    15  
    16  	"errors"
    17  	"io"
    18  	"unsafe"
    19  )
    20  
    21  // Default thresholds for filter and alignment.
    22  var (
    23  	DefaultLength      = 400
    24  	DefaultMinIdentity = 0.94
    25  	MaxAvgIndexListLen = 15.0
    26  	TubeOffsetDelta    = 32
    27  )
    28  
    29  // Default filter and dynamic programming Cost values.
    30  const (
    31  	MaxIGap    = 5
    32  	DiffCost   = 3
    33  	SameCost   = 1
    34  	MatchCost  = DiffCost + SameCost
    35  	BlockCost  = DiffCost * MaxIGap
    36  	RMatchCost = float64(DiffCost) + 1
    37  )
    38  
    39  // A dp.Costs based on the default cost values.
    40  var defaultCosts = dp.Costs{
    41  	MaxIGap:    MaxIGap,
    42  	DiffCost:   DiffCost,
    43  	SameCost:   SameCost,
    44  	MatchCost:  MatchCost,
    45  	BlockCost:  BlockCost,
    46  	RMatchCost: RMatchCost,
    47  }
    48  
    49  // Default word characteristics.
    50  var (
    51  	MinWordLength = 4  // For minimum word length, choose k=4 arbitrarily.
    52  	MaxKmerLen    = 15 // Currently limited to 15 due to 32 bit int limit for indexing slices
    53  )
    54  
    55  // PALS is a type that can perform pairwise alignments of large sequences based on the papers:
    56  //  PILER: identification and classification of genomic repeats.
    57  //   Robert C. Edgar and Eugene W. Myers. Bioinformatics Suppl. 1:i152-i158 (2005)
    58  //  Efficient q-gram filters for finding all 𝛜-matches over a given length.
    59  //   Kim R. Rasmussen, Jens Stoye, and Eugene W. Myers. J. of Computational Biology 13:296–308 (2006).
    60  type PALS struct {
    61  	target, query *linear.Seq
    62  	selfCompare   bool
    63  	index         *kmerindex.Index
    64  	FilterParams  *filter.Params
    65  	DPParams      *dp.Params
    66  	dp.Costs
    67  
    68  	log        Logger
    69  	timer      *util.Timer
    70  	tubeOffset int
    71  	maxMem     *uintptr
    72  	hitFilter  *filter.Filter
    73  	morass     *morass.Morass
    74  	trapezoids filter.Trapezoids
    75  	err        error
    76  	threads    int
    77  }
    78  
    79  // Return a new PALS aligner. Requires
    80  func New(target, query *linear.Seq, selfComp bool, m *morass.Morass, tubeOffset int, mem *uintptr, log Logger) *PALS {
    81  	return &PALS{
    82  		target:      target,
    83  		query:       query,
    84  		selfCompare: selfComp,
    85  		log:         log,
    86  		tubeOffset:  tubeOffset,
    87  		Costs:       defaultCosts,
    88  		maxMem:      mem,
    89  		morass:      m,
    90  	}
    91  }
    92  
    93  // Optimise the PALS parameters for given memory, kmer length, hit length and sequence identity.
    94  // An error is returned if no satisfactory parameters can be found.
    95  func (p *PALS) Optimise(minHitLen int, minId float64) error {
    96  	if minId < 0 || minId > 1.0 {
    97  		return errors.New("pals: minimum identity out of range")
    98  	}
    99  	if minHitLen <= MinWordLength {
   100  		return errors.New("pals: minimum hit length too short")
   101  	}
   102  
   103  	if p.log != nil {
   104  		p.log.Print("Optimising filter parameters")
   105  	}
   106  
   107  	filterParams := &filter.Params{}
   108  
   109  	// Lower bound on word length k by requiring manageable index.
   110  	// Given kmer occurs once every 4^k positions.
   111  	// Hence average number of index entries is i = N/(4^k) for random
   112  	// string of length N.
   113  	// Require i <= I, then k > log_4(N/i).
   114  	minWordSize := int(util.Log4(float64(p.target.Len())) - util.Log4(MaxAvgIndexListLen) + 0.5)
   115  
   116  	// First choice is that filter criteria are same as DP criteria,
   117  	// but this may not be possible.
   118  	seedLength := minHitLen
   119  	seedDiffs := int(float64(minHitLen) * (1 - minId))
   120  
   121  	// Find filter valid filter parameters, starting from preferred case.
   122  	for {
   123  		minWords := -1
   124  		if MaxKmerLen < minWordSize {
   125  			if p.log != nil {
   126  				p.log.Printf("Word size too small: %d < %d\n", MaxKmerLen, minWordSize)
   127  			}
   128  		}
   129  		for wordSize := MaxKmerLen; wordSize >= minWordSize; wordSize-- {
   130  			filterParams.WordSize = wordSize
   131  			filterParams.MinMatch = seedLength
   132  			filterParams.MaxError = seedDiffs
   133  			if p.tubeOffset > 0 {
   134  				filterParams.TubeOffset = p.tubeOffset
   135  			} else {
   136  				filterParams.TubeOffset = filterParams.MaxError + TubeOffsetDelta
   137  			}
   138  
   139  			mem := p.MemRequired(filterParams)
   140  			if p.maxMem != nil && mem > *p.maxMem {
   141  				if p.log != nil {
   142  					p.log.Printf("Parameters n=%d k=%d e=%d, mem=%d MB > maxmem=%d MB\n",
   143  						filterParams.MinMatch,
   144  						filterParams.WordSize,
   145  						filterParams.MaxError,
   146  						mem/1e6,
   147  						*p.maxMem/1e6)
   148  				}
   149  				minWords = -1
   150  				continue
   151  			}
   152  
   153  			minWords = filter.MinWordsPerFilterHit(seedLength, wordSize, seedDiffs)
   154  			if minWords <= 0 {
   155  				if p.log != nil {
   156  					p.log.Printf("Parameters n=%d k=%d e=%d, B=%d\n",
   157  						filterParams.MinMatch,
   158  						filterParams.WordSize,
   159  						filterParams.MaxError,
   160  						minWords)
   161  				}
   162  				minWords = -1
   163  				continue
   164  			}
   165  
   166  			length := p.AvgIndexListLength(filterParams)
   167  			if length > MaxAvgIndexListLen {
   168  				if p.log != nil {
   169  					p.log.Printf("Parameters n=%d k=%d e=%d, B=%d avgixlen=%.2f > max = %.2f\n",
   170  						filterParams.MinMatch,
   171  						filterParams.WordSize,
   172  						filterParams.MaxError,
   173  						minWords,
   174  						length,
   175  						MaxAvgIndexListLen)
   176  				}
   177  				minWords = -1
   178  				continue
   179  			}
   180  			break
   181  		}
   182  		if minWords > 0 {
   183  			break
   184  		}
   185  
   186  		// Failed to find filter parameters, try
   187  		// fewer errors and shorter seed.
   188  		if seedLength >= minHitLen/4 {
   189  			seedLength /= 2
   190  			continue
   191  		}
   192  		if seedDiffs > 0 {
   193  			seedDiffs--
   194  			continue
   195  		}
   196  
   197  		return errors.New("pals: failed to find filter parameters")
   198  	}
   199  
   200  	p.FilterParams = filterParams
   201  
   202  	p.DPParams = &dp.Params{
   203  		MinHitLength: minHitLen,
   204  		MinId:        minId,
   205  	}
   206  
   207  	return nil
   208  }
   209  
   210  // Return an estimate of the average number of hits for any given kmer.
   211  func (p *PALS) AvgIndexListLength(filterParams *filter.Params) float64 {
   212  	d := int(1) << (uint(filterParams.WordSize) * 2)
   213  	return float64(p.target.Len()) / float64(d)
   214  }
   215  
   216  // Return an estimate of the amount of memory required for the filter.
   217  func (p *PALS) filterMemRequired(filterParams *filter.Params) uintptr {
   218  	words := util.Pow4(filterParams.WordSize)
   219  	tubeWidth := filterParams.TubeOffset + filterParams.MaxError
   220  	maxActiveTubes := (p.target.Len()+tubeWidth-1)/filterParams.TubeOffset + 1
   221  	tubes := uintptr(maxActiveTubes) * unsafe.Sizeof(tubeState{})
   222  	finger := unsafe.Sizeof(uint32(0)) * uintptr(words)
   223  	pos := unsafe.Sizeof(0) * uintptr(p.target.Len())
   224  
   225  	return finger + pos + tubes
   226  }
   227  
   228  // filter.tubeState is repeated here to allow memory calculation without
   229  // exporting tubeState from filter package.
   230  type tubeState struct {
   231  	QLo   int
   232  	QHi   int
   233  	Count int
   234  }
   235  
   236  // Return an estimate of the total amount of memory required.
   237  func (p *PALS) MemRequired(filterParams *filter.Params) uintptr {
   238  	filter := p.filterMemRequired(filterParams)
   239  	sequence := uintptr(p.target.Len()) + unsafe.Sizeof(p.target)
   240  	if p.target != p.query {
   241  		sequence += uintptr(p.query.Len()) + unsafe.Sizeof(p.query)
   242  	}
   243  
   244  	return filter + sequence
   245  }
   246  
   247  // Build the kmerindex for filtering.
   248  func (p *PALS) BuildIndex() error {
   249  	p.notify("Indexing")
   250  	ki, err := kmerindex.New(p.FilterParams.WordSize, p.target)
   251  	if err != nil {
   252  		return err
   253  	} else {
   254  		ki.Build()
   255  		p.notify("Indexed")
   256  	}
   257  	p.index = ki
   258  	p.hitFilter = filter.New(p.index, p.FilterParams)
   259  
   260  	return nil
   261  }
   262  
   263  // Share allows the receiver to use the index and parameters of m.
   264  func (p *PALS) Share(m *PALS) {
   265  	p.index = m.index
   266  	p.FilterParams = m.FilterParams
   267  	p.DPParams = m.DPParams
   268  	p.hitFilter = filter.New(p.index, p.FilterParams)
   269  }
   270  
   271  // Align performs filtering and alignment for one strand of query.
   272  func (p *PALS) Align(complement bool) (dp.Hits, error) {
   273  	if p.err != nil {
   274  		return nil, p.err
   275  	}
   276  	var (
   277  		working *linear.Seq
   278  		err     error
   279  	)
   280  	if complement {
   281  		p.notify("Complementing query")
   282  		working = p.query.Clone().(*linear.Seq)
   283  		working.RevComp()
   284  		p.notify("Complemented query")
   285  	} else {
   286  		working = p.query
   287  	}
   288  
   289  	p.notify("Filtering")
   290  	err = p.hitFilter.Filter(working, p.selfCompare, complement, p.morass)
   291  	if err != nil {
   292  		return nil, err
   293  	}
   294  	p.notifyf("Identified %d filter hits", p.morass.Len())
   295  
   296  	p.notify("Merging")
   297  	merger := filter.NewMerger(p.index, working, p.FilterParams, p.MaxIGap, p.selfCompare)
   298  	var h filter.Hit
   299  	for {
   300  		if err = p.morass.Pull(&h); err != nil {
   301  			break
   302  		}
   303  		merger.MergeFilterHit(&h)
   304  	}
   305  	if err != nil && err != io.EOF {
   306  		return nil, err
   307  	}
   308  	p.err = p.morass.Clear()
   309  	p.trapezoids = merger.FinaliseMerge()
   310  	lt, lq := p.trapezoids.Sum()
   311  	p.notifyf("Merged %d trapezoids covering %d x %d", len(p.trapezoids), lt, lq)
   312  
   313  	p.notify("Aligning")
   314  	aligner := dp.NewAligner(
   315  		p.target, working,
   316  		p.FilterParams.WordSize, p.DPParams.MinHitLength, p.DPParams.MinId,
   317  	)
   318  	aligner.Costs = &p.Costs
   319  	hits := aligner.AlignTraps(p.trapezoids)
   320  	hitCoverageA, hitCoverageB, err := hits.Sum()
   321  	if err != nil {
   322  		return nil, err
   323  	}
   324  	p.notifyf("Aligned %d hits covering %d x %d", len(hits), hitCoverageA, hitCoverageB)
   325  
   326  	return hits, nil
   327  }
   328  
   329  // Trapezoids returns the filter trapezoids identified during a call to Align.
   330  func (p *PALS) Trapezoids() filter.Trapezoids { return p.trapezoids }
   331  
   332  // AlignFrom performs filtering and alignment for one strand of query using the
   333  // provided filter trapezoids as seeds.
   334  func (p *PALS) AlignFrom(traps filter.Trapezoids, complement bool) (dp.Hits, error) {
   335  	if p.err != nil {
   336  		return nil, p.err
   337  	}
   338  	var (
   339  		working *linear.Seq
   340  		err     error
   341  	)
   342  	if complement {
   343  		p.notify("Complementing query")
   344  		working = p.query.Clone().(*linear.Seq)
   345  		working.RevComp()
   346  		p.notify("Complemented query")
   347  	} else {
   348  		working = p.query
   349  	}
   350  
   351  	p.notify("Aligning")
   352  	aligner := dp.NewAligner(
   353  		p.target, working,
   354  		p.FilterParams.WordSize, p.DPParams.MinHitLength, p.DPParams.MinId,
   355  	)
   356  	aligner.Costs = &p.Costs
   357  	hits := aligner.AlignTraps(traps)
   358  	hitCoverageA, hitCoverageB, err := hits.Sum()
   359  	if err != nil {
   360  		return nil, err
   361  	}
   362  	p.notifyf("Aligned %d hits covering %d x %d", len(hits), hitCoverageA, hitCoverageB)
   363  
   364  	return hits, nil
   365  }
   366  
   367  // Remove file system components of filter. This should be called after
   368  // the last use of the aligner.
   369  func (p *PALS) CleanUp() error { return p.morass.CleanUp() }
   370  
   371  // Interface for logger used by PALS.
   372  type Logger interface {
   373  	Print(v ...interface{})
   374  	Printf(format string, v ...interface{})
   375  }
   376  
   377  func (p *PALS) notify(n string) {
   378  	if p.log != nil {
   379  		p.log.Print(n)
   380  	}
   381  }
   382  
   383  func (p *PALS) notifyf(f string, n ...interface{}) {
   384  	if p.log != nil {
   385  		p.log.Printf(f, n...)
   386  	}
   387  }