github.com/biogo/biogo@v1.0.4/index/kmerindex/kmerindex.go (about)

     1  // Copyright ©2011-2013 The bíogo Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // Package kmerindex performs Kmer indexing package based on Bob Edgar and
     6  // Gene Meyers' approach used in PALS.
     7  package kmerindex
     8  
     9  import (
    10  	"errors"
    11  	"fmt"
    12  	"math"
    13  	"unsafe"
    14  
    15  	"github.com/biogo/biogo/alphabet"
    16  	"github.com/biogo/biogo/seq/linear"
    17  	"github.com/biogo/biogo/util"
    18  )
    19  
    20  var (
    21  	ErrKTooLarge      = errors.New("kmerindex: k too large")
    22  	ErrKTooSmall      = errors.New("kmerindex: k too small")
    23  	ErrShortSeq       = errors.New("kmerindex: sequence to short for k")
    24  	ErrBadAlphabet    = errors.New("kmerindex: alphabet size != 4")
    25  	ErrBadKmer        = errors.New("kmerindex: kmer out of range")
    26  	ErrBadKmerTextLen = errors.New("kmerindex: kmertext length != k")
    27  	ErrBadKmerText    = errors.New("kmerindex: kmertext contains illegal character")
    28  )
    29  
    30  // Constraints on Kmer length.
    31  var (
    32  	MinKmerLen = 4 // default minimum
    33  
    34  	// MaxKmerLen is the maximum Kmer length that can be used.
    35  	// It is 16 on 64 bit architectures and 14 on 32 bit architectures.
    36  	MaxKmerLen = 16 - offset
    37  )
    38  
    39  const offset = int(unsafe.Sizeof(int(0))%0x4) / 2
    40  
    41  var Debug = false // Set Debug to true to prevent recovering from panics in ForEachKmer f Eval function.
    42  
    43  // 2-bit per base packed word
    44  type Kmer uint32 // Sensible size for word type uint64 will double the size of the index (already large for high k)
    45  
    46  // Kmer index type
    47  type Index struct {
    48  	finger  []Kmer
    49  	pos     []int
    50  	seq     *linear.Seq
    51  	lookUp  alphabet.Index
    52  	k       int
    53  	kMask   Kmer
    54  	indexed bool
    55  }
    56  
    57  // Create a new Kmer Index with a word size k based on sequence
    58  func New(k int, s *linear.Seq) (*Index, error) {
    59  	switch {
    60  	case k > MaxKmerLen:
    61  		return nil, ErrKTooLarge
    62  	case k < MinKmerLen:
    63  		return nil, ErrKTooSmall
    64  	case k+1 > s.Len():
    65  		return nil, ErrShortSeq
    66  	case s.Alpha.Len() != 4:
    67  		return nil, ErrBadAlphabet
    68  	}
    69  
    70  	ki := &Index{
    71  		finger:  make([]Kmer, util.Pow4(k)+1), // Need a Tn+1 finger position so that Tn can be recognised
    72  		k:       k,
    73  		kMask:   Kmer(util.Pow4(k) - 1),
    74  		seq:     s,
    75  		lookUp:  s.Alpha.LetterIndex(),
    76  		indexed: false,
    77  	}
    78  	ki.buildKmerTable()
    79  
    80  	return ki, nil
    81  }
    82  
    83  // Build the table of Kmer frequencies - called by New
    84  func (ki *Index) buildKmerTable() {
    85  	incrementFinger := func(index *Index, _, kmer int) {
    86  		index.finger[kmer]++
    87  	}
    88  	ki.ForEachKmerOf(ki.seq, 0, ki.seq.Len(), incrementFinger)
    89  }
    90  
    91  // Build the Kmer position table destructively replacing Kmer frequencies
    92  func (ki *Index) Build() {
    93  	var sum Kmer
    94  	for i, v := range ki.finger {
    95  		ki.finger[i], sum = sum, sum+v
    96  	}
    97  
    98  	locatePositions := func(index *Index, position, kmer int) {
    99  		index.pos[index.finger[kmer]] = position
   100  		index.finger[kmer]++
   101  	}
   102  	ki.pos = make([]int, ki.seq.Len()-ki.k+1)
   103  	ki.ForEachKmerOf(ki.seq, 0, ki.seq.Len(), locatePositions)
   104  
   105  	ki.indexed = true
   106  }
   107  
   108  // Return an array of positions for the Kmer string kmertext
   109  func (ki *Index) KmerPositionsString(kmertext string) (positions []int, err error) {
   110  	switch {
   111  	case len(kmertext) != ki.k:
   112  		return nil, ErrBadKmerTextLen
   113  	case !ki.indexed:
   114  		return nil, errors.New("kmerindex: index not built: call Build()")
   115  	}
   116  
   117  	var kmer Kmer
   118  	if kmer, err = ki.KmerOf(kmertext); err != nil {
   119  		return nil, err
   120  	}
   121  
   122  	return ki.KmerPositions(kmer)
   123  }
   124  
   125  // Return an array of positions for the Kmer kmer
   126  func (ki *Index) KmerPositions(kmer Kmer) (positions []int, err error) {
   127  	if kmer > ki.kMask {
   128  		return nil, ErrBadKmer
   129  	}
   130  
   131  	i := Kmer(0)
   132  	if kmer > 0 { // special case: An has no predecessor
   133  		i = ki.finger[kmer-1]
   134  	}
   135  	j := ki.finger[kmer]
   136  	if i == j {
   137  		return
   138  	}
   139  
   140  	positions = make([]int, j-i)
   141  	for l, p := range ki.pos[i:j] {
   142  		positions[l] = int(p)
   143  	}
   144  
   145  	return
   146  }
   147  
   148  // Return a map containing absolute Kmer frequencies and true if called before Build().
   149  // If called after Build returns a nil map and false.
   150  func (ki *Index) KmerFrequencies() (map[Kmer]int, bool) {
   151  	if ki.indexed {
   152  		return nil, false
   153  	}
   154  
   155  	m := map[Kmer]int{}
   156  
   157  	for i, f := range ki.finger {
   158  		if f > 0 {
   159  			m[Kmer(i)] = int(f) // not always safe - perhaps check that Kmer <= MaxInt
   160  		}
   161  	}
   162  
   163  	return m, true
   164  }
   165  
   166  // Return a map containing relative Kmer frequencies and true if called before Build().
   167  // If called after Build returns a nil map and false.
   168  func (ki *Index) NormalisedKmerFrequencies() (map[Kmer]float64, bool) {
   169  	if ki.indexed {
   170  		return nil, false
   171  	}
   172  
   173  	m := map[Kmer]float64{}
   174  
   175  	l := float64(ki.seq.Len())
   176  	for i, f := range ki.finger {
   177  		if f > 0 {
   178  			m[Kmer(i)] = float64(f) / l
   179  		}
   180  	}
   181  
   182  	return m, true
   183  }
   184  
   185  // Returns a Kmer-keyed map containing slices of kmer positions and true if called after Build,
   186  // otherwise nil and false.
   187  func (ki *Index) KmerIndex() (map[Kmer][]int, bool) {
   188  	if !ki.indexed {
   189  		return nil, false
   190  	}
   191  
   192  	m := make(map[Kmer][]int)
   193  
   194  	for i := range ki.finger {
   195  		if p, _ := ki.KmerPositions(Kmer(i)); len(p) > 0 {
   196  			m[Kmer(i)] = p
   197  		}
   198  	}
   199  
   200  	return m, true
   201  }
   202  
   203  // Returns a string-keyed map containing slices of kmer positions and true if called after Build,
   204  // otherwise nil and false.
   205  func (ki *Index) StringKmerIndex() (map[string][]int, bool) {
   206  	if !ki.indexed {
   207  		return nil, false
   208  	}
   209  
   210  	m := make(map[string][]int)
   211  
   212  	for i := range ki.finger {
   213  		if p, _ := ki.KmerPositions(Kmer(i)); len(p) > 0 {
   214  			m[ki.Format(Kmer(i))] = p
   215  		}
   216  	}
   217  
   218  	return m, true
   219  }
   220  
   221  // errors should be handled through a panic which will be recovered by ForEachKmerOf
   222  type Eval func(index *Index, j, kmer int)
   223  
   224  // Applies the f Eval func to all kmers in s from start to end. Returns any panic raised by f as an error.
   225  func (ki *Index) ForEachKmerOf(s *linear.Seq, start, end int, f Eval) (err error) {
   226  	if !Debug {
   227  		defer func() {
   228  			if r := recover(); r != nil {
   229  				var ok bool
   230  				err, ok = r.(error)
   231  				if !ok {
   232  					err = fmt.Errorf("kmerindex: %v", r)
   233  				}
   234  			}
   235  		}()
   236  	}
   237  
   238  	kmer := Kmer(0)
   239  	high := 0
   240  	var currentBase int
   241  
   242  	// Preload the first k-1 bases of the first well defined k-mer or set high to the next position
   243  	basePosition := start
   244  	for ; basePosition < start+ki.k-1; basePosition++ {
   245  		currentBase = ki.lookUp[s.Seq[basePosition]]
   246  		if currentBase >= 0 {
   247  			kmer = (kmer << 2) | Kmer(currentBase)
   248  		} else {
   249  			kmer = 0
   250  			high = basePosition + 1
   251  		}
   252  	}
   253  
   254  	// Call f(position, kmer) for each of the next well defined k-mers
   255  	for position := basePosition - ki.k + 1; basePosition < end; position++ {
   256  		currentBase = ki.lookUp[s.Seq[basePosition]]
   257  		basePosition++
   258  		if currentBase >= 0 {
   259  			kmer = ((kmer << 2) | Kmer(currentBase)) & ki.kMask
   260  		} else {
   261  			kmer = 0
   262  			high = basePosition
   263  		}
   264  		if position >= high {
   265  			f(ki, position, int(kmer))
   266  		}
   267  	}
   268  
   269  	return
   270  }
   271  
   272  // Return the Kmer length of the Index.
   273  func (ki *Index) K() int {
   274  	return ki.k
   275  }
   276  
   277  // Returns a pointer to the indexed seq.Seq.
   278  func (ki *Index) Seq() *linear.Seq {
   279  	return ki.seq
   280  }
   281  
   282  // Returns the value of the finger slice at p. This signifies the absolute kmer frequency of the Kmer(p)
   283  // if called before Build() and points to the relevant position lookup if called after.
   284  func (ki *Index) FingerAt(p int) int {
   285  	return int(ki.finger[p])
   286  }
   287  
   288  // Returns the value of the pos slice at p. This signifies the position of the pth kmer if called after Build().
   289  // Not valid before Build() - will panic.
   290  func (ki *Index) PosAt(p int) int {
   291  	return ki.pos[p]
   292  }
   293  
   294  // Convert a Kmer into a string of bases
   295  func (ki *Index) Format(kmer Kmer) string {
   296  	s, _ := Format(kmer, ki.k, ki.seq.Alpha)
   297  	return s
   298  }
   299  
   300  // Convert a string of bases into a len k Kmer, returns an error if string length does not match k.
   301  // lookUp is an index lookup table as returned by alphabet.Alphabet.LetterIndex().
   302  func KmerOf(k int, lookUp alphabet.Index, kmertext string) (kmer Kmer, err error) {
   303  	if len(kmertext) != k {
   304  		return 0, ErrBadKmerTextLen
   305  	}
   306  
   307  	for _, v := range kmertext {
   308  		x := lookUp[v]
   309  		if x < 0 {
   310  			return 0, ErrBadKmerText
   311  		}
   312  		kmer = (kmer << 2) | Kmer(x)
   313  	}
   314  
   315  	return
   316  }
   317  
   318  // Return the GC fraction of a Kmer
   319  func (ki *Index) GCof(kmer Kmer) float64 {
   320  	return GCof(ki.k, kmer)
   321  }
   322  
   323  // Return the GC fraction of a Kmer of len k
   324  func GCof(k int, kmer Kmer) float64 {
   325  	gc := 0
   326  	for i := k - 1; i >= 0; i, kmer = i-1, kmer>>2 {
   327  		gc += int((kmer & 1) ^ ((kmer & 2) >> 1))
   328  	}
   329  
   330  	return float64(gc) / float64(k)
   331  }
   332  
   333  // Convert a Kmer into a string of bases
   334  func Format(kmer Kmer, k int, alpha alphabet.Alphabet) (string, error) {
   335  	if alpha.Len() != 4 {
   336  		return "", ErrBadAlphabet
   337  	}
   338  	kmertext := make([]byte, k)
   339  
   340  	for i := k - 1; i >= 0; i, kmer = i-1, kmer>>2 {
   341  		kmertext[i] = byte(alpha.Letter(int(kmer & 3)))
   342  	}
   343  
   344  	return string(kmertext), nil
   345  }
   346  
   347  // Reverse complement a Kmer. Complementation is performed according to letter index:
   348  //
   349  //  0, 1, 2, 3 = 3, 2, 1, 0
   350  func (ki *Index) ComplementOf(kmer Kmer) (c Kmer) {
   351  	return ComplementOf(ki.k, kmer)
   352  }
   353  
   354  // Reverse complement a Kmer of len k. Complementation is performed according to letter index:
   355  //
   356  //  0, 1, 2, 3 = 3, 2, 1, 0
   357  func ComplementOf(k int, kmer Kmer) (c Kmer) {
   358  	for i, j := uint(0), uint(k-1)*2; i <= j; i, j = i+2, j-2 {
   359  		c |= (^(kmer >> (j - i)) & (3 << i)) | (^(kmer>>i)&3)<<j
   360  	}
   361  
   362  	return
   363  }
   364  
   365  // Convert a string of bases into a Kmer, returns an error if string length does not match word length
   366  func (ki *Index) KmerOf(kmertext string) (kmer Kmer, err error) {
   367  	if len(kmertext) != ki.k {
   368  		return 0, ErrBadKmerTextLen
   369  	}
   370  
   371  	for _, v := range kmertext {
   372  		x := ki.lookUp[v]
   373  		if x < 0 {
   374  			return 0, ErrBadKmerText
   375  		}
   376  		kmer = (kmer << 2) | Kmer(x)
   377  	}
   378  
   379  	return
   380  }
   381  
   382  // Return the Euclidean distance between two sequences measured by abolsolute kmer frequencies.
   383  func Distance(a, b map[Kmer]float64) (dist float64) {
   384  	c := make(map[Kmer]struct{}, len(a)+len(b))
   385  	for k := range a {
   386  		c[k] = struct{}{}
   387  	}
   388  	for k := range b {
   389  		c[k] = struct{}{}
   390  	}
   391  	for k := range c {
   392  		dist += math.Pow(a[k]-b[k], 2)
   393  	}
   394  
   395  	return math.Sqrt(dist)
   396  }
   397  
   398  // Confirm that a Build() is correct. Returns boolean indicating this and the number of kmers indexed.
   399  func (ki *Index) Check() (ok bool, found int) {
   400  	ok = true
   401  	f := func(index *Index, position, kmer int) {
   402  		hit := false
   403  		var base Kmer
   404  		if kmer == 0 {
   405  			base = 0
   406  		} else {
   407  			base = index.finger[kmer-1]
   408  		}
   409  		for j := base; j < index.finger[kmer]; j++ {
   410  			if index.pos[j] == position {
   411  				found++
   412  				hit = true
   413  				break
   414  			}
   415  		}
   416  		if !hit {
   417  			ok = false
   418  		}
   419  	}
   420  
   421  	if err := ki.ForEachKmerOf(ki.seq, 0, ki.seq.Len(), f); err != nil {
   422  		ok = false
   423  	}
   424  
   425  	return
   426  }
   427  
   428  // Return a copy of the internal finger slice.
   429  func (ki *Index) Finger() (f []Kmer) {
   430  	f = make([]Kmer, len(ki.finger))
   431  	copy(f, ki.finger)
   432  	return
   433  }
   434  
   435  // Return a copy of the internal pos slice.
   436  func (ki *Index) Pos() (p []int) {
   437  	p = make([]int, len(ki.pos))
   438  	copy(p, ki.pos)
   439  	return
   440  }