github.com/biogo/biogo@v1.0.4/pwm/pwm.go (about)

     1  // Copyright ©2011-2013 The bíogo Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // Package pwm implements a position weight matrix search based on an
     6  // algorithm by Deborah Toledo Flores.
     7  package pwm
     8  
     9  import (
    10  	"github.com/biogo/biogo/feat"
    11  	"github.com/biogo/biogo/seq"
    12  	"github.com/biogo/biogo/seq/sequtils"
    13  
    14  	"fmt"
    15  	"math"
    16  	"sort"
    17  )
    18  
    19  type probs struct {
    20  	score      float64
    21  	freqs      []int
    22  	occurrence int
    23  }
    24  
    25  type probTable []*probs
    26  
    27  func (m probTable) Len() int           { return len(m) }
    28  func (m probTable) Less(i, j int) bool { return m[i].score > m[j].score }
    29  func (m probTable) Swap(i, j int)      { m[i], m[j] = m[j], m[i] }
    30  
    31  type Feature struct {
    32  	MotifLocation feat.Feature
    33  	MotifStart    int
    34  	MotifEnd      int
    35  	MotifScore    float64
    36  	MotifProb     float64
    37  	MotifSeq      seq.Sequence
    38  	MotifOrient   feat.Orientation
    39  	Moltype       feat.Moltype
    40  }
    41  
    42  func (f *Feature) Start() int { return f.MotifStart }
    43  func (f *Feature) End() int   { return f.MotifEnd }
    44  func (f *Feature) Len() int   { return f.MotifEnd - f.MotifStart }
    45  func (f *Feature) Name() string {
    46  	return fmt.Sprintf("%s:[%d,%d)", f.Location().Name(), f.MotifStart, f.MotifEnd)
    47  }
    48  func (f *Feature) Description() string           { return "PWM motif" }
    49  func (f *Feature) Location() feat.Feature        { return f.MotifLocation }
    50  func (f *Feature) MolType() feat.Moltype         { return f.Moltype }
    51  func (f *Feature) Orientation() feat.Orientation { return f.MotifOrient }
    52  
    53  type PWM struct {
    54  	matrix    [][]float64
    55  	lookAhead []float64
    56  	table     probTable
    57  	minScore  float64
    58  	Format    string // Format for probability values in attributes.
    59  }
    60  
    61  func New(matrix [][]float64) (m *PWM) {
    62  	m = &PWM{
    63  		matrix:    matrix,
    64  		lookAhead: make([]float64, len(matrix)),
    65  		minScore:  math.MaxFloat64,
    66  		Format:    "%e",
    67  	}
    68  
    69  	var maxVal, maxScore float64
    70  
    71  	for i := len(matrix) - 1; i >= 0; i-- {
    72  		maxVal = 0
    73  		for _, v := range matrix[i] {
    74  			if v > maxVal {
    75  				maxVal = v
    76  			}
    77  		}
    78  		maxScore += maxVal
    79  		m.lookAhead[i] = maxScore
    80  	}
    81  
    82  	for i := range matrix {
    83  		for j := range matrix[i] {
    84  			matrix[i][j] /= maxScore
    85  		}
    86  		m.lookAhead[i] /= maxScore
    87  	}
    88  
    89  	return
    90  }
    91  
    92  func (m *PWM) genTable(minScore, score float64, pos int, motif []byte) {
    93  	for i, s := range m.matrix[pos] {
    94  		motif[pos] = byte(i)
    95  		if pos < len(m.matrix)-1 {
    96  			if minScore-(score+s) > m.lookAhead[pos+1] { // will not be able to achieve minScore
    97  				continue
    98  			}
    99  			m.genTable(minScore, score+s, pos+1, motif)
   100  		} else {
   101  			if score+s < minScore { // will not be able to achieve minScore
   102  				continue
   103  			}
   104  			// count frequencies of states in current motif
   105  			freqs := make([]int, 4)
   106  			for _, j := range motif {
   107  				freqs[j]++
   108  			}
   109  			found := false
   110  			for j := len(m.table) - 1; j >= 0; j-- {
   111  				table := m.table[j]
   112  				if table.score != score+s {
   113  					continue // if using insertion sort, if table.score > score+s then we can found = false and break
   114  				}
   115  				match := true
   116  				for k := range freqs {
   117  					if freqs[k] != table.freqs[k] {
   118  						match = false
   119  						break
   120  					}
   121  				}
   122  				if match {
   123  					table.occurrence++
   124  					found = true
   125  					break
   126  				}
   127  			}
   128  
   129  			if !found {
   130  				m.table = append(m.table, &probs{score: score + s, freqs: freqs, occurrence: 1})
   131  			}
   132  		}
   133  	}
   134  
   135  }
   136  
   137  type Sequence interface {
   138  	seq.Sequence
   139  	Moltype() feat.Moltype
   140  	Orientation() feat.Orientation
   141  }
   142  
   143  func (m *PWM) Search(s Sequence, start, end int, minScore float64) []feat.Feature {
   144  	if minScore < m.minScore {
   145  		m.table = make(probTable, 0)
   146  		m.genTable(minScore, 0, 0, make([]byte, len(m.matrix)))
   147  		sort.Sort(m.table)
   148  	}
   149  
   150  	var (
   151  		index  = s.Alphabet().LetterIndex()
   152  		length = len(m.matrix)
   153  
   154  		freqs = make([]float64, 4)
   155  		zeros = make([]float64, 4)
   156  
   157  		diff = 1 / float64(length)
   158  
   159  		f []feat.Feature
   160  	)
   161  
   162  LOOP:
   163  	for pos := start; pos+length < end; pos++ {
   164  		// Determine the score for this position.
   165  		var score = 0.
   166  		for i := 0; i < length; i++ {
   167  			base := index[s.At(pos+i).L]
   168  			if base < 0 || minScore-score > m.lookAhead[i] { // not valid base or will not be able to achieve minScore
   169  				continue LOOP
   170  			} else {
   171  				score += m.matrix[i][base]
   172  			}
   173  		}
   174  
   175  		if score < minScore {
   176  			continue
   177  		}
   178  
   179  		// Calculate base frequencies for window.
   180  		copy(freqs, zeros)
   181  		for i := pos; i < pos+length; i++ {
   182  			base := index[s.At(i).L]
   183  			if base >= 0 {
   184  				freqs[base] += diff
   185  			} else { // Probability for this pos will be meaningless; if N is tolerated, include N in valid alphabet - make special case?
   186  				continue LOOP
   187  			}
   188  		}
   189  
   190  		// Descend probability function summing probabilities.
   191  		var (
   192  			prob = 0.
   193  			sp   = 0.
   194  		)
   195  		for _, e := range m.table {
   196  			sp = 1
   197  			if e.score < score {
   198  				break
   199  			}
   200  			for i, f := range freqs {
   201  				sp *= math.Pow(f, float64(e.freqs[i]))
   202  			}
   203  			sp *= float64(e.occurrence)
   204  			prob += sp
   205  		}
   206  
   207  		mot := s.New()
   208  		sequtils.Truncate(mot, s, pos, pos+length)
   209  		f = append(f, &Feature{
   210  			MotifLocation: s,
   211  			MotifStart:    pos,
   212  			MotifEnd:      pos + length,
   213  			MotifScore:    score,
   214  			MotifProb:     prob,
   215  			MotifSeq:      mot,
   216  			MotifOrient:   s.Orientation(),
   217  			Moltype:       s.Moltype(),
   218  		})
   219  	}
   220  
   221  	return f
   222  }