github.com/biogo/biogo@v1.0.4/pwm/pwm.go (about) 1 // Copyright ©2011-2013 The bíogo Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 // Package pwm implements a position weight matrix search based on an 6 // algorithm by Deborah Toledo Flores. 7 package pwm 8 9 import ( 10 "github.com/biogo/biogo/feat" 11 "github.com/biogo/biogo/seq" 12 "github.com/biogo/biogo/seq/sequtils" 13 14 "fmt" 15 "math" 16 "sort" 17 ) 18 19 type probs struct { 20 score float64 21 freqs []int 22 occurrence int 23 } 24 25 type probTable []*probs 26 27 func (m probTable) Len() int { return len(m) } 28 func (m probTable) Less(i, j int) bool { return m[i].score > m[j].score } 29 func (m probTable) Swap(i, j int) { m[i], m[j] = m[j], m[i] } 30 31 type Feature struct { 32 MotifLocation feat.Feature 33 MotifStart int 34 MotifEnd int 35 MotifScore float64 36 MotifProb float64 37 MotifSeq seq.Sequence 38 MotifOrient feat.Orientation 39 Moltype feat.Moltype 40 } 41 42 func (f *Feature) Start() int { return f.MotifStart } 43 func (f *Feature) End() int { return f.MotifEnd } 44 func (f *Feature) Len() int { return f.MotifEnd - f.MotifStart } 45 func (f *Feature) Name() string { 46 return fmt.Sprintf("%s:[%d,%d)", f.Location().Name(), f.MotifStart, f.MotifEnd) 47 } 48 func (f *Feature) Description() string { return "PWM motif" } 49 func (f *Feature) Location() feat.Feature { return f.MotifLocation } 50 func (f *Feature) MolType() feat.Moltype { return f.Moltype } 51 func (f *Feature) Orientation() feat.Orientation { return f.MotifOrient } 52 53 type PWM struct { 54 matrix [][]float64 55 lookAhead []float64 56 table probTable 57 minScore float64 58 Format string // Format for probability values in attributes. 59 } 60 61 func New(matrix [][]float64) (m *PWM) { 62 m = &PWM{ 63 matrix: matrix, 64 lookAhead: make([]float64, len(matrix)), 65 minScore: math.MaxFloat64, 66 Format: "%e", 67 } 68 69 var maxVal, maxScore float64 70 71 for i := len(matrix) - 1; i >= 0; i-- { 72 maxVal = 0 73 for _, v := range matrix[i] { 74 if v > maxVal { 75 maxVal = v 76 } 77 } 78 maxScore += maxVal 79 m.lookAhead[i] = maxScore 80 } 81 82 for i := range matrix { 83 for j := range matrix[i] { 84 matrix[i][j] /= maxScore 85 } 86 m.lookAhead[i] /= maxScore 87 } 88 89 return 90 } 91 92 func (m *PWM) genTable(minScore, score float64, pos int, motif []byte) { 93 for i, s := range m.matrix[pos] { 94 motif[pos] = byte(i) 95 if pos < len(m.matrix)-1 { 96 if minScore-(score+s) > m.lookAhead[pos+1] { // will not be able to achieve minScore 97 continue 98 } 99 m.genTable(minScore, score+s, pos+1, motif) 100 } else { 101 if score+s < minScore { // will not be able to achieve minScore 102 continue 103 } 104 // count frequencies of states in current motif 105 freqs := make([]int, 4) 106 for _, j := range motif { 107 freqs[j]++ 108 } 109 found := false 110 for j := len(m.table) - 1; j >= 0; j-- { 111 table := m.table[j] 112 if table.score != score+s { 113 continue // if using insertion sort, if table.score > score+s then we can found = false and break 114 } 115 match := true 116 for k := range freqs { 117 if freqs[k] != table.freqs[k] { 118 match = false 119 break 120 } 121 } 122 if match { 123 table.occurrence++ 124 found = true 125 break 126 } 127 } 128 129 if !found { 130 m.table = append(m.table, &probs{score: score + s, freqs: freqs, occurrence: 1}) 131 } 132 } 133 } 134 135 } 136 137 type Sequence interface { 138 seq.Sequence 139 Moltype() feat.Moltype 140 Orientation() feat.Orientation 141 } 142 143 func (m *PWM) Search(s Sequence, start, end int, minScore float64) []feat.Feature { 144 if minScore < m.minScore { 145 m.table = make(probTable, 0) 146 m.genTable(minScore, 0, 0, make([]byte, len(m.matrix))) 147 sort.Sort(m.table) 148 } 149 150 var ( 151 index = s.Alphabet().LetterIndex() 152 length = len(m.matrix) 153 154 freqs = make([]float64, 4) 155 zeros = make([]float64, 4) 156 157 diff = 1 / float64(length) 158 159 f []feat.Feature 160 ) 161 162 LOOP: 163 for pos := start; pos+length < end; pos++ { 164 // Determine the score for this position. 165 var score = 0. 166 for i := 0; i < length; i++ { 167 base := index[s.At(pos+i).L] 168 if base < 0 || minScore-score > m.lookAhead[i] { // not valid base or will not be able to achieve minScore 169 continue LOOP 170 } else { 171 score += m.matrix[i][base] 172 } 173 } 174 175 if score < minScore { 176 continue 177 } 178 179 // Calculate base frequencies for window. 180 copy(freqs, zeros) 181 for i := pos; i < pos+length; i++ { 182 base := index[s.At(i).L] 183 if base >= 0 { 184 freqs[base] += diff 185 } else { // Probability for this pos will be meaningless; if N is tolerated, include N in valid alphabet - make special case? 186 continue LOOP 187 } 188 } 189 190 // Descend probability function summing probabilities. 191 var ( 192 prob = 0. 193 sp = 0. 194 ) 195 for _, e := range m.table { 196 sp = 1 197 if e.score < score { 198 break 199 } 200 for i, f := range freqs { 201 sp *= math.Pow(f, float64(e.freqs[i])) 202 } 203 sp *= float64(e.occurrence) 204 prob += sp 205 } 206 207 mot := s.New() 208 sequtils.Truncate(mot, s, pos, pos+length) 209 f = append(f, &Feature{ 210 MotifLocation: s, 211 MotifStart: pos, 212 MotifEnd: pos + length, 213 MotifScore: score, 214 MotifProb: prob, 215 MotifSeq: mot, 216 MotifOrient: s.Orientation(), 217 Moltype: s.Moltype(), 218 }) 219 } 220 221 return f 222 }