github.com/biogo/biogo@v1.0.4/complexity/complexity.go (about) 1 // Copyright ©2013 The bíogo Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 // Package complexity provides routines for evaluating sequence complexity. 6 package complexity 7 8 import ( 9 "github.com/biogo/biogo/seq" 10 11 "compress/zlib" 12 "fmt" 13 "math" 14 ) 15 16 const tableLength = 10000 17 18 var lnFacTable = genLnFac(tableLength) 19 20 func genLnFac(l int) (table []float64) { 21 table = make([]float64, l) 22 lnfac := 0. 23 24 for i := 1; i < l; i++ { 25 lnfac += math.Log(float64(i)) 26 table[i] = lnfac 27 } 28 29 return 30 } 31 32 const ln2pi = 1.8378770664093454835606594728112352797227949472755668 33 34 func lnFac(x int) float64 { 35 if x < len(lnFacTable) { 36 return lnFacTable[x] 37 } 38 // use Sterling's approximation for queries outside the table: 39 return (float64(x)+0.5)*math.Log(float64(x)) - float64(x) + ln2pi/2 40 } 41 42 func logBaseK(logk, x float64) float64 { 43 return math.Log(x) / logk 44 } 45 46 // Entropic returns the entropic complexity of a segment of s defined by 47 // start and end. 48 func Entropic(s seq.Sequence, start, end int) (ce float64, err error) { 49 if start < s.Start() || end > s.End() { 50 err = fmt.Errorf("complex: index out of range") 51 return 52 } 53 if start == end { 54 return 0, nil 55 } 56 57 var N float64 58 k := s.Alphabet().Len() 59 logk := math.Log(float64(k)) 60 n := make([]float64, k) 61 62 // tally classes 63 it := s.Alphabet().LetterIndex() 64 for i := start; i < end; i++ { 65 if ind := it[s.At(i).L]; ind >= 0 { 66 N++ 67 n[ind]++ 68 } 69 } 70 71 // -∑i=1..k((n_i/N)*log_k(n_i/N)) 72 for i := 0; i < k; i++ { 73 if n[i] != 0 { // ignore zero counts 74 ce += n[i] * logBaseK(logk, n[i]/N) 75 } 76 } 77 ce = -ce / N 78 79 return 80 } 81 82 // WF returns the Wootton and Federhen complexity of a segment of s defined by 83 // start and end. 84 func WF(s seq.Sequence, start, end int) (cwf float64, err error) { 85 if start < s.Start() || end > s.End() { 86 err = fmt.Errorf("complex: index out of range") 87 return 88 } 89 if start == end { 90 return 0, nil 91 } 92 93 var N int 94 k := s.Alphabet().Len() 95 logk := math.Log(float64(k)) 96 n := make([]int, k) 97 98 // tally classes 99 it := s.Alphabet().LetterIndex() 100 for i := start; i < end; i++ { 101 if ind := it[s.At(i).L]; ind >= 0 { 102 N++ 103 n[ind]++ 104 } 105 } 106 107 // 1/N*log_k(N!/∏i=1..k(n_i!)) 108 cwf = lnFac(N) 109 for i := 0; i < k; i++ { 110 cwf -= lnFac(n[i]) 111 } 112 cwf /= float64(N) * logk 113 114 return 115 } 116 117 type byteCounter int 118 119 func (b *byteCounter) Write(p []byte) (n int, err error) { 120 *b += byteCounter(len(p)) 121 return len(p), nil 122 } 123 124 var overhead = calcOverhead() 125 126 func calcOverhead() byteCounter { 127 b := new(byteCounter) 128 z := zlib.NewWriter(b) 129 z.Write([]byte{0}) 130 z.Close() 131 132 return *b - 1 133 } 134 135 // Z returns the zlib compression estimate of complexity of a segment of s defined by 136 // start and end. 137 func Z(s seq.Sequence, start, end int) (cz float64, err error) { 138 if start < s.Start() || end > s.End() { 139 err = fmt.Errorf("complex: index out of range") 140 return 141 } 142 if start == end { 143 return 0, nil 144 } 145 146 bc := new(byteCounter) 147 z := zlib.NewWriter(bc) 148 defer z.Close() 149 it := s.Alphabet().LetterIndex() 150 var N float64 151 for i := start; i < end; i++ { 152 if b := byte(s.At(i).L); it[b] >= 0 { 153 N++ 154 z.Write([]byte{b}) 155 } 156 } 157 z.Close() 158 159 cz = (float64(*bc - overhead)) / N 160 161 return 162 }