github.com/biogo/biogo@v1.0.4/complexity/complexity.go (about)

     1  // Copyright ©2013 The bíogo Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // Package complexity provides routines for evaluating sequence complexity.
     6  package complexity
     7  
     8  import (
     9  	"github.com/biogo/biogo/seq"
    10  
    11  	"compress/zlib"
    12  	"fmt"
    13  	"math"
    14  )
    15  
    16  const tableLength = 10000
    17  
    18  var lnFacTable = genLnFac(tableLength)
    19  
    20  func genLnFac(l int) (table []float64) {
    21  	table = make([]float64, l)
    22  	lnfac := 0.
    23  
    24  	for i := 1; i < l; i++ {
    25  		lnfac += math.Log(float64(i))
    26  		table[i] = lnfac
    27  	}
    28  
    29  	return
    30  }
    31  
    32  const ln2pi = 1.8378770664093454835606594728112352797227949472755668
    33  
    34  func lnFac(x int) float64 {
    35  	if x < len(lnFacTable) {
    36  		return lnFacTable[x]
    37  	}
    38  	// use Sterling's approximation for queries outside the table:
    39  	return (float64(x)+0.5)*math.Log(float64(x)) - float64(x) + ln2pi/2
    40  }
    41  
    42  func logBaseK(logk, x float64) float64 {
    43  	return math.Log(x) / logk
    44  }
    45  
    46  // Entropic returns the entropic complexity of a segment of s defined by
    47  // start and end.
    48  func Entropic(s seq.Sequence, start, end int) (ce float64, err error) {
    49  	if start < s.Start() || end > s.End() {
    50  		err = fmt.Errorf("complex: index out of range")
    51  		return
    52  	}
    53  	if start == end {
    54  		return 0, nil
    55  	}
    56  
    57  	var N float64
    58  	k := s.Alphabet().Len()
    59  	logk := math.Log(float64(k))
    60  	n := make([]float64, k)
    61  
    62  	// tally classes
    63  	it := s.Alphabet().LetterIndex()
    64  	for i := start; i < end; i++ {
    65  		if ind := it[s.At(i).L]; ind >= 0 {
    66  			N++
    67  			n[ind]++
    68  		}
    69  	}
    70  
    71  	// -∑i=1..k((n_i/N)*log_k(n_i/N))
    72  	for i := 0; i < k; i++ {
    73  		if n[i] != 0 { // ignore zero counts
    74  			ce += n[i] * logBaseK(logk, n[i]/N)
    75  		}
    76  	}
    77  	ce = -ce / N
    78  
    79  	return
    80  }
    81  
    82  // WF returns the Wootton and Federhen complexity of a segment of s defined by
    83  // start and end.
    84  func WF(s seq.Sequence, start, end int) (cwf float64, err error) {
    85  	if start < s.Start() || end > s.End() {
    86  		err = fmt.Errorf("complex: index out of range")
    87  		return
    88  	}
    89  	if start == end {
    90  		return 0, nil
    91  	}
    92  
    93  	var N int
    94  	k := s.Alphabet().Len()
    95  	logk := math.Log(float64(k))
    96  	n := make([]int, k)
    97  
    98  	// tally classes
    99  	it := s.Alphabet().LetterIndex()
   100  	for i := start; i < end; i++ {
   101  		if ind := it[s.At(i).L]; ind >= 0 {
   102  			N++
   103  			n[ind]++
   104  		}
   105  	}
   106  
   107  	// 1/N*log_k(N!/∏i=1..k(n_i!))
   108  	cwf = lnFac(N)
   109  	for i := 0; i < k; i++ {
   110  		cwf -= lnFac(n[i])
   111  	}
   112  	cwf /= float64(N) * logk
   113  
   114  	return
   115  }
   116  
   117  type byteCounter int
   118  
   119  func (b *byteCounter) Write(p []byte) (n int, err error) {
   120  	*b += byteCounter(len(p))
   121  	return len(p), nil
   122  }
   123  
   124  var overhead = calcOverhead()
   125  
   126  func calcOverhead() byteCounter {
   127  	b := new(byteCounter)
   128  	z := zlib.NewWriter(b)
   129  	z.Write([]byte{0})
   130  	z.Close()
   131  
   132  	return *b - 1
   133  }
   134  
   135  // Z returns the zlib compression estimate of complexity of a segment of s defined by
   136  // start and end.
   137  func Z(s seq.Sequence, start, end int) (cz float64, err error) {
   138  	if start < s.Start() || end > s.End() {
   139  		err = fmt.Errorf("complex: index out of range")
   140  		return
   141  	}
   142  	if start == end {
   143  		return 0, nil
   144  	}
   145  
   146  	bc := new(byteCounter)
   147  	z := zlib.NewWriter(bc)
   148  	defer z.Close()
   149  	it := s.Alphabet().LetterIndex()
   150  	var N float64
   151  	for i := start; i < end; i++ {
   152  		if b := byte(s.At(i).L); it[b] >= 0 {
   153  			N++
   154  			z.Write([]byte{b})
   155  		}
   156  	}
   157  	z.Close()
   158  
   159  	cz = (float64(*bc - overhead)) / N
   160  
   161  	return
   162  }