github.com/bir3/gocompiler@v0.9.2202/extra/compress/compressible.go (about)

     1  package compress
     2  
     3  import "math"
     4  
     5  // Estimate returns a normalized compressibility estimate of block b.
     6  // Values close to zero are likely uncompressible.
     7  // Values above 0.1 are likely to be compressible.
     8  // Values above 0.5 are very compressible.
     9  // Very small lengths will return 0.
    10  func Estimate(b []byte) float64 {
    11  	if len(b) < 16 {
    12  		return 0
    13  	}
    14  
    15  	// Correctly predicted order 1
    16  	hits := 0
    17  	lastMatch := false
    18  	var o1 [256]byte
    19  	var hist [256]int
    20  	c1 := byte(0)
    21  	for _, c := range b {
    22  		if c == o1[c1] {
    23  			// We only count a hit if there was two correct predictions in a row.
    24  			if lastMatch {
    25  				hits++
    26  			}
    27  			lastMatch = true
    28  		} else {
    29  			lastMatch = false
    30  		}
    31  		o1[c1] = c
    32  		c1 = c
    33  		hist[c]++
    34  	}
    35  
    36  	// Use x^0.6 to give better spread
    37  	prediction := math.Pow(float64(hits)/float64(len(b)), 0.6)
    38  
    39  	// Calculate histogram distribution
    40  	variance := float64(0)
    41  	avg := float64(len(b)) / 256
    42  
    43  	for _, v := range hist {
    44  		Δ := float64(v) - avg
    45  		variance += Δ * Δ
    46  	}
    47  
    48  	stddev := math.Sqrt(float64(variance)) / float64(len(b))
    49  	exp := math.Sqrt(1 / float64(len(b)))
    50  
    51  	// Subtract expected stddev
    52  	stddev -= exp
    53  	if stddev < 0 {
    54  		stddev = 0
    55  	}
    56  	stddev *= 1 + exp
    57  
    58  	// Use x^0.4 to give better spread
    59  	entropy := math.Pow(stddev, 0.4)
    60  
    61  	// 50/50 weight between prediction and histogram distribution
    62  	return math.Pow((prediction+entropy)/2, 0.9)
    63  }
    64  
    65  // ShannonEntropyBits returns the number of bits minimum required to represent
    66  // an entropy encoding of the input bytes.
    67  // https://en.wiktionary.org/wiki/Shannon_entropy
    68  func ShannonEntropyBits(b []byte) int {
    69  	if len(b) == 0 {
    70  		return 0
    71  	}
    72  	var hist [256]int
    73  	for _, c := range b {
    74  		hist[c]++
    75  	}
    76  	shannon := float64(0)
    77  	invTotal := 1.0 / float64(len(b))
    78  	for _, v := range hist[:] {
    79  		if v > 0 {
    80  			n := float64(v)
    81  			shannon += math.Ceil(-math.Log2(n*invTotal) * n)
    82  		}
    83  	}
    84  	return int(math.Ceil(shannon))
    85  }