You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
85 lines
1.8 KiB
85 lines
1.8 KiB
package compress |
|
|
|
import "math" |
|
|
|
// Estimate returns a normalized compressibility estimate of block b. |
|
// Values close to zero are likely uncompressible. |
|
// Values above 0.1 are likely to be compressible. |
|
// Values above 0.5 are very compressible. |
|
// Very small lengths will return 0. |
|
func Estimate(b []byte) float64 { |
|
if len(b) < 16 { |
|
return 0 |
|
} |
|
|
|
// Correctly predicted order 1 |
|
hits := 0 |
|
lastMatch := false |
|
var o1 [256]byte |
|
var hist [256]int |
|
c1 := byte(0) |
|
for _, c := range b { |
|
if c == o1[c1] { |
|
// We only count a hit if there was two correct predictions in a row. |
|
if lastMatch { |
|
hits++ |
|
} |
|
lastMatch = true |
|
} else { |
|
lastMatch = false |
|
} |
|
o1[c1] = c |
|
c1 = c |
|
hist[c]++ |
|
} |
|
|
|
// Use x^0.6 to give better spread |
|
prediction := math.Pow(float64(hits)/float64(len(b)), 0.6) |
|
|
|
// Calculate histogram distribution |
|
variance := float64(0) |
|
avg := float64(len(b)) / 256 |
|
|
|
for _, v := range hist { |
|
Δ := float64(v) - avg |
|
variance += Δ * Δ |
|
} |
|
|
|
stddev := math.Sqrt(float64(variance)) / float64(len(b)) |
|
exp := math.Sqrt(1 / float64(len(b))) |
|
|
|
// Subtract expected stddev |
|
stddev -= exp |
|
if stddev < 0 { |
|
stddev = 0 |
|
} |
|
stddev *= 1 + exp |
|
|
|
// Use x^0.4 to give better spread |
|
entropy := math.Pow(stddev, 0.4) |
|
|
|
// 50/50 weight between prediction and histogram distribution |
|
return math.Pow((prediction+entropy)/2, 0.9) |
|
} |
|
|
|
// ShannonEntropyBits returns the number of bits minimum required to represent |
|
// an entropy encoding of the input bytes. |
|
// https://en.wiktionary.org/wiki/Shannon_entropy |
|
func ShannonEntropyBits(b []byte) int { |
|
if len(b) == 0 { |
|
return 0 |
|
} |
|
var hist [256]int |
|
for _, c := range b { |
|
hist[c]++ |
|
} |
|
shannon := float64(0) |
|
invTotal := 1.0 / float64(len(b)) |
|
for _, v := range hist[:] { |
|
if v > 0 { |
|
n := float64(v) |
|
shannon += math.Ceil(-math.Log2(n*invTotal) * n) |
|
} |
|
} |
|
return int(math.Ceil(shannon)) |
|
}
|
|
|