github.com/zhangyunhao116/wyhash@v0.4.1-0.20220217162229-7d42996fa899/smhasher_test.go (about)

     1  package wyhash
     2  
     3  // From src/hash/maphash/smhasher_test.go
     4  
     5  import (
     6  	"fmt"
     7  	"math"
     8  	"math/rand"
     9  	"runtime"
    10  	"strings"
    11  	"testing"
    12  	"unsafe"
    13  )
    14  
    15  // Smhasher is a torture test for hash functions.
    16  // https://code.google.com/p/smhasher/
    17  // This code is a port of some of the Smhasher tests to Go.
    18  
// fixedSeed holds the seed returned by a single makeSeed call made when the
// package-level variables are initialized.
var fixedSeed = makeSeed()
    20  
    21  // Sanity checks.
    22  // hash should not depend on values outside key.
    23  // hash should not depend on alignment.
    24  func TestSmhasherSanity(t *testing.T) {
    25  	r := rand.New(rand.NewSource(1234))
    26  	const REP = 10
    27  	const KEYMAX = 128
    28  	const PAD = 16
    29  	const OFFMAX = 16
    30  	for k := 0; k < REP; k++ {
    31  		for n := 0; n < KEYMAX; n++ {
    32  			for i := 0; i < OFFMAX; i++ {
    33  				var b [KEYMAX + OFFMAX + 2*PAD]byte
    34  				var c [KEYMAX + OFFMAX + 2*PAD]byte
    35  				randBytes(r, b[:])
    36  				randBytes(r, c[:])
    37  				copy(c[PAD+i:PAD+i+n], b[PAD:PAD+n])
    38  				if bytesHash(b[PAD:PAD+n]) != bytesHash(c[PAD+i:PAD+i+n]) {
    39  					t.Errorf("hash depends on bytes outside key")
    40  				}
    41  			}
    42  		}
    43  	}
    44  }
    45  
    46  func bytesHash(b []byte) uint64 {
    47  	return Sum64(b)
    48  }
    49  func stringHash(s string) uint64 {
    50  	return Sum64String(s)
    51  }
    52  
// hashSize is the number of bits in the hash values under test.
const hashSize = 64
    54  
    55  func randBytes(r *rand.Rand, b []byte) {
    56  	r.Read(b) // can't fail
    57  }
    58  
    59  // A hashSet measures the frequency of hash collisions.
    60  type hashSet struct {
    61  	m map[uint64]struct{} // set of hashes added
    62  	n int                 // number of hashes added
    63  }
    64  
    65  func newHashSet() *hashSet {
    66  	return &hashSet{make(map[uint64]struct{}), 0}
    67  }
    68  func (s *hashSet) add(h uint64) {
    69  	s.m[h] = struct{}{}
    70  	s.n++
    71  }
    72  func (s *hashSet) addS(x string) {
    73  	s.add(stringHash(x))
    74  }
    75  
    76  func (s *hashSet) addB(x []byte) {
    77  	s.add(bytesHash(x))
    78  }
    79  
    80  func (s *hashSet) addS_seed(x string, seed seed) {
    81  	s.add(Sum64StringWithSeed(x, seed.s))
    82  }
    83  
    84  func (s *hashSet) check(t *testing.T) {
    85  	const SLOP = 10.0
    86  	collisions := s.n - len(s.m)
    87  	pairs := int64(s.n) * int64(s.n-1) / 2
    88  	expected := float64(pairs) / math.Pow(2.0, float64(hashSize))
    89  	stddev := math.Sqrt(expected)
    90  	if float64(collisions) > expected+SLOP*(3*stddev+1) {
    91  		t.Errorf("unexpected number of collisions: got=%d mean=%f stddev=%f", collisions, expected, stddev)
    92  	}
    93  }
    94  
    95  // a string plus adding zeros must make distinct hashes
    96  func TestSmhasherAppendedZeros(t *testing.T) {
    97  	s := "hello" + strings.Repeat("\x00", 256)
    98  	h := newHashSet()
    99  	for i := 0; i <= len(s); i++ {
   100  		h.addS(s[:i])
   101  	}
   102  	h.check(t)
   103  }
   104  
   105  // All 0-3 byte strings have distinct hashes.
   106  func TestSmhasherSmallKeys(t *testing.T) {
   107  	h := newHashSet()
   108  	var b [3]byte
   109  	for i := 0; i < 256; i++ {
   110  		b[0] = byte(i)
   111  		h.addB(b[:1])
   112  		for j := 0; j < 256; j++ {
   113  			b[1] = byte(j)
   114  			h.addB(b[:2])
   115  			if !testing.Short() {
   116  				for k := 0; k < 256; k++ {
   117  					b[2] = byte(k)
   118  					h.addB(b[:3])
   119  				}
   120  			}
   121  		}
   122  	}
   123  	h.check(t)
   124  }
   125  
   126  // Different length strings of all zeros have distinct hashes.
   127  func TestSmhasherZeros(t *testing.T) {
   128  	N := 256 * 1024
   129  	if testing.Short() {
   130  		N = 1024
   131  	}
   132  	h := newHashSet()
   133  	b := make([]byte, N)
   134  	for i := 0; i <= N; i++ {
   135  		h.addB(b[:i])
   136  	}
   137  	h.check(t)
   138  }
   139  
   140  // Strings with up to two nonzero bytes all have distinct hashes.
   141  func TestSmhasherTwoNonzero(t *testing.T) {
   142  	if runtime.GOARCH == "wasm" {
   143  		t.Skip("Too slow on wasm")
   144  	}
   145  	if testing.Short() {
   146  		t.Skip("Skipping in short mode")
   147  	}
   148  	h := newHashSet()
   149  	for n := 2; n <= 16; n++ {
   150  		twoNonZero(h, n)
   151  	}
   152  	h.check(t)
   153  }
   154  func twoNonZero(h *hashSet, n int) {
   155  	b := make([]byte, n)
   156  
   157  	// all zero
   158  	h.addB(b)
   159  
   160  	// one non-zero byte
   161  	for i := 0; i < n; i++ {
   162  		for x := 1; x < 256; x++ {
   163  			b[i] = byte(x)
   164  			h.addB(b)
   165  			b[i] = 0
   166  		}
   167  	}
   168  
   169  	// two non-zero bytes
   170  	for i := 0; i < n; i++ {
   171  		for x := 1; x < 256; x++ {
   172  			b[i] = byte(x)
   173  			for j := i + 1; j < n; j++ {
   174  				for y := 1; y < 256; y++ {
   175  					b[j] = byte(y)
   176  					h.addB(b)
   177  					b[j] = 0
   178  				}
   179  			}
   180  			b[i] = 0
   181  		}
   182  	}
   183  }
   184  
   185  // Test strings with repeats, like "abcdabcdabcdabcd..."
   186  func TestSmhasherCyclic(t *testing.T) {
   187  	if testing.Short() {
   188  		t.Skip("Skipping in short mode")
   189  	}
   190  	r := rand.New(rand.NewSource(1234))
   191  	const REPEAT = 8
   192  	const N = 1000000
   193  	for n := 4; n <= 12; n++ {
   194  		h := newHashSet()
   195  		b := make([]byte, REPEAT*n)
   196  		for i := 0; i < N; i++ {
   197  			b[0] = byte(i * 79 % 97)
   198  			b[1] = byte(i * 43 % 137)
   199  			b[2] = byte(i * 151 % 197)
   200  			b[3] = byte(i * 199 % 251)
   201  			randBytes(r, b[4:n])
   202  			for j := n; j < n*REPEAT; j++ {
   203  				b[j] = b[j-n]
   204  			}
   205  			h.addB(b)
   206  		}
   207  		h.check(t)
   208  	}
   209  }
   210  
   211  // Test strings with only a few bits set
   212  func TestSmhasherSparse(t *testing.T) {
   213  	if runtime.GOARCH == "wasm" {
   214  		t.Skip("Too slow on wasm")
   215  	}
   216  	if testing.Short() {
   217  		t.Skip("Skipping in short mode")
   218  	}
   219  	sparse(t, 32, 6)
   220  	sparse(t, 40, 6)
   221  	sparse(t, 48, 5)
   222  	sparse(t, 56, 5)
   223  	sparse(t, 64, 5)
   224  	sparse(t, 96, 4)
   225  	sparse(t, 256, 3)
   226  	sparse(t, 2048, 2)
   227  }
   228  func sparse(t *testing.T, n int, k int) {
   229  	b := make([]byte, n/8)
   230  	h := newHashSet()
   231  	setbits(h, b, 0, k)
   232  	h.check(t)
   233  }
   234  
   235  // set up to k bits at index i and greater
   236  func setbits(h *hashSet, b []byte, i int, k int) {
   237  	h.addB(b)
   238  	if k == 0 {
   239  		return
   240  	}
   241  	for j := i; j < len(b)*8; j++ {
   242  		b[j/8] |= byte(1 << uint(j&7))
   243  		setbits(h, b, j+1, k-1)
   244  		b[j/8] &= byte(^(1 << uint(j&7)))
   245  	}
   246  }
   247  
   248  // Test all possible combinations of n blocks from the set s.
   249  // "permutation" is a bad name here, but it is what Smhasher uses.
   250  func TestSmhasherPermutation(t *testing.T) {
   251  	if runtime.GOARCH == "wasm" {
   252  		t.Skip("Too slow on wasm")
   253  	}
   254  	if testing.Short() {
   255  		t.Skip("Skipping in short mode")
   256  	}
   257  	permutation(t, []uint32{0, 1, 2, 3, 4, 5, 6, 7}, 8)
   258  	permutation(t, []uint32{0, 1 << 29, 2 << 29, 3 << 29, 4 << 29, 5 << 29, 6 << 29, 7 << 29}, 8)
   259  	permutation(t, []uint32{0, 1}, 20)
   260  	permutation(t, []uint32{0, 1 << 31}, 20)
   261  	permutation(t, []uint32{0, 1, 2, 3, 4, 5, 6, 7, 1 << 29, 2 << 29, 3 << 29, 4 << 29, 5 << 29, 6 << 29, 7 << 29}, 6)
   262  }
   263  func permutation(t *testing.T, s []uint32, n int) {
   264  	b := make([]byte, n*4)
   265  	h := newHashSet()
   266  	genPerm(h, b, s, 0)
   267  	h.check(t)
   268  }
   269  func genPerm(h *hashSet, b []byte, s []uint32, n int) {
   270  	h.addB(b[:n])
   271  	if n == len(b) {
   272  		return
   273  	}
   274  	for _, v := range s {
   275  		b[n] = byte(v)
   276  		b[n+1] = byte(v >> 8)
   277  		b[n+2] = byte(v >> 16)
   278  		b[n+3] = byte(v >> 24)
   279  		genPerm(h, b, s, n+4)
   280  	}
   281  }
   282  
   283  type key interface {
   284  	clear()              // set bits all to 0
   285  	random(r *rand.Rand) // set key to something random
   286  	bits() int           // how many bits key has
   287  	flipBit(i int)       // flip bit i of the key
   288  	hash() uint64        // hash the key
   289  	name() string        // for error reporting
   290  }
   291  
   292  type bytesKey struct {
   293  	b []byte
   294  }
   295  
   296  func (k *bytesKey) clear() {
   297  	for i := range k.b {
   298  		k.b[i] = 0
   299  	}
   300  }
   301  func (k *bytesKey) random(r *rand.Rand) {
   302  	randBytes(r, k.b)
   303  }
   304  func (k *bytesKey) bits() int {
   305  	return len(k.b) * 8
   306  }
   307  func (k *bytesKey) flipBit(i int) {
   308  	k.b[i>>3] ^= byte(1 << uint(i&7))
   309  }
   310  func (k *bytesKey) hash() uint64 {
   311  	return bytesHash(k.b)
   312  }
   313  func (k *bytesKey) name() string {
   314  	return fmt.Sprintf("bytes%d", len(k.b))
   315  }
   316  
   317  // Flipping a single bit of a key should flip each output bit with 50% probability.
   318  func TestSmhasherAvalanche(t *testing.T) {
   319  	if runtime.GOARCH == "wasm" {
   320  		t.Skip("Too slow on wasm")
   321  	}
   322  	if testing.Short() {
   323  		t.Skip("Skipping in short mode")
   324  	}
   325  	avalancheTest1(t, &bytesKey{make([]byte, 2)})
   326  	avalancheTest1(t, &bytesKey{make([]byte, 4)})
   327  	avalancheTest1(t, &bytesKey{make([]byte, 8)})
   328  	avalancheTest1(t, &bytesKey{make([]byte, 16)})
   329  	avalancheTest1(t, &bytesKey{make([]byte, 32)})
   330  	avalancheTest1(t, &bytesKey{make([]byte, 200)})
   331  }
   332  func avalancheTest1(t *testing.T, k key) {
   333  	const REP = 100000
   334  	r := rand.New(rand.NewSource(1234))
   335  	n := k.bits()
   336  
   337  	// grid[i][j] is a count of whether flipping
   338  	// input bit i affects output bit j.
   339  	grid := make([][hashSize]int, n)
   340  
   341  	for z := 0; z < REP; z++ {
   342  		// pick a random key, hash it
   343  		k.random(r)
   344  		h := k.hash()
   345  
   346  		// flip each bit, hash & compare the results
   347  		for i := 0; i < n; i++ {
   348  			k.flipBit(i)
   349  			d := h ^ k.hash()
   350  			k.flipBit(i)
   351  
   352  			// record the effects of that bit flip
   353  			g := &grid[i]
   354  			for j := 0; j < hashSize; j++ {
   355  				g[j] += int(d & 1)
   356  				d >>= 1
   357  			}
   358  		}
   359  	}
   360  
   361  	// Each entry in the grid should be about REP/2.
   362  	// More precisely, we did N = k.bits() * hashSize experiments where
   363  	// each is the sum of REP coin flips. We want to find bounds on the
   364  	// sum of coin flips such that a truly random experiment would have
   365  	// all sums inside those bounds with 99% probability.
   366  	N := n * hashSize
   367  	var c float64
   368  	// find c such that Prob(mean-c*stddev < x < mean+c*stddev)^N > .9999
   369  	for c = 0.0; math.Pow(math.Erf(c/math.Sqrt(2)), float64(N)) < .9999; c += .1 {
   370  	}
   371  	c *= 4.0 // allowed slack - we don't need to be perfectly random
   372  	mean := .5 * REP
   373  	stddev := .5 * math.Sqrt(REP)
   374  	low := int(mean - c*stddev)
   375  	high := int(mean + c*stddev)
   376  	for i := 0; i < n; i++ {
   377  		for j := 0; j < hashSize; j++ {
   378  			x := grid[i][j]
   379  			if x < low || x > high {
   380  				t.Errorf("bad bias for %s bit %d -> bit %d: %d/%d\n", k.name(), i, j, x, REP)
   381  			}
   382  		}
   383  	}
   384  }
   385  
   386  // All bit rotations of a set of distinct keys
   387  func TestSmhasherWindowed(t *testing.T) {
   388  	windowed(t, &bytesKey{make([]byte, 128)})
   389  }
   390  func windowed(t *testing.T, k key) {
   391  	if runtime.GOARCH == "wasm" {
   392  		t.Skip("Too slow on wasm")
   393  	}
   394  	if testing.Short() {
   395  		t.Skip("Skipping in short mode")
   396  	}
   397  	const BITS = 16
   398  
   399  	for r := 0; r < k.bits(); r++ {
   400  		h := newHashSet()
   401  		for i := 0; i < 1<<BITS; i++ {
   402  			k.clear()
   403  			for j := 0; j < BITS; j++ {
   404  				if i>>uint(j)&1 != 0 {
   405  					k.flipBit((j + r) % k.bits())
   406  				}
   407  			}
   408  			h.add(k.hash())
   409  		}
   410  		h.check(t)
   411  	}
   412  }
   413  
   414  // All keys of the form prefix + [A-Za-z0-9]*N + suffix.
   415  func TestSmhasherText(t *testing.T) {
   416  	if testing.Short() {
   417  		t.Skip("Skipping in short mode")
   418  	}
   419  	text(t, "Foo", "Bar")
   420  	text(t, "FooBar", "")
   421  	text(t, "", "FooBar")
   422  }
   423  func text(t *testing.T, prefix, suffix string) {
   424  	const N = 4
   425  	const S = "ABCDEFGHIJKLMNOPQRSTabcdefghijklmnopqrst0123456789"
   426  	const L = len(S)
   427  	b := make([]byte, len(prefix)+N+len(suffix))
   428  	copy(b, prefix)
   429  	copy(b[len(prefix)+N:], suffix)
   430  	h := newHashSet()
   431  	c := b[len(prefix):]
   432  	for i := 0; i < L; i++ {
   433  		c[0] = S[i]
   434  		for j := 0; j < L; j++ {
   435  			c[1] = S[j]
   436  			for k := 0; k < L; k++ {
   437  				c[2] = S[k]
   438  				for x := 0; x < L; x++ {
   439  					c[3] = S[x]
   440  					h.addB(b)
   441  				}
   442  			}
   443  		}
   444  	}
   445  	h.check(t)
   446  }
   447  
   448  // Make sure different seed values generate different hashes.
   449  func TestSmhasherSeed(t *testing.T) {
   450  	if unsafe.Sizeof(uintptr(0)) == 4 {
   451  		t.Skip("32-bit platforms don't have ideal seed-input distributions (see issue 33988)")
   452  	}
   453  	h := newHashSet()
   454  	const N = 100000
   455  	s := "hello"
   456  	for i := 0; i < N; i++ {
   457  		h.addS_seed(s, seed{s: uint64(i + 1)})
   458  		h.addS_seed(s, seed{s: uint64(i+1) << 32}) // make sure high bits are used
   459  	}
   460  	h.check(t)
   461  }
   462  
   463  type seed struct {
   464  	s uint64
   465  }
   466  
   467  // makeSeed returns a new random seed.
   468  func makeSeed() seed {
   469  	var s1, s2 uint64
   470  	for {
   471  		s1 = uint64(runtime_fastrand())
   472  		s2 = uint64(runtime_fastrand())
   473  		// We use seed 0 to indicate an uninitialized seed/hash,
   474  		// so keep trying until we get a non-zero seed.
   475  		if s1|s2 != 0 {
   476  			break
   477  		}
   478  	}
   479  	return seed{s: s1<<32 + s2}
   480  }