github.com/pidato/unsafe@v0.1.4/memory/hash/smash_test.go (about)

     1  package hash
     2  
     3  import (
     4  	"fmt"
     5  	"math"
     6  	"math/rand"
     7  	"runtime"
     8  	"strings"
     9  	"testing"
    10  	"unsafe"
    11  )
    12  
    13  // Smhasher is a torture test for hash functions.
    14  // https://code.google.com/p/smhasher/
    15  // This code is a port of some of the Smhasher tests to Go.
    16  
// fixedSeed holds a seed obtained from makeSeed once, when the package's
// variables are initialized.
var fixedSeed = makeSeed()
    18  
    19  // Sanity checks.
    20  // hash should not depend on values outside key.
    21  // hash should not depend on alignment.
    22  func TestSmhasherSanity(t *testing.T) {
    23  	r := rand.New(rand.NewSource(1234))
    24  	const REP = 10
    25  	const KEYMAX = 128
    26  	const PAD = 16
    27  	const OFFMAX = 16
    28  	for k := 0; k < REP; k++ {
    29  		for n := 0; n < KEYMAX; n++ {
    30  			for i := 0; i < OFFMAX; i++ {
    31  				var b [KEYMAX + OFFMAX + 2*PAD]byte
    32  				var c [KEYMAX + OFFMAX + 2*PAD]byte
    33  				randBytes(r, b[:])
    34  				randBytes(r, c[:])
    35  				copy(c[PAD+i:PAD+i+n], b[PAD:PAD+n])
    36  				if bytesHash(b[PAD:PAD+n]) != bytesHash(c[PAD+i:PAD+i+n]) {
    37  					t.Errorf("hash depends on bytes outside key")
    38  				}
    39  			}
    40  		}
    41  	}
    42  }
    43  
    44  func bytesHash(b []byte) uint64 {
    45  	return Hash(*(*unsafe.Pointer)(unsafe.Pointer(&b)), uint64(len(b)), DefaultSeed)
    46  }
    47  func stringHash(s string) uint64 {
    48  	return String(s)
    49  }
    50  
// hashSize is the number of bits in a hash value. The collision check uses it
// as the exponent of the hash space (2^hashSize) and the avalanche test uses
// it as the number of output bits to tally.
const hashSize = 64
    52  
    53  func randBytes(r *rand.Rand, b []byte) {
    54  	r.Read(b) // can't fail
    55  }
    56  
    57  // A hashSet measures the frequency of hash collisions.
    58  type hashSet struct {
    59  	m map[uint64]struct{} // set of hashes added
    60  	n int                 // number of hashes added
    61  }
    62  
    63  func newHashSet() *hashSet {
    64  	return &hashSet{make(map[uint64]struct{}), 0}
    65  }
    66  func (s *hashSet) add(h uint64) {
    67  	s.m[h] = struct{}{}
    68  	s.n++
    69  }
    70  func (s *hashSet) addS(x string) {
    71  	s.add(stringHash(x))
    72  }
    73  
    74  func (s *hashSet) addB(x []byte) {
    75  	s.add(bytesHash(x))
    76  }
    77  
    78  func (s *hashSet) addS_seed(x string, seed seed) {
    79  	s.add(Hash(*(*unsafe.Pointer)(unsafe.Pointer(&x)), uint64(len(x)), seed.s))
    80  }
    81  
    82  func (s *hashSet) check(t *testing.T) {
    83  	const SLOP = 10.0
    84  	collisions := s.n - len(s.m)
    85  	pairs := int64(s.n) * int64(s.n-1) / 2
    86  	expected := float64(pairs) / math.Pow(2.0, float64(hashSize))
    87  	stddev := math.Sqrt(expected)
    88  	if float64(collisions) > expected+SLOP*(3*stddev+1) {
    89  		t.Errorf("unexpected number of collisions: got=%d mean=%f stddev=%f", collisions, expected, stddev)
    90  	}
    91  }
    92  
    93  // a string plus adding zeros must make distinct hashes
    94  func TestSmhasherAppendedZeros(t *testing.T) {
    95  	s := "hello" + strings.Repeat("\x00", 256)
    96  	h := newHashSet()
    97  	for i := 0; i <= len(s); i++ {
    98  		h.addS(s[:i])
    99  	}
   100  	h.check(t)
   101  }
   102  
   103  // All 0-3 byte strings have distinct hashes.
   104  func TestSmhasherSmallKeys(t *testing.T) {
   105  	h := newHashSet()
   106  	var b [3]byte
   107  	for i := 0; i < 256; i++ {
   108  		b[0] = byte(i)
   109  		h.addB(b[:1])
   110  		for j := 0; j < 256; j++ {
   111  			b[1] = byte(j)
   112  			h.addB(b[:2])
   113  			if !testing.Short() {
   114  				for k := 0; k < 256; k++ {
   115  					b[2] = byte(k)
   116  					h.addB(b[:3])
   117  				}
   118  			}
   119  		}
   120  	}
   121  	h.check(t)
   122  }
   123  
   124  // Different length strings of all zeros have distinct hashes.
   125  func TestSmhasherZeros(t *testing.T) {
   126  	N := 256 * 1024
   127  	if testing.Short() {
   128  		N = 1024
   129  	}
   130  	h := newHashSet()
   131  	b := make([]byte, N)
   132  	for i := 0; i <= N; i++ {
   133  		h.addB(b[:i])
   134  	}
   135  	h.check(t)
   136  }
   137  
   138  // Strings with up to two nonzero bytes all have distinct hashes.
   139  func TestSmhasherTwoNonzero(t *testing.T) {
   140  	if runtime.GOARCH == "wasm" {
   141  		t.Skip("Too slow on wasm")
   142  	}
   143  	if testing.Short() {
   144  		t.Skip("Skipping in short mode")
   145  	}
   146  	h := newHashSet()
   147  	for n := 2; n <= 16; n++ {
   148  		twoNonZero(h, n)
   149  	}
   150  	h.check(t)
   151  }
   152  func twoNonZero(h *hashSet, n int) {
   153  	b := make([]byte, n)
   154  
   155  	// all zero
   156  	h.addB(b)
   157  
   158  	// one non-zero byte
   159  	for i := 0; i < n; i++ {
   160  		for x := 1; x < 256; x++ {
   161  			b[i] = byte(x)
   162  			h.addB(b)
   163  			b[i] = 0
   164  		}
   165  	}
   166  
   167  	// two non-zero bytes
   168  	for i := 0; i < n; i++ {
   169  		for x := 1; x < 256; x++ {
   170  			b[i] = byte(x)
   171  			for j := i + 1; j < n; j++ {
   172  				for y := 1; y < 256; y++ {
   173  					b[j] = byte(y)
   174  					h.addB(b)
   175  					b[j] = 0
   176  				}
   177  			}
   178  			b[i] = 0
   179  		}
   180  	}
   181  }
   182  
   183  // Test strings with repeats, like "abcdabcdabcdabcd..."
   184  func TestSmhasherCyclic(t *testing.T) {
   185  	if testing.Short() {
   186  		t.Skip("Skipping in short mode")
   187  	}
   188  	r := rand.New(rand.NewSource(1234))
   189  	const REPEAT = 8
   190  	const N = 1000000
   191  	for n := 4; n <= 12; n++ {
   192  		h := newHashSet()
   193  		b := make([]byte, REPEAT*n)
   194  		for i := 0; i < N; i++ {
   195  			b[0] = byte(i * 79 % 97)
   196  			b[1] = byte(i * 43 % 137)
   197  			b[2] = byte(i * 151 % 197)
   198  			b[3] = byte(i * 199 % 251)
   199  			randBytes(r, b[4:n])
   200  			for j := n; j < n*REPEAT; j++ {
   201  				b[j] = b[j-n]
   202  			}
   203  			h.addB(b)
   204  		}
   205  		h.check(t)
   206  	}
   207  }
   208  
   209  // Test strings with only a few bits set
   210  func TestSmhasherSparse(t *testing.T) {
   211  	if runtime.GOARCH == "wasm" {
   212  		t.Skip("Too slow on wasm")
   213  	}
   214  	if testing.Short() {
   215  		t.Skip("Skipping in short mode")
   216  	}
   217  	sparse(t, 32, 6)
   218  	sparse(t, 40, 6)
   219  	sparse(t, 48, 5)
   220  	sparse(t, 56, 5)
   221  	sparse(t, 64, 5)
   222  	sparse(t, 96, 4)
   223  	sparse(t, 256, 3)
   224  	sparse(t, 2048, 2)
   225  }
   226  func sparse(t *testing.T, n int, k int) {
   227  	b := make([]byte, n/8)
   228  	h := newHashSet()
   229  	setbits(h, b, 0, k)
   230  	h.check(t)
   231  }
   232  
   233  // set up to k bits at index i and greater
   234  func setbits(h *hashSet, b []byte, i int, k int) {
   235  	h.addB(b)
   236  	if k == 0 {
   237  		return
   238  	}
   239  	for j := i; j < len(b)*8; j++ {
   240  		b[j/8] |= byte(1 << uint(j&7))
   241  		setbits(h, b, j+1, k-1)
   242  		b[j/8] &= byte(^(1 << uint(j&7)))
   243  	}
   244  }
   245  
   246  // Test all possible combinations of n blocks from the set s.
   247  // "permutation" is a bad name here, but it is what Smhasher uses.
   248  func TestSmhasherPermutation(t *testing.T) {
   249  	if runtime.GOARCH == "wasm" {
   250  		t.Skip("Too slow on wasm")
   251  	}
   252  	if testing.Short() {
   253  		t.Skip("Skipping in short mode")
   254  	}
   255  	permutation(t, []uint32{0, 1, 2, 3, 4, 5, 6, 7}, 8)
   256  	permutation(t, []uint32{0, 1 << 29, 2 << 29, 3 << 29, 4 << 29, 5 << 29, 6 << 29, 7 << 29}, 8)
   257  	permutation(t, []uint32{0, 1}, 20)
   258  	permutation(t, []uint32{0, 1 << 31}, 20)
   259  	permutation(t, []uint32{0, 1, 2, 3, 4, 5, 6, 7, 1 << 29, 2 << 29, 3 << 29, 4 << 29, 5 << 29, 6 << 29, 7 << 29}, 6)
   260  }
   261  func permutation(t *testing.T, s []uint32, n int) {
   262  	b := make([]byte, n*4)
   263  	h := newHashSet()
   264  	genPerm(h, b, s, 0)
   265  	h.check(t)
   266  }
   267  func genPerm(h *hashSet, b []byte, s []uint32, n int) {
   268  	h.addB(b[:n])
   269  	if n == len(b) {
   270  		return
   271  	}
   272  	for _, v := range s {
   273  		b[n] = byte(v)
   274  		b[n+1] = byte(v >> 8)
   275  		b[n+2] = byte(v >> 16)
   276  		b[n+3] = byte(v >> 24)
   277  		genPerm(h, b, s, n+4)
   278  	}
   279  }
   280  
// key is the view of a hash input that the bit-level tests (avalancheTest1
// and windowed) work with: a value whose bits can be cleared, randomized and
// flipped one at a time, and which can hash itself.
type key interface {
	clear()              // set all bits of the key to 0
	random(r *rand.Rand) // set the key to something random drawn from r
	bits() int           // number of bits the key has
	flipBit(i int)       // flip bit i of the key
	hash() uint64        // hash the key in its current state
	name() string        // short label used in error messages
}
   289  
   290  type bytesKey struct {
   291  	b []byte
   292  }
   293  
   294  func (k *bytesKey) clear() {
   295  	for i := range k.b {
   296  		k.b[i] = 0
   297  	}
   298  }
   299  func (k *bytesKey) random(r *rand.Rand) {
   300  	randBytes(r, k.b)
   301  }
   302  func (k *bytesKey) bits() int {
   303  	return len(k.b) * 8
   304  }
   305  func (k *bytesKey) flipBit(i int) {
   306  	k.b[i>>3] ^= byte(1 << uint(i&7))
   307  }
   308  func (k *bytesKey) hash() uint64 {
   309  	return bytesHash(k.b)
   310  }
   311  func (k *bytesKey) name() string {
   312  	return fmt.Sprintf("bytes%d", len(k.b))
   313  }
   314  
   315  // Flipping a single bit of a key should flip each output bit with 50% probability.
   316  func TestSmhasherAvalanche(t *testing.T) {
   317  	if runtime.GOARCH == "wasm" {
   318  		t.Skip("Too slow on wasm")
   319  	}
   320  	if testing.Short() {
   321  		t.Skip("Skipping in short mode")
   322  	}
   323  	avalancheTest1(t, &bytesKey{make([]byte, 2)})
   324  	avalancheTest1(t, &bytesKey{make([]byte, 4)})
   325  	avalancheTest1(t, &bytesKey{make([]byte, 8)})
   326  	avalancheTest1(t, &bytesKey{make([]byte, 16)})
   327  	avalancheTest1(t, &bytesKey{make([]byte, 32)})
   328  	avalancheTest1(t, &bytesKey{make([]byte, 200)})
   329  }
   330  func avalancheTest1(t *testing.T, k key) {
   331  	const REP = 100000
   332  	r := rand.New(rand.NewSource(1234))
   333  	n := k.bits()
   334  
   335  	// grid[i][j] is a count of whether flipping
   336  	// input bit i affects output bit j.
   337  	grid := make([][hashSize]int, n)
   338  
   339  	for z := 0; z < REP; z++ {
   340  		// pick a random key, hash it
   341  		k.random(r)
   342  		h := k.hash()
   343  
   344  		// flip each bit, hash & compare the results
   345  		for i := 0; i < n; i++ {
   346  			k.flipBit(i)
   347  			d := h ^ k.hash()
   348  			k.flipBit(i)
   349  
   350  			// record the effects of that bit flip
   351  			g := &grid[i]
   352  			for j := 0; j < hashSize; j++ {
   353  				g[j] += int(d & 1)
   354  				d >>= 1
   355  			}
   356  		}
   357  	}
   358  
   359  	// Each entry in the grid should be about REP/2.
   360  	// More precisely, we did N = k.bits() * hashSize experiments where
   361  	// each is the sum of REP coin flips. We want to find bounds on the
   362  	// sum of coin flips such that a truly random experiment would have
   363  	// all sums inside those bounds with 99% probability.
   364  	N := n * hashSize
   365  	var c float64
   366  	// find c such that Prob(mean-c*stddev < x < mean+c*stddev)^N > .9999
   367  	for c = 0.0; math.Pow(math.Erf(c/math.Sqrt(2)), float64(N)) < .9999; c += .1 {
   368  	}
   369  	c *= 4.0 // allowed slack - we don't need to be perfectly random
   370  	mean := .5 * REP
   371  	stddev := .5 * math.Sqrt(REP)
   372  	low := int(mean - c*stddev)
   373  	high := int(mean + c*stddev)
   374  	for i := 0; i < n; i++ {
   375  		for j := 0; j < hashSize; j++ {
   376  			x := grid[i][j]
   377  			if x < low || x > high {
   378  				t.Errorf("bad bias for %s bit %d -> bit %d: %d/%d\n", k.name(), i, j, x, REP)
   379  			}
   380  		}
   381  	}
   382  }
   383  
   384  // All bit rotations of a set of distinct keys
   385  func TestSmhasherWindowed(t *testing.T) {
   386  	windowed(t, &bytesKey{make([]byte, 128)})
   387  }
   388  func windowed(t *testing.T, k key) {
   389  	if runtime.GOARCH == "wasm" {
   390  		t.Skip("Too slow on wasm")
   391  	}
   392  	if testing.Short() {
   393  		t.Skip("Skipping in short mode")
   394  	}
   395  	const BITS = 16
   396  
   397  	for r := 0; r < k.bits(); r++ {
   398  		h := newHashSet()
   399  		for i := 0; i < 1<<BITS; i++ {
   400  			k.clear()
   401  			for j := 0; j < BITS; j++ {
   402  				if i>>uint(j)&1 != 0 {
   403  					k.flipBit((j + r) % k.bits())
   404  				}
   405  			}
   406  			h.add(k.hash())
   407  		}
   408  		h.check(t)
   409  	}
   410  }
   411  
   412  // All keys of the form prefix + [A-Za-z0-9]*N + suffix.
   413  func TestSmhasherText(t *testing.T) {
   414  	if testing.Short() {
   415  		t.Skip("Skipping in short mode")
   416  	}
   417  	text(t, "Foo", "Bar")
   418  	text(t, "FooBar", "")
   419  	text(t, "", "FooBar")
   420  }
   421  func text(t *testing.T, prefix, suffix string) {
   422  	const N = 4
   423  	const S = "ABCDEFGHIJKLMNOPQRSTabcdefghijklmnopqrst0123456789"
   424  	const L = len(S)
   425  	b := make([]byte, len(prefix)+N+len(suffix))
   426  	copy(b, prefix)
   427  	copy(b[len(prefix)+N:], suffix)
   428  	h := newHashSet()
   429  	c := b[len(prefix):]
   430  	for i := 0; i < L; i++ {
   431  		c[0] = S[i]
   432  		for j := 0; j < L; j++ {
   433  			c[1] = S[j]
   434  			for k := 0; k < L; k++ {
   435  				c[2] = S[k]
   436  				for x := 0; x < L; x++ {
   437  					c[3] = S[x]
   438  					h.addB(b)
   439  				}
   440  			}
   441  		}
   442  	}
   443  	h.check(t)
   444  }
   445  
   446  // Make sure different seed values generate different hashes.
   447  func TestSmhasherSeed(t *testing.T) {
   448  	if unsafe.Sizeof(uintptr(0)) == 4 {
   449  		t.Skip("32-bit platforms don't have ideal seed-input distributions (see issue 33988)")
   450  	}
   451  	h := newHashSet()
   452  	const N = 100000
   453  	s := "hello"
   454  	for i := 0; i < N; i++ {
   455  		h.addS_seed(s, seed{s: uint64(i + 1)})
   456  		h.addS_seed(s, seed{s: uint64(i+1) << 32}) // make sure high bits are used
   457  	}
   458  	h.check(t)
   459  }
   460  
// seed wraps a 64-bit seed value. addS_seed passes the s field to Hash as
// its seed argument.
type seed struct {
	s uint64
}
   464  
// runtime_fastrand is declared without a body; the go:linkname directive
// below binds it to the runtime package's fastrand symbol. makeSeed uses it
// as its source of 32-bit random values.
//
//go:linkname runtime_fastrand runtime.fastrand
func runtime_fastrand() uint32
   467  
   468  // makeSeed returns a new random seed.
   469  func makeSeed() seed {
   470  	var s1, s2 uint64
   471  	for {
   472  		s1 = uint64(runtime_fastrand())
   473  		s2 = uint64(runtime_fastrand())
   474  		// We use seed 0 to indicate an uninitialized seed/hash,
   475  		// so keep trying until we get a non-zero seed.
   476  		if s1|s2 != 0 {
   477  			break
   478  		}
   479  	}
   480  	return seed{s: s1<<32 + s2}
   481  }