github.com/zhangyunhao116/wyhash@v0.4.1-0.20220217162229-7d42996fa899/smhasher_test.go (about) 1 package wyhash 2 3 // From src/hash/maphash/smhasher_test.go 4 5 import ( 6 "fmt" 7 "math" 8 "math/rand" 9 "runtime" 10 "strings" 11 "testing" 12 "unsafe" 13 ) 14 15 // Smhasher is a torture test for hash functions. 16 // https://code.google.com/p/smhasher/ 17 // This code is a port of some of the Smhasher tests to Go. 18 19 var fixedSeed = makeSeed() 20 21 // Sanity checks. 22 // hash should not depend on values outside key. 23 // hash should not depend on alignment. 24 func TestSmhasherSanity(t *testing.T) { 25 r := rand.New(rand.NewSource(1234)) 26 const REP = 10 27 const KEYMAX = 128 28 const PAD = 16 29 const OFFMAX = 16 30 for k := 0; k < REP; k++ { 31 for n := 0; n < KEYMAX; n++ { 32 for i := 0; i < OFFMAX; i++ { 33 var b [KEYMAX + OFFMAX + 2*PAD]byte 34 var c [KEYMAX + OFFMAX + 2*PAD]byte 35 randBytes(r, b[:]) 36 randBytes(r, c[:]) 37 copy(c[PAD+i:PAD+i+n], b[PAD:PAD+n]) 38 if bytesHash(b[PAD:PAD+n]) != bytesHash(c[PAD+i:PAD+i+n]) { 39 t.Errorf("hash depends on bytes outside key") 40 } 41 } 42 } 43 } 44 } 45 46 func bytesHash(b []byte) uint64 { 47 return Sum64(b) 48 } 49 func stringHash(s string) uint64 { 50 return Sum64String(s) 51 } 52 53 const hashSize = 64 54 55 func randBytes(r *rand.Rand, b []byte) { 56 r.Read(b) // can't fail 57 } 58 59 // A hashSet measures the frequency of hash collisions. 60 type hashSet struct { 61 m map[uint64]struct{} // set of hashes added 62 n int // number of hashes added 63 } 64 65 func newHashSet() *hashSet { 66 return &hashSet{make(map[uint64]struct{}), 0} 67 } 68 func (s *hashSet) add(h uint64) { 69 s.m[h] = struct{}{} 70 s.n++ 71 } 72 func (s *hashSet) addS(x string) { 73 s.add(stringHash(x)) 74 } 75 76 func (s *hashSet) addB(x []byte) { 77 s.add(bytesHash(x)) 78 } 79 80 func (s *hashSet) addS_seed(x string, seed seed) { 81 s.add(Sum64StringWithSeed(x, seed.s)) 82 } 83 84 func (s *hashSet) check(t *testing.T) { 85 const SLOP = 10.0 86 collisions := s.n - len(s.m) 87 pairs := int64(s.n) * int64(s.n-1) / 2 88 expected := float64(pairs) / math.Pow(2.0, float64(hashSize)) 89 stddev := math.Sqrt(expected) 90 if float64(collisions) > expected+SLOP*(3*stddev+1) { 91 t.Errorf("unexpected number of collisions: got=%d mean=%f stddev=%f", collisions, expected, stddev) 92 } 93 } 94 95 // a string plus adding zeros must make distinct hashes 96 func TestSmhasherAppendedZeros(t *testing.T) { 97 s := "hello" + strings.Repeat("\x00", 256) 98 h := newHashSet() 99 for i := 0; i <= len(s); i++ { 100 h.addS(s[:i]) 101 } 102 h.check(t) 103 } 104 105 // All 0-3 byte strings have distinct hashes. 106 func TestSmhasherSmallKeys(t *testing.T) { 107 h := newHashSet() 108 var b [3]byte 109 for i := 0; i < 256; i++ { 110 b[0] = byte(i) 111 h.addB(b[:1]) 112 for j := 0; j < 256; j++ { 113 b[1] = byte(j) 114 h.addB(b[:2]) 115 if !testing.Short() { 116 for k := 0; k < 256; k++ { 117 b[2] = byte(k) 118 h.addB(b[:3]) 119 } 120 } 121 } 122 } 123 h.check(t) 124 } 125 126 // Different length strings of all zeros have distinct hashes. 127 func TestSmhasherZeros(t *testing.T) { 128 N := 256 * 1024 129 if testing.Short() { 130 N = 1024 131 } 132 h := newHashSet() 133 b := make([]byte, N) 134 for i := 0; i <= N; i++ { 135 h.addB(b[:i]) 136 } 137 h.check(t) 138 } 139 140 // Strings with up to two nonzero bytes all have distinct hashes. 141 func TestSmhasherTwoNonzero(t *testing.T) { 142 if runtime.GOARCH == "wasm" { 143 t.Skip("Too slow on wasm") 144 } 145 if testing.Short() { 146 t.Skip("Skipping in short mode") 147 } 148 h := newHashSet() 149 for n := 2; n <= 16; n++ { 150 twoNonZero(h, n) 151 } 152 h.check(t) 153 } 154 func twoNonZero(h *hashSet, n int) { 155 b := make([]byte, n) 156 157 // all zero 158 h.addB(b) 159 160 // one non-zero byte 161 for i := 0; i < n; i++ { 162 for x := 1; x < 256; x++ { 163 b[i] = byte(x) 164 h.addB(b) 165 b[i] = 0 166 } 167 } 168 169 // two non-zero bytes 170 for i := 0; i < n; i++ { 171 for x := 1; x < 256; x++ { 172 b[i] = byte(x) 173 for j := i + 1; j < n; j++ { 174 for y := 1; y < 256; y++ { 175 b[j] = byte(y) 176 h.addB(b) 177 b[j] = 0 178 } 179 } 180 b[i] = 0 181 } 182 } 183 } 184 185 // Test strings with repeats, like "abcdabcdabcdabcd..." 186 func TestSmhasherCyclic(t *testing.T) { 187 if testing.Short() { 188 t.Skip("Skipping in short mode") 189 } 190 r := rand.New(rand.NewSource(1234)) 191 const REPEAT = 8 192 const N = 1000000 193 for n := 4; n <= 12; n++ { 194 h := newHashSet() 195 b := make([]byte, REPEAT*n) 196 for i := 0; i < N; i++ { 197 b[0] = byte(i * 79 % 97) 198 b[1] = byte(i * 43 % 137) 199 b[2] = byte(i * 151 % 197) 200 b[3] = byte(i * 199 % 251) 201 randBytes(r, b[4:n]) 202 for j := n; j < n*REPEAT; j++ { 203 b[j] = b[j-n] 204 } 205 h.addB(b) 206 } 207 h.check(t) 208 } 209 } 210 211 // Test strings with only a few bits set 212 func TestSmhasherSparse(t *testing.T) { 213 if runtime.GOARCH == "wasm" { 214 t.Skip("Too slow on wasm") 215 } 216 if testing.Short() { 217 t.Skip("Skipping in short mode") 218 } 219 sparse(t, 32, 6) 220 sparse(t, 40, 6) 221 sparse(t, 48, 5) 222 sparse(t, 56, 5) 223 sparse(t, 64, 5) 224 sparse(t, 96, 4) 225 sparse(t, 256, 3) 226 sparse(t, 2048, 2) 227 } 228 func sparse(t *testing.T, n int, k int) { 229 b := make([]byte, n/8) 230 h := newHashSet() 231 setbits(h, b, 0, k) 232 h.check(t) 233 } 234 235 // set up to k bits at index i and greater 236 func setbits(h *hashSet, b []byte, i int, k int) { 237 h.addB(b) 238 if k == 0 { 239 return 240 } 241 for j := i; j < len(b)*8; j++ { 242 b[j/8] |= byte(1 << uint(j&7)) 243 setbits(h, b, j+1, k-1) 244 b[j/8] &= byte(^(1 << uint(j&7))) 245 } 246 } 247 248 // Test all possible combinations of n blocks from the set s. 249 // "permutation" is a bad name here, but it is what Smhasher uses. 250 func TestSmhasherPermutation(t *testing.T) { 251 if runtime.GOARCH == "wasm" { 252 t.Skip("Too slow on wasm") 253 } 254 if testing.Short() { 255 t.Skip("Skipping in short mode") 256 } 257 permutation(t, []uint32{0, 1, 2, 3, 4, 5, 6, 7}, 8) 258 permutation(t, []uint32{0, 1 << 29, 2 << 29, 3 << 29, 4 << 29, 5 << 29, 6 << 29, 7 << 29}, 8) 259 permutation(t, []uint32{0, 1}, 20) 260 permutation(t, []uint32{0, 1 << 31}, 20) 261 permutation(t, []uint32{0, 1, 2, 3, 4, 5, 6, 7, 1 << 29, 2 << 29, 3 << 29, 4 << 29, 5 << 29, 6 << 29, 7 << 29}, 6) 262 } 263 func permutation(t *testing.T, s []uint32, n int) { 264 b := make([]byte, n*4) 265 h := newHashSet() 266 genPerm(h, b, s, 0) 267 h.check(t) 268 } 269 func genPerm(h *hashSet, b []byte, s []uint32, n int) { 270 h.addB(b[:n]) 271 if n == len(b) { 272 return 273 } 274 for _, v := range s { 275 b[n] = byte(v) 276 b[n+1] = byte(v >> 8) 277 b[n+2] = byte(v >> 16) 278 b[n+3] = byte(v >> 24) 279 genPerm(h, b, s, n+4) 280 } 281 } 282 283 type key interface { 284 clear() // set bits all to 0 285 random(r *rand.Rand) // set key to something random 286 bits() int // how many bits key has 287 flipBit(i int) // flip bit i of the key 288 hash() uint64 // hash the key 289 name() string // for error reporting 290 } 291 292 type bytesKey struct { 293 b []byte 294 } 295 296 func (k *bytesKey) clear() { 297 for i := range k.b { 298 k.b[i] = 0 299 } 300 } 301 func (k *bytesKey) random(r *rand.Rand) { 302 randBytes(r, k.b) 303 } 304 func (k *bytesKey) bits() int { 305 return len(k.b) * 8 306 } 307 func (k *bytesKey) flipBit(i int) { 308 k.b[i>>3] ^= byte(1 << uint(i&7)) 309 } 310 func (k *bytesKey) hash() uint64 { 311 return bytesHash(k.b) 312 } 313 func (k *bytesKey) name() string { 314 return fmt.Sprintf("bytes%d", len(k.b)) 315 } 316 317 // Flipping a single bit of a key should flip each output bit with 50% probability. 318 func TestSmhasherAvalanche(t *testing.T) { 319 if runtime.GOARCH == "wasm" { 320 t.Skip("Too slow on wasm") 321 } 322 if testing.Short() { 323 t.Skip("Skipping in short mode") 324 } 325 avalancheTest1(t, &bytesKey{make([]byte, 2)}) 326 avalancheTest1(t, &bytesKey{make([]byte, 4)}) 327 avalancheTest1(t, &bytesKey{make([]byte, 8)}) 328 avalancheTest1(t, &bytesKey{make([]byte, 16)}) 329 avalancheTest1(t, &bytesKey{make([]byte, 32)}) 330 avalancheTest1(t, &bytesKey{make([]byte, 200)}) 331 } 332 func avalancheTest1(t *testing.T, k key) { 333 const REP = 100000 334 r := rand.New(rand.NewSource(1234)) 335 n := k.bits() 336 337 // grid[i][j] is a count of whether flipping 338 // input bit i affects output bit j. 339 grid := make([][hashSize]int, n) 340 341 for z := 0; z < REP; z++ { 342 // pick a random key, hash it 343 k.random(r) 344 h := k.hash() 345 346 // flip each bit, hash & compare the results 347 for i := 0; i < n; i++ { 348 k.flipBit(i) 349 d := h ^ k.hash() 350 k.flipBit(i) 351 352 // record the effects of that bit flip 353 g := &grid[i] 354 for j := 0; j < hashSize; j++ { 355 g[j] += int(d & 1) 356 d >>= 1 357 } 358 } 359 } 360 361 // Each entry in the grid should be about REP/2. 362 // More precisely, we did N = k.bits() * hashSize experiments where 363 // each is the sum of REP coin flips. We want to find bounds on the 364 // sum of coin flips such that a truly random experiment would have 365 // all sums inside those bounds with 99% probability. 366 N := n * hashSize 367 var c float64 368 // find c such that Prob(mean-c*stddev < x < mean+c*stddev)^N > .9999 369 for c = 0.0; math.Pow(math.Erf(c/math.Sqrt(2)), float64(N)) < .9999; c += .1 { 370 } 371 c *= 4.0 // allowed slack - we don't need to be perfectly random 372 mean := .5 * REP 373 stddev := .5 * math.Sqrt(REP) 374 low := int(mean - c*stddev) 375 high := int(mean + c*stddev) 376 for i := 0; i < n; i++ { 377 for j := 0; j < hashSize; j++ { 378 x := grid[i][j] 379 if x < low || x > high { 380 t.Errorf("bad bias for %s bit %d -> bit %d: %d/%d\n", k.name(), i, j, x, REP) 381 } 382 } 383 } 384 } 385 386 // All bit rotations of a set of distinct keys 387 func TestSmhasherWindowed(t *testing.T) { 388 windowed(t, &bytesKey{make([]byte, 128)}) 389 } 390 func windowed(t *testing.T, k key) { 391 if runtime.GOARCH == "wasm" { 392 t.Skip("Too slow on wasm") 393 } 394 if testing.Short() { 395 t.Skip("Skipping in short mode") 396 } 397 const BITS = 16 398 399 for r := 0; r < k.bits(); r++ { 400 h := newHashSet() 401 for i := 0; i < 1<<BITS; i++ { 402 k.clear() 403 for j := 0; j < BITS; j++ { 404 if i>>uint(j)&1 != 0 { 405 k.flipBit((j + r) % k.bits()) 406 } 407 } 408 h.add(k.hash()) 409 } 410 h.check(t) 411 } 412 } 413 414 // All keys of the form prefix + [A-Za-z0-9]*N + suffix. 415 func TestSmhasherText(t *testing.T) { 416 if testing.Short() { 417 t.Skip("Skipping in short mode") 418 } 419 text(t, "Foo", "Bar") 420 text(t, "FooBar", "") 421 text(t, "", "FooBar") 422 } 423 func text(t *testing.T, prefix, suffix string) { 424 const N = 4 425 const S = "ABCDEFGHIJKLMNOPQRSTabcdefghijklmnopqrst0123456789" 426 const L = len(S) 427 b := make([]byte, len(prefix)+N+len(suffix)) 428 copy(b, prefix) 429 copy(b[len(prefix)+N:], suffix) 430 h := newHashSet() 431 c := b[len(prefix):] 432 for i := 0; i < L; i++ { 433 c[0] = S[i] 434 for j := 0; j < L; j++ { 435 c[1] = S[j] 436 for k := 0; k < L; k++ { 437 c[2] = S[k] 438 for x := 0; x < L; x++ { 439 c[3] = S[x] 440 h.addB(b) 441 } 442 } 443 } 444 } 445 h.check(t) 446 } 447 448 // Make sure different seed values generate different hashes. 449 func TestSmhasherSeed(t *testing.T) { 450 if unsafe.Sizeof(uintptr(0)) == 4 { 451 t.Skip("32-bit platforms don't have ideal seed-input distributions (see issue 33988)") 452 } 453 h := newHashSet() 454 const N = 100000 455 s := "hello" 456 for i := 0; i < N; i++ { 457 h.addS_seed(s, seed{s: uint64(i + 1)}) 458 h.addS_seed(s, seed{s: uint64(i+1) << 32}) // make sure high bits are used 459 } 460 h.check(t) 461 } 462 463 type seed struct { 464 s uint64 465 } 466 467 // makeSeed returns a new random seed. 468 func makeSeed() seed { 469 var s1, s2 uint64 470 for { 471 s1 = uint64(runtime_fastrand()) 472 s2 = uint64(runtime_fastrand()) 473 // We use seed 0 to indicate an uninitialized seed/hash, 474 // so keep trying until we get a non-zero seed. 475 if s1|s2 != 0 { 476 break 477 } 478 } 479 return seed{s: s1<<32 + s2} 480 }