github.com/geraldss/go/src@v0.0.0-20210511222824-ac7d0ebfc235/hash/maphash/smhasher_test.go (about) 1 // Copyright 2019 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 package maphash 6 7 import ( 8 "fmt" 9 "math" 10 "math/rand" 11 "runtime" 12 "strings" 13 "testing" 14 "unsafe" 15 ) 16 17 // Smhasher is a torture test for hash functions. 18 // https://code.google.com/p/smhasher/ 19 // This code is a port of some of the Smhasher tests to Go. 20 21 var fixedSeed = MakeSeed() 22 23 // Sanity checks. 24 // hash should not depend on values outside key. 25 // hash should not depend on alignment. 26 func TestSmhasherSanity(t *testing.T) { 27 r := rand.New(rand.NewSource(1234)) 28 const REP = 10 29 const KEYMAX = 128 30 const PAD = 16 31 const OFFMAX = 16 32 for k := 0; k < REP; k++ { 33 for n := 0; n < KEYMAX; n++ { 34 for i := 0; i < OFFMAX; i++ { 35 var b [KEYMAX + OFFMAX + 2*PAD]byte 36 var c [KEYMAX + OFFMAX + 2*PAD]byte 37 randBytes(r, b[:]) 38 randBytes(r, c[:]) 39 copy(c[PAD+i:PAD+i+n], b[PAD:PAD+n]) 40 if bytesHash(b[PAD:PAD+n]) != bytesHash(c[PAD+i:PAD+i+n]) { 41 t.Errorf("hash depends on bytes outside key") 42 } 43 } 44 } 45 } 46 } 47 48 func bytesHash(b []byte) uint64 { 49 var h Hash 50 h.SetSeed(fixedSeed) 51 h.Write(b) 52 return h.Sum64() 53 } 54 func stringHash(s string) uint64 { 55 var h Hash 56 h.SetSeed(fixedSeed) 57 h.WriteString(s) 58 return h.Sum64() 59 } 60 61 const hashSize = 64 62 63 func randBytes(r *rand.Rand, b []byte) { 64 r.Read(b) // can't fail 65 } 66 67 // A hashSet measures the frequency of hash collisions. 68 type hashSet struct { 69 m map[uint64]struct{} // set of hashes added 70 n int // number of hashes added 71 } 72 73 func newHashSet() *hashSet { 74 return &hashSet{make(map[uint64]struct{}), 0} 75 } 76 func (s *hashSet) add(h uint64) { 77 s.m[h] = struct{}{} 78 s.n++ 79 } 80 func (s *hashSet) addS(x string) { 81 s.add(stringHash(x)) 82 } 83 func (s *hashSet) addB(x []byte) { 84 s.add(bytesHash(x)) 85 } 86 func (s *hashSet) addS_seed(x string, seed Seed) { 87 var h Hash 88 h.SetSeed(seed) 89 h.WriteString(x) 90 s.add(h.Sum64()) 91 } 92 func (s *hashSet) check(t *testing.T) { 93 const SLOP = 10.0 94 collisions := s.n - len(s.m) 95 pairs := int64(s.n) * int64(s.n-1) / 2 96 expected := float64(pairs) / math.Pow(2.0, float64(hashSize)) 97 stddev := math.Sqrt(expected) 98 if float64(collisions) > expected+SLOP*(3*stddev+1) { 99 t.Errorf("unexpected number of collisions: got=%d mean=%f stddev=%f", collisions, expected, stddev) 100 } 101 } 102 103 // a string plus adding zeros must make distinct hashes 104 func TestSmhasherAppendedZeros(t *testing.T) { 105 s := "hello" + strings.Repeat("\x00", 256) 106 h := newHashSet() 107 for i := 0; i <= len(s); i++ { 108 h.addS(s[:i]) 109 } 110 h.check(t) 111 } 112 113 // All 0-3 byte strings have distinct hashes. 114 func TestSmhasherSmallKeys(t *testing.T) { 115 h := newHashSet() 116 var b [3]byte 117 for i := 0; i < 256; i++ { 118 b[0] = byte(i) 119 h.addB(b[:1]) 120 for j := 0; j < 256; j++ { 121 b[1] = byte(j) 122 h.addB(b[:2]) 123 if !testing.Short() { 124 for k := 0; k < 256; k++ { 125 b[2] = byte(k) 126 h.addB(b[:3]) 127 } 128 } 129 } 130 } 131 h.check(t) 132 } 133 134 // Different length strings of all zeros have distinct hashes. 135 func TestSmhasherZeros(t *testing.T) { 136 N := 256 * 1024 137 if testing.Short() { 138 N = 1024 139 } 140 h := newHashSet() 141 b := make([]byte, N) 142 for i := 0; i <= N; i++ { 143 h.addB(b[:i]) 144 } 145 h.check(t) 146 } 147 148 // Strings with up to two nonzero bytes all have distinct hashes. 149 func TestSmhasherTwoNonzero(t *testing.T) { 150 if runtime.GOARCH == "wasm" { 151 t.Skip("Too slow on wasm") 152 } 153 if testing.Short() { 154 t.Skip("Skipping in short mode") 155 } 156 h := newHashSet() 157 for n := 2; n <= 16; n++ { 158 twoNonZero(h, n) 159 } 160 h.check(t) 161 } 162 func twoNonZero(h *hashSet, n int) { 163 b := make([]byte, n) 164 165 // all zero 166 h.addB(b) 167 168 // one non-zero byte 169 for i := 0; i < n; i++ { 170 for x := 1; x < 256; x++ { 171 b[i] = byte(x) 172 h.addB(b) 173 b[i] = 0 174 } 175 } 176 177 // two non-zero bytes 178 for i := 0; i < n; i++ { 179 for x := 1; x < 256; x++ { 180 b[i] = byte(x) 181 for j := i + 1; j < n; j++ { 182 for y := 1; y < 256; y++ { 183 b[j] = byte(y) 184 h.addB(b) 185 b[j] = 0 186 } 187 } 188 b[i] = 0 189 } 190 } 191 } 192 193 // Test strings with repeats, like "abcdabcdabcdabcd..." 194 func TestSmhasherCyclic(t *testing.T) { 195 if testing.Short() { 196 t.Skip("Skipping in short mode") 197 } 198 r := rand.New(rand.NewSource(1234)) 199 const REPEAT = 8 200 const N = 1000000 201 for n := 4; n <= 12; n++ { 202 h := newHashSet() 203 b := make([]byte, REPEAT*n) 204 for i := 0; i < N; i++ { 205 b[0] = byte(i * 79 % 97) 206 b[1] = byte(i * 43 % 137) 207 b[2] = byte(i * 151 % 197) 208 b[3] = byte(i * 199 % 251) 209 randBytes(r, b[4:n]) 210 for j := n; j < n*REPEAT; j++ { 211 b[j] = b[j-n] 212 } 213 h.addB(b) 214 } 215 h.check(t) 216 } 217 } 218 219 // Test strings with only a few bits set 220 func TestSmhasherSparse(t *testing.T) { 221 if runtime.GOARCH == "wasm" { 222 t.Skip("Too slow on wasm") 223 } 224 if testing.Short() { 225 t.Skip("Skipping in short mode") 226 } 227 sparse(t, 32, 6) 228 sparse(t, 40, 6) 229 sparse(t, 48, 5) 230 sparse(t, 56, 5) 231 sparse(t, 64, 5) 232 sparse(t, 96, 4) 233 sparse(t, 256, 3) 234 sparse(t, 2048, 2) 235 } 236 func sparse(t *testing.T, n int, k int) { 237 b := make([]byte, n/8) 238 h := newHashSet() 239 setbits(h, b, 0, k) 240 h.check(t) 241 } 242 243 // set up to k bits at index i and greater 244 func setbits(h *hashSet, b []byte, i int, k int) { 245 h.addB(b) 246 if k == 0 { 247 return 248 } 249 for j := i; j < len(b)*8; j++ { 250 b[j/8] |= byte(1 << uint(j&7)) 251 setbits(h, b, j+1, k-1) 252 b[j/8] &= byte(^(1 << uint(j&7))) 253 } 254 } 255 256 // Test all possible combinations of n blocks from the set s. 257 // "permutation" is a bad name here, but it is what Smhasher uses. 258 func TestSmhasherPermutation(t *testing.T) { 259 if runtime.GOARCH == "wasm" { 260 t.Skip("Too slow on wasm") 261 } 262 if testing.Short() { 263 t.Skip("Skipping in short mode") 264 } 265 permutation(t, []uint32{0, 1, 2, 3, 4, 5, 6, 7}, 8) 266 permutation(t, []uint32{0, 1 << 29, 2 << 29, 3 << 29, 4 << 29, 5 << 29, 6 << 29, 7 << 29}, 8) 267 permutation(t, []uint32{0, 1}, 20) 268 permutation(t, []uint32{0, 1 << 31}, 20) 269 permutation(t, []uint32{0, 1, 2, 3, 4, 5, 6, 7, 1 << 29, 2 << 29, 3 << 29, 4 << 29, 5 << 29, 6 << 29, 7 << 29}, 6) 270 } 271 func permutation(t *testing.T, s []uint32, n int) { 272 b := make([]byte, n*4) 273 h := newHashSet() 274 genPerm(h, b, s, 0) 275 h.check(t) 276 } 277 func genPerm(h *hashSet, b []byte, s []uint32, n int) { 278 h.addB(b[:n]) 279 if n == len(b) { 280 return 281 } 282 for _, v := range s { 283 b[n] = byte(v) 284 b[n+1] = byte(v >> 8) 285 b[n+2] = byte(v >> 16) 286 b[n+3] = byte(v >> 24) 287 genPerm(h, b, s, n+4) 288 } 289 } 290 291 type key interface { 292 clear() // set bits all to 0 293 random(r *rand.Rand) // set key to something random 294 bits() int // how many bits key has 295 flipBit(i int) // flip bit i of the key 296 hash() uint64 // hash the key 297 name() string // for error reporting 298 } 299 300 type bytesKey struct { 301 b []byte 302 } 303 304 func (k *bytesKey) clear() { 305 for i := range k.b { 306 k.b[i] = 0 307 } 308 } 309 func (k *bytesKey) random(r *rand.Rand) { 310 randBytes(r, k.b) 311 } 312 func (k *bytesKey) bits() int { 313 return len(k.b) * 8 314 } 315 func (k *bytesKey) flipBit(i int) { 316 k.b[i>>3] ^= byte(1 << uint(i&7)) 317 } 318 func (k *bytesKey) hash() uint64 { 319 return bytesHash(k.b) 320 } 321 func (k *bytesKey) name() string { 322 return fmt.Sprintf("bytes%d", len(k.b)) 323 } 324 325 // Flipping a single bit of a key should flip each output bit with 50% probability. 326 func TestSmhasherAvalanche(t *testing.T) { 327 if runtime.GOARCH == "wasm" { 328 t.Skip("Too slow on wasm") 329 } 330 if testing.Short() { 331 t.Skip("Skipping in short mode") 332 } 333 avalancheTest1(t, &bytesKey{make([]byte, 2)}) 334 avalancheTest1(t, &bytesKey{make([]byte, 4)}) 335 avalancheTest1(t, &bytesKey{make([]byte, 8)}) 336 avalancheTest1(t, &bytesKey{make([]byte, 16)}) 337 avalancheTest1(t, &bytesKey{make([]byte, 32)}) 338 avalancheTest1(t, &bytesKey{make([]byte, 200)}) 339 } 340 func avalancheTest1(t *testing.T, k key) { 341 const REP = 100000 342 r := rand.New(rand.NewSource(1234)) 343 n := k.bits() 344 345 // grid[i][j] is a count of whether flipping 346 // input bit i affects output bit j. 347 grid := make([][hashSize]int, n) 348 349 for z := 0; z < REP; z++ { 350 // pick a random key, hash it 351 k.random(r) 352 h := k.hash() 353 354 // flip each bit, hash & compare the results 355 for i := 0; i < n; i++ { 356 k.flipBit(i) 357 d := h ^ k.hash() 358 k.flipBit(i) 359 360 // record the effects of that bit flip 361 g := &grid[i] 362 for j := 0; j < hashSize; j++ { 363 g[j] += int(d & 1) 364 d >>= 1 365 } 366 } 367 } 368 369 // Each entry in the grid should be about REP/2. 370 // More precisely, we did N = k.bits() * hashSize experiments where 371 // each is the sum of REP coin flips. We want to find bounds on the 372 // sum of coin flips such that a truly random experiment would have 373 // all sums inside those bounds with 99% probability. 374 N := n * hashSize 375 var c float64 376 // find c such that Prob(mean-c*stddev < x < mean+c*stddev)^N > .9999 377 for c = 0.0; math.Pow(math.Erf(c/math.Sqrt(2)), float64(N)) < .9999; c += .1 { 378 } 379 c *= 4.0 // allowed slack - we don't need to be perfectly random 380 mean := .5 * REP 381 stddev := .5 * math.Sqrt(REP) 382 low := int(mean - c*stddev) 383 high := int(mean + c*stddev) 384 for i := 0; i < n; i++ { 385 for j := 0; j < hashSize; j++ { 386 x := grid[i][j] 387 if x < low || x > high { 388 t.Errorf("bad bias for %s bit %d -> bit %d: %d/%d\n", k.name(), i, j, x, REP) 389 } 390 } 391 } 392 } 393 394 // All bit rotations of a set of distinct keys 395 func TestSmhasherWindowed(t *testing.T) { 396 windowed(t, &bytesKey{make([]byte, 128)}) 397 } 398 func windowed(t *testing.T, k key) { 399 if runtime.GOARCH == "wasm" { 400 t.Skip("Too slow on wasm") 401 } 402 if testing.Short() { 403 t.Skip("Skipping in short mode") 404 } 405 const BITS = 16 406 407 for r := 0; r < k.bits(); r++ { 408 h := newHashSet() 409 for i := 0; i < 1<<BITS; i++ { 410 k.clear() 411 for j := 0; j < BITS; j++ { 412 if i>>uint(j)&1 != 0 { 413 k.flipBit((j + r) % k.bits()) 414 } 415 } 416 h.add(k.hash()) 417 } 418 h.check(t) 419 } 420 } 421 422 // All keys of the form prefix + [A-Za-z0-9]*N + suffix. 423 func TestSmhasherText(t *testing.T) { 424 if testing.Short() { 425 t.Skip("Skipping in short mode") 426 } 427 text(t, "Foo", "Bar") 428 text(t, "FooBar", "") 429 text(t, "", "FooBar") 430 } 431 func text(t *testing.T, prefix, suffix string) { 432 const N = 4 433 const S = "ABCDEFGHIJKLMNOPQRSTabcdefghijklmnopqrst0123456789" 434 const L = len(S) 435 b := make([]byte, len(prefix)+N+len(suffix)) 436 copy(b, prefix) 437 copy(b[len(prefix)+N:], suffix) 438 h := newHashSet() 439 c := b[len(prefix):] 440 for i := 0; i < L; i++ { 441 c[0] = S[i] 442 for j := 0; j < L; j++ { 443 c[1] = S[j] 444 for k := 0; k < L; k++ { 445 c[2] = S[k] 446 for x := 0; x < L; x++ { 447 c[3] = S[x] 448 h.addB(b) 449 } 450 } 451 } 452 } 453 h.check(t) 454 } 455 456 // Make sure different seed values generate different hashes. 457 func TestSmhasherSeed(t *testing.T) { 458 if unsafe.Sizeof(uintptr(0)) == 4 { 459 t.Skip("32-bit platforms don't have ideal seed-input distributions (see issue 33988)") 460 } 461 h := newHashSet() 462 const N = 100000 463 s := "hello" 464 for i := 0; i < N; i++ { 465 h.addS_seed(s, Seed{s: uint64(i + 1)}) 466 h.addS_seed(s, Seed{s: uint64(i+1) << 32}) // make sure high bits are used 467 } 468 h.check(t) 469 }