github.com/rolandhe/saber@v0.0.4/hash/cityhash.go (about)

     1  // Golang concurrent tools like java juc.
     2  //
     3  // Copyright 2023 The saber Authors. All rights reserved.
     4  
     5  // Package hash, 实现常见的hash算法,当前实现 cityhash 算法,未来会扩展其他hash 算法。
     6  // 当前cityhash 完全移植c++版本的cityhash算法, 由于没有找到对_mm_crc32_u64的支持,所以不支持CityHashCrc256, 后续找到办法后会继续支持
     7  package hash
     8  
     9  import (
    10  	"github.com/rolandhe/saber/utils/strutil"
    11  	"math/bits"
    12  	"unsafe"
    13  )
    14  
    15  // 判断当前系统的大小端属性
    16  var littleEndian bool
    17  
    18  const k0 uint64 = 0xc3a5c85c97cb3127
    19  const k1 uint64 = 0xb492b66fbe98f273
    20  const k2 uint64 = 0x9ae16a3b2f90404f
    21  const kMul uint64 = 0x9ddfea08eb382d69
    22  
    23  // Magic numbers for 32-bit hashing.  Copied from Murmur3.
    24  const c1 uint32 = 0xcc9e2d51
    25  const c2 uint32 = 0x1b873593
    26  
// init records the host byte order once at package load by storing the result
// of IsLittleEndian in littleEndian.
func init() {
	littleEndian = IsLittleEndian()
}
    30  
    31  // Uint128 描述 128位无符号整数,它本质上由两个 uint64组成
    32  type Uint128 struct {
    33  	low  uint64
    34  	high uint64
    35  }
    36  
    37  // MakeUint128 构建一个无符号128位整数对象,需要低位、高位 uint64两个参数
    38  func MakeUint128(low uint64, high uint64) *Uint128 {
    39  	return &Uint128{
    40  		low, high,
    41  	}
    42  }
    43  
    44  // IsLittleEndian 判断当前系统的字节序是否是小端
    45  func IsLittleEndian() bool {
    46  	n := 0x1234
    47  	f := *((*byte)(unsafe.Pointer(&n)))
    48  	return (f ^ 0x34) == 0
    49  }
    50  
    51  func fetch64(data []byte) uint64 {
    52  	v := uint64(data[0])
    53  	v |= uint64(data[1]) << 8
    54  	v |= uint64(data[2]) << 16
    55  	v |= uint64(data[3]) << 24
    56  	v |= uint64(data[4]) << 32
    57  	v |= uint64(data[5]) << 40
    58  	v |= uint64(data[6]) << 48
    59  	v |= uint64(data[7]) << 56
    60  	if littleEndian {
    61  		return v
    62  	}
    63  
    64  	return bits.ReverseBytes64(v)
    65  }
    66  
    67  func fetch32(data []byte) uint32 {
    68  	v := uint32(data[0])
    69  	v |= uint32(data[1]) << 8
    70  	v |= uint32(data[2]) << 16
    71  	v |= uint32(data[3]) << 24
    72  
    73  	if littleEndian {
    74  		return v
    75  	}
    76  
    77  	return bits.ReverseBytes32(v)
    78  }
    79  
    80  func fmix(h uint32) uint32 {
    81  	h ^= h >> 16
    82  	h *= 0x85ebca6b
    83  	h ^= h >> 13
    84  	h *= 0xc2b2ae35
    85  	h ^= h >> 16
    86  	return h
    87  }
    88  
    89  func rotate32(val uint32, shift int) uint32 {
    90  	// Avoid shifting by 32: doing so yields an undefined result.
    91  	if shift == 0 {
    92  		return val
    93  	}
    94  	return (val >> shift) | (val << (32 - shift))
    95  }
    96  
    97  func mur(a uint32, h uint32) uint32 {
    98  	// Helper from Murmur3 for combining two 32-bit values.
    99  	a *= c1
   100  	a = rotate32(a, 17)
   101  	a *= c2
   102  	h ^= a
   103  	h = rotate32(h, 19)
   104  	return h*5 + 0xe6546b64
   105  }
   106  
   107  func hash32Len13to24(s []byte, len uint) uint32 {
   108  	a := fetch32(s[len>>1-4:])
   109  	b := fetch32(s[4:])
   110  	c := fetch32(s[len-8:])
   111  	d := fetch32(s[len>>1:])
   112  	e := fetch32(s)
   113  	f := fetch32(s[len-4:])
   114  	h := uint32(len)
   115  	return fmix(mur(f, mur(e, mur(d, mur(c, mur(b, mur(a, h)))))))
   116  }
   117  
   118  func hash32Len0to4(s []byte, len uint) uint32 {
   119  	b := uint32(0)
   120  	c := uint32(9)
   121  	for i := uint(0); i < len; i++ {
   122  		v := int8(s[i])
   123  		b = b*c1 + uint32(v)
   124  		c ^= b
   125  	}
   126  	return fmix(mur(b, mur(uint32(len), c)))
   127  }
   128  
   129  func hash32Len5to12(s []byte, len uint) uint32 {
   130  	a := uint32(len)
   131  	b := a * 5
   132  	c := uint32(9)
   133  	d := b
   134  	a += fetch32(s)
   135  	b += fetch32(s[len-4:])
   136  	pos := (len >> 1) & 4
   137  	c += fetch32(s[pos:])
   138  	return fmix(mur(c, mur(b, mur(a, d))))
   139  }
   140  
   141  func CityHash32String(str string) uint32 {
   142  	s := strutil.DetachBytesString(str)
   143  	length := uint(len(str))
   144  	return CityHash32(s, length)
   145  }
   146  
   147  // CityHash32 产生32位的hash
   148  func CityHash32(s []byte, length uint) uint32 {
   149  	if length <= 24 {
   150  		if length <= 12 {
   151  			if length <= 4 {
   152  				return hash32Len0to4(s, length)
   153  			} else {
   154  				return hash32Len5to12(s, length)
   155  			}
   156  		} else {
   157  			return hash32Len13to24(s, length)
   158  		}
   159  	}
   160  
   161  	// length > 24
   162  	h := uint32(length)
   163  	g := c1 * h
   164  	f := g
   165  	a0 := rotate32(fetch32(s[length-4:])*c1, 17) * c2
   166  	a1 := rotate32(fetch32(s[length-8:])*c1, 17) * c2
   167  	a2 := rotate32(fetch32(s[length-16:])*c1, 17) * c2
   168  	a3 := rotate32(fetch32(s[length-12:])*c1, 17) * c2
   169  	a4 := rotate32(fetch32(s[length-20:])*c1, 17) * c2
   170  	h ^= a0
   171  	h = rotate32(h, 19)
   172  	h = h*5 + 0xe6546b64
   173  	h ^= a2
   174  	h = rotate32(h, 19)
   175  	h = h*5 + 0xe6546b64
   176  	g ^= a1
   177  	g = rotate32(g, 19)
   178  	g = g*5 + 0xe6546b64
   179  	g ^= a3
   180  	g = rotate32(g, 19)
   181  	g = g*5 + 0xe6546b64
   182  	f += a4
   183  	f = rotate32(f, 19)
   184  	f = f*5 + 0xe6546b64
   185  	iters := (length - 1) / 20
   186  	for {
   187  		a0 = rotate32(fetch32(s)*c1, 17) * c2
   188  		a1 = fetch32(s[4:])
   189  		a2 = rotate32(fetch32(s[8:])*c1, 17) * c2
   190  		a3 = rotate32(fetch32(s[12:])*c1, 17) * c2
   191  		a4 = fetch32(s[16:])
   192  		h ^= a0
   193  		h = rotate32(h, 18)
   194  		h = h*5 + 0xe6546b64
   195  		f += a1
   196  		f = rotate32(f, 19)
   197  		f = f * c1
   198  		g += a2
   199  		g = rotate32(g, 18)
   200  		g = g*5 + 0xe6546b64
   201  		h ^= a3 + a1
   202  		h = rotate32(h, 19)
   203  		h = h*5 + 0xe6546b64
   204  		g ^= a4
   205  		g = bits.ReverseBytes32(g) * 5
   206  		h += a4 * 5
   207  		h = bits.ReverseBytes32(h)
   208  		f += a0
   209  		f, h, g = g, f, h
   210  		s = s[20:]
   211  		iters--
   212  		if iters == 0 {
   213  			break
   214  		}
   215  	}
   216  
   217  	g = rotate32(g, 11) * c1
   218  	g = rotate32(g, 17) * c1
   219  	f = rotate32(f, 11) * c1
   220  	f = rotate32(f, 17) * c1
   221  	h = rotate32(h+g, 19)
   222  	h = h*5 + 0xe6546b64
   223  	h = rotate32(h, 17) * c1
   224  	h = rotate32(h+f, 19)
   225  	h = h*5 + 0xe6546b64
   226  	h = rotate32(h, 17) * c1
   227  	return h
   228  }
   229  
   230  func rotate64(val uint64, shift int) uint64 {
   231  	// Avoid shifting by 64: doing so yields an undefined result.
   232  	if shift == 0 {
   233  		return val
   234  	}
   235  	return (val >> shift) | (val << (64 - shift))
   236  }
   237  
   238  func shiftMix(val uint64) uint64 {
   239  	return val ^ (val >> 47)
   240  }
   241  
   242  func hash128to64(u uint64, v uint64) uint64 {
   243  	// Murmur-inspired hashing.
   244  	a := (u ^ v) * kMul
   245  	a ^= a >> 47
   246  	b := (v ^ a) * kMul
   247  	b ^= b >> 47
   248  	b *= kMul
   249  	return b
   250  }
   251  
   252  func hashLen16(u uint64, v uint64) uint64 {
   253  	return hash128to64(u, v)
   254  }
   255  
   256  func hashLen16WithMul(u uint64, v uint64, mul uint64) uint64 {
   257  	// Murmur-inspired hashing.
   258  	a := (u ^ v) * mul
   259  	a ^= a >> 47
   260  	b := (v ^ a) * mul
   261  	b ^= b >> 47
   262  	b *= mul
   263  	return b
   264  }
   265  
   266  func hashLen0to16(s []byte, len uint) uint64 {
   267  	if len >= 8 {
   268  		mul := k2 + uint64(len)*2
   269  		a := fetch64(s) + k2
   270  		b := fetch64(s[len-8:])
   271  		c := rotate64(b, 37)*mul + a
   272  		d := (rotate64(a, 25) + b) * mul
   273  		return hashLen16WithMul(c, d, mul)
   274  	}
   275  	if len >= 4 {
   276  		mul := k2 + uint64(len)*2
   277  		a := uint64(fetch32(s))
   278  		return hashLen16WithMul(uint64(len)+(a<<3), uint64(fetch32(s[len-4:])), mul)
   279  	}
   280  	if len > 0 {
   281  		a := s[0]
   282  		b := s[len>>1]
   283  		c := s[len-1]
   284  		y := uint32(a) + (uint32(b) << 8)
   285  		z := uint32(len) + (uint32(c) << 2)
   286  		return shiftMix(uint64(y)*k2^uint64(z)*k0) * k2
   287  	}
   288  	return k2
   289  }
   290  
   291  // This probably works well for 16-byte strings as well, but it may be over kill
   292  // in that case.
   293  func hashLen17to32(s []byte, len uint) uint64 {
   294  	mul := k2 + uint64(len)*2
   295  	a := fetch64(s) * k1
   296  	b := fetch64(s[8:])
   297  	c := fetch64(s[len-8:]) * mul
   298  	d := fetch64(s[len-16:]) * k2
   299  	return hashLen16WithMul(rotate64(a+b, 43)+rotate64(c, 30)+d,
   300  		a+rotate64(b+k2, 18)+c, mul)
   301  }
   302  
   303  // Return a 16-byte hash for 48 bytes.  Quick and dirty.
   304  // Callers do best to use "random-looking" values for a and b.
   305  func weakHashLen32WithSeedsBaseNumber(
   306  	w uint64, x uint64, y uint64, z uint64, a uint64, b uint64) *Uint128 {
   307  	a += w
   308  	b = rotate64(b+a+z, 21)
   309  	c := a
   310  	a += x
   311  	a += y
   312  	b += rotate64(a, 44)
   313  	return MakeUint128(a+z, b+c)
   314  }
   315  
   316  // Return a 16-byte hash for s[0] ... s[31], a, and b.  Quick and dirty.
   317  func weakHashLen32WithSeeds(
   318  	s []byte, a uint64, b uint64) *Uint128 {
   319  	return weakHashLen32WithSeedsBaseNumber(fetch64(s),
   320  		fetch64(s[8:]),
   321  		fetch64(s[16:]),
   322  		fetch64(s[24:]),
   323  		a,
   324  		b)
   325  }
   326  
   327  func hashLen33to64(s []byte, length uint) uint64 {
   328  	mul := k2 + uint64(length)*2
   329  	a := fetch64(s) * k2
   330  	b := fetch64(s[8:])
   331  	c := fetch64(s[length-24:])
   332  	d := fetch64(s[length-32:])
   333  	e := fetch64(s[16:]) * k2
   334  	f := fetch64(s[24:]) * 9
   335  	g := fetch64(s[length-8:])
   336  	h := fetch64(s[length-16:]) * mul
   337  	u := rotate64(a+g, 43) + (rotate64(b, 30)+c)*9
   338  	v := ((a + g) ^ d) + f + 1
   339  	w := bits.ReverseBytes64((u+v)*mul) + h
   340  	x := rotate64(e+f, 42) + c
   341  	y := (bits.ReverseBytes64((v+w)*mul) + g) * mul
   342  	z := e + f + c
   343  	a = bits.ReverseBytes64((x+z)*mul+y) + b
   344  	b = shiftMix((z+a)*mul+d+h) * mul
   345  	return b + x
   346  }
   347  
   348  func CityHash64String(str string) uint64 {
   349  	s := strutil.DetachBytesString(str)
   350  	length := uint(len(str))
   351  	return CityHash64(s, length)
   352  }
   353  
   354  // CityHash64 产生64位的hash
   355  func CityHash64(s []byte, length uint) uint64 {
   356  	if length <= 32 {
   357  		if length <= 16 {
   358  			return hashLen0to16(s, length)
   359  		} else {
   360  			return hashLen17to32(s, length)
   361  		}
   362  	} else if length <= 64 {
   363  		return hashLen33to64(s, length)
   364  	}
   365  
   366  	// For strings over 64 bytes we hash the end first, and then as we
   367  	// loop we keep 56 bytes of state: v, w, x, y, and z.
   368  	x := fetch64(s[length-40:])
   369  	y := fetch64(s[length-16:]) + fetch64(s[length-56:])
   370  	z := hashLen16(fetch64(s[length-48:])+uint64(length), fetch64(s[length-24:]))
   371  	v := weakHashLen32WithSeeds(s[length-64:], uint64(length), z)
   372  	w := weakHashLen32WithSeeds(s[length-32:], y+k1, x)
   373  	x = x*k1 + fetch64(s)
   374  
   375  	// Decrease length to the nearest multiple of 64, and operate on 64-byte chunks.
   376  	slen := int(length)
   377  	slen = (slen - 1) & ^63
   378  	for {
   379  		x = rotate64(x+y+v.low+fetch64(s[8:]), 37) * k1
   380  		y = rotate64(y+v.high+fetch64(s[48:]), 42) * k1
   381  		x ^= w.high
   382  		y += v.low + fetch64(s[40:])
   383  		z = rotate64(z+w.low, 33) * k1
   384  		v = weakHashLen32WithSeeds(s, v.high*k1, x+w.low)
   385  		w = weakHashLen32WithSeeds(s[32:], z+w.high, y+fetch64(s[16:]))
   386  		z, x = x, z
   387  		s = s[64:]
   388  		slen -= 64
   389  		if slen == 0 {
   390  			break
   391  		}
   392  	}
   393  
   394  	return hashLen16(hashLen16(v.low, w.low)+shiftMix(y)*k1+z,
   395  		hashLen16(v.high, w.high)+x)
   396  }
   397  
   398  // cityMurmur  A subroutine for CityHash128().  Returns a decent 128-bit hash for strings
   399  // of any length representable in signed long.  Based on City and Murmur.
   400  func cityMurmur(s []byte, len uint, seed *Uint128) *Uint128 {
   401  	a := seed.low
   402  	b := seed.high
   403  	c := uint64(0)
   404  	d := uint64(0)
   405  	if len <= 16 {
   406  		a = shiftMix(a*k1) * k1
   407  		c = b*k1 + hashLen0to16(s, len)
   408  		cv := c
   409  		if len >= 8 {
   410  			cv = fetch64(s)
   411  		}
   412  		d = shiftMix(a + cv)
   413  	} else {
   414  		c = hashLen16(fetch64(s[len-8:])+k1, a)
   415  		d = hashLen16(b+uint64(len), c+fetch64(s[len-16:]))
   416  		a += d
   417  		// len > 16 here, so do...while is safe
   418  		for {
   419  			a ^= shiftMix(fetch64(s)*k1) * k1
   420  			a *= k1
   421  			b ^= a
   422  			c ^= shiftMix(fetch64(s[8:])*k1) * k1
   423  			c *= k1
   424  			d ^= c
   425  			s = s[16:]
   426  			len -= 16
   427  			if len <= 16 {
   428  				break
   429  			}
   430  		}
   431  	}
   432  	a = hashLen16(a, c)
   433  	b = hashLen16(d, b)
   434  	return MakeUint128(a^b, hashLen16(b, a))
   435  }
   436  
   437  func cityHash128WithSeedCore(s []byte, length uint, seed *Uint128) *Uint128 {
   438  	if length < 128 {
   439  		return cityMurmur(s, length, seed)
   440  	}
   441  
   442  	// We expect length >= 128 to be the common case.  Keep 56 bytes of state:
   443  	// v, w, x, y, and z.
   444  	var v Uint128
   445  	var w Uint128
   446  	x := seed.low
   447  	y := seed.high
   448  	z := uint64(length) * k1
   449  	v.low = rotate64(y^k1, 49)*k1 + fetch64(s)
   450  	v.high = rotate64(v.low, 42)*k1 + fetch64(s[8:])
   451  	w.low = rotate64(y+z, 35)*k1 + x
   452  	w.high = rotate64(x+fetch64(s[88:]), 53) * k1
   453  
   454  	// This is the same inner loop as CityHash64(), manually unrolled.
   455  	for {
   456  		x = rotate64(x+y+v.low+fetch64(s[8:]), 37) * k1
   457  		y = rotate64(y+v.high+fetch64(s[48:]), 42) * k1
   458  		x ^= w.high
   459  		y += v.low + fetch64(s[40:])
   460  		z = rotate64(z+w.low, 33) * k1
   461  		v = *weakHashLen32WithSeeds(s, v.high*k1, x+w.low)
   462  		w = *weakHashLen32WithSeeds(s[32:], z+w.high, y+fetch64(s[16:]))
   463  		z, x = x, z
   464  		s = s[64:]
   465  		x = rotate64(x+y+v.low+fetch64(s[8:]), 37) * k1
   466  		y = rotate64(y+v.high+fetch64(s[48:]), 42) * k1
   467  		x ^= w.high
   468  		y += v.low + fetch64(s[40:])
   469  		z = rotate64(z+w.low, 33) * k1
   470  		v = *weakHashLen32WithSeeds(s, v.high*k1, x+w.low)
   471  		w = *weakHashLen32WithSeeds(s[32:], z+w.high, y+fetch64(s[16:]))
   472  		z, x = x, z
   473  		s = s[64:]
   474  		length -= 128
   475  		if length < 128 {
   476  			break
   477  		}
   478  	}
   479  	x += rotate64(v.low+z, 49) * k0
   480  	y = y*k0 + rotate64(w.high, 37)
   481  	z = z*k0 + rotate64(w.low, 27)
   482  	w.low *= 9
   483  	v.low *= k0
   484  	// If 0 < length < 128, hash up to 4 chunks of 32 bytes each from the end of s.
   485  	for tailDone := uint(0); tailDone < length; {
   486  		tailDone += 32
   487  		y = rotate64(x+y, 42)*k0 + v.high
   488  		w.low += fetch64(s[length-tailDone+16:])
   489  		x = x*k0 + w.low
   490  		z += w.high + fetch64(s[length-tailDone:])
   491  		w.high += v.low
   492  		v = *weakHashLen32WithSeeds(s[length-tailDone:], v.low+z, v.high)
   493  		v.low *= k0
   494  	}
   495  	// At this point our 56 bytes of state should contain more than
   496  	// enough information for a strong 128-bit hash.  We use two
   497  	// different 56-byte-to-8-byte hashes to get a 16-byte final result.
   498  	x = hashLen16(x, v.low)
   499  	y = hashLen16(y+z, w.low)
   500  	return MakeUint128(hashLen16(x+v.high, w.high)+y,
   501  		hashLen16(x+w.high, y+v.high))
   502  }
   503  
   504  // CityHash128String 对string算128位的cityhash值
   505  func CityHash128String(str string) *Uint128 {
   506  	s := strutil.DetachBytesString(str)
   507  	length := uint(len(str))
   508  	return CityHash128(s, length)
   509  }
   510  
   511  // CityHash128 产生128位的hash
   512  func CityHash128(s []byte, length uint) *Uint128 {
   513  	if length >= 16 {
   514  		seed := MakeUint128(fetch64(s), fetch64(s[8:])+k0)
   515  
   516  		return cityHash128WithSeedCore(s[16:], length-16, seed)
   517  	}
   518  	return CityHash128WithSeed(s, length, MakeUint128(k0, k1))
   519  }
   520  
   521  // CityHash128WithSeedString  计算指定 str 字符串的 128位hash, 需要指定 seed
   522  func CityHash128WithSeedString(str string, seed *Uint128) *Uint128 {
   523  	s := strutil.DetachBytesString(str)
   524  	length := uint(len(str))
   525  	return cityHash128WithSeedCore(s, length, seed)
   526  }
   527  
   528  // CityHash128WithSeed  计算指定 二进制数组的 128位hash, 需要指定 seed
   529  func CityHash128WithSeed(s []byte, length uint, seed *Uint128) *Uint128 {
   530  	return cityHash128WithSeedCore(s, length, seed)
   531  }
   532  
   533  func CityHash64WithSeedString(str string, seed uint64) uint64 {
   534  	s := strutil.DetachBytesString(str)
   535  	length := uint(len(str))
   536  	return CityHash64WithSeed(s, length, seed)
   537  }
   538  
   539  func CityHash64WithSeed(s []byte, length uint, seed uint64) uint64 {
   540  	return CityHash64WithSeeds(s, length, k2, seed)
   541  }
   542  
   543  func CityHash64WithSeedsString(str string, seed0 uint64, seed1 uint64) uint64 {
   544  	s := strutil.DetachBytesString(str)
   545  	length := uint(len(str))
   546  	return CityHash64WithSeeds(s, length, seed0, seed1)
   547  }
   548  
   549  func CityHash64WithSeeds(s []byte, length uint, seed0 uint64, seed1 uint64) uint64 {
   550  	return hashLen16(CityHash64(s, length)-seed0, seed1)
   551  }