github.com/zuoyebang/bitalosdb@v1.1.1-0.20240516111551-79a8c4d8ce20/internal/bindex/hash_index.go (about)

     1  // Copyright 2021 The Bitalosdb author(hustxrb@163.com) and other contributors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package bindex
    16  
    17  import (
    18  	"arena"
    19  	"encoding/binary"
    20  	"sort"
    21  )
    22  
    23  const (
    24  	MaxLowBit             = 0xFFFF
    25  	HashIndexShardItemAvg = 1 << 8
    26  	HashIndexShardsNum    = 64 << 10
    27  	HashIndexShardSize    = 4
    28  	HashIndexItem32Size   = 6
    29  	HashIndexItem64Size   = 10
    30  )
    31  
// HashIndex is a sharded index from 32-bit keys to 32-bit or 64-bit values.
//
// The high 16 bits of a key select a shard and the low 16 bits are stored next
// to the value. Items are collected per shard through Add, deduplicated by
// Unique and encoded into data by Serialize; Get32/Get64 look keys up directly
// in the encoded bytes.
type HashIndex struct {
	header     Header // version/reserved/shards record written at the start of data
	size       uint32 // encoded size in bytes: starts at itemOffset and grows per added item
	length     uint32 // number of items counted so far
	itemOffset uint32 // byte offset of the first item (header plus shard table)
	type32     bool   // true: values are uint32 (data32); false: uint64 (data64)
	uniq       bool   // set once Unique has run
	data       []byte // encoded index, attached by SetReader/SetWriter or allocated when serializing
	data32     []FItem32Array // per-shard items while building a 32-bit index
	data64     []FItem64Array // per-shard items while building a 64-bit index
	arena      *arena.Arena // created by InitWriter, released by Finish
}
    44  
    45  type FItem32 struct {
    46  	key   uint16
    47  	value uint32
    48  }
    49  
    50  type FItem64 struct {
    51  	key   uint16
    52  	value uint64
    53  }
    54  
    55  type FItem32Array []FItem32
    56  
    57  func (i32 FItem32Array) Len() int {
    58  	return len(i32)
    59  }
    60  
    61  func (i32 FItem32Array) Swap(i, j int) {
    62  	i32[i], i32[j] = i32[j], i32[i]
    63  }
    64  
    65  func (i32 FItem32Array) Less(i, j int) bool {
    66  	return i32[i].key < i32[j].key
    67  }
    68  
    69  type FItem64Array []FItem64
    70  
    71  func (i64 FItem64Array) Len() int {
    72  	return len(i64)
    73  }
    74  
    75  func (i64 FItem64Array) Swap(i, j int) {
    76  	i64[i], i64[j] = i64[j], i64[i]
    77  }
    78  
    79  func (i64 FItem64Array) Less(i, j int) bool {
    80  	return i64[i].key < i64[j].key
    81  }
    82  
    83  func NewHashIndex(type32 bool) *HashIndex {
    84  	offset := uint32(SuccinctHeaderSize + HashIndexShardsNum*HashIndexShardSize)
    85  
    86  	m := &HashIndex{
    87  		header:     Header{version: SuccinctVersion, reserved: 0, shards: HashIndexShardsNum},
    88  		size:       offset,
    89  		length:     0,
    90  		itemOffset: offset,
    91  		type32:     type32,
    92  		uniq:       false,
    93  		data:       nil,
    94  		data32:     nil,
    95  		data64:     nil,
    96  		arena:      nil,
    97  	}
    98  
    99  	return m
   100  }
   101  
   102  func (s *HashIndex) Size() uint32 {
   103  	if !s.uniq {
   104  		s.Unique()
   105  	}
   106  	return s.size
   107  }
   108  
// Length returns the number of items currently counted in the index. The
// count grows with every accepted Add and shrinks when Unique drops entries.
func (s *HashIndex) Length() uint32 {
	return s.length
}
   112  
// GetData returns the byte slice currently attached to the index: the one
// passed to SetReader or SetWriter, or the one allocated while serializing.
// It is nil when none has been attached. The slice is returned without
// copying.
func (s *HashIndex) GetData() []byte {
	return s.data
}
   116  
   117  func (s *HashIndex) SetReader(d []byte) bool {
   118  	if d == nil || len(d) <= int(s.itemOffset) {
   119  		return false
   120  	}
   121  
   122  	s.data = d
   123  	s.header = s.readHeader(s.data)
   124  
   125  	return true
   126  }
   127  
   128  func (s *HashIndex) InitWriter() {
   129  	s.arena = arena.NewArena()
   130  
   131  	if s.type32 {
   132  		s.data32 = arena.MakeSlice[FItem32Array](s.arena, int(s.header.shards), int(s.header.shards))
   133  	} else {
   134  		s.data64 = arena.MakeSlice[FItem64Array](s.arena, int(s.header.shards), int(s.header.shards))
   135  	}
   136  }
   137  
   138  func (s *HashIndex) SetWriter(d []byte) bool {
   139  	if d == nil || len(d) < int(s.size) || cap(d) < int(s.size) {
   140  		return false
   141  	}
   142  
   143  	s.data = d
   144  
   145  	return true
   146  }
   147  
   148  func (s *HashIndex) Add(key uint32, value any) {
   149  	switch value.(type) {
   150  	case uint32:
   151  		if s.type32 {
   152  			s.add32Internal(key, value.(uint32))
   153  		}
   154  		return
   155  	case uint64:
   156  		if !s.type32 {
   157  			s.add64Internal(key, value.(uint64))
   158  		}
   159  		return
   160  	default:
   161  		return
   162  	}
   163  }
   164  
   165  func (s *HashIndex) Unique() {
   166  	if s.uniq {
   167  		return
   168  	}
   169  
   170  	if s.type32 {
   171  		s.unique32Internal()
   172  	} else {
   173  		s.unique64Internal()
   174  	}
   175  
   176  	s.uniq = true
   177  }
   178  
   179  func (s *HashIndex) Serialize() bool {
   180  	if !s.uniq {
   181  		s.Unique()
   182  	}
   183  
   184  	if s.type32 {
   185  		return s.serialize32Internal()
   186  	} else {
   187  		return s.serialize64Internal()
   188  	}
   189  }
   190  
   191  func (s *HashIndex) Get(key uint32) (any, bool) {
   192  	if s.type32 {
   193  		return s.Get32(key)
   194  	} else {
   195  		return s.Get64(key)
   196  	}
   197  }
   198  
   199  func (s *HashIndex) add32Internal(key uint32, value uint32) {
   200  	if s.header.shards <= 0 {
   201  		return
   202  	}
   203  
   204  	hid := s.highbits(key)
   205  	lid := s.lowbits(key)
   206  
   207  	if len(s.data32[hid]) == 0 {
   208  		s.data32[hid] = arena.MakeSlice[FItem32](s.arena, 0, HashIndexShardItemAvg)
   209  	}
   210  
   211  	s.data32[hid] = append(s.data32[hid], FItem32{key: lid, value: value})
   212  
   213  	s.size += HashIndexItem32Size
   214  	s.length++
   215  }
   216  
   217  func (s *HashIndex) add64Internal(key uint32, value uint64) {
   218  	if s.header.shards <= 0 {
   219  		return
   220  	}
   221  
   222  	hid := s.highbits(key)
   223  	lid := s.lowbits(key)
   224  
   225  	if len(s.data64[hid]) == 0 {
   226  		s.data64[hid] = arena.MakeSlice[FItem64](s.arena, 0, HashIndexShardItemAvg)
   227  	}
   228  
   229  	s.data64[hid] = append(s.data64[hid], FItem64{key: lid, value: value})
   230  
   231  	s.size += HashIndexItem64Size
   232  	s.length++
   233  }
   234  
   235  func (s *HashIndex) unique32Internal() {
   236  	if s.size <= s.itemOffset || s.length <= 0 || len(s.data32) <= 0 {
   237  		return
   238  	}
   239  
   240  	for i := uint32(0); i < s.header.shards; i++ {
   241  		itemsLen := uint32(len(s.data32[i]))
   242  		if itemsLen > 1 {
   243  			sort.Sort(s.data32[i])
   244  
   245  			uniqFlag := false
   246  			prevItem := int32(-1)
   247  			for j := uint32(0); j < itemsLen; j++ {
   248  				if prevItem == int32(s.data32[i][j].key) {
   249  					itemsLen--
   250  					s.length--
   251  					s.size -= HashIndexItem32Size
   252  					copy(s.data32[i][j:], s.data32[i][j+1:])
   253  					uniqFlag = true
   254  					continue
   255  				}
   256  
   257  				prevItem = int32(s.data32[i][j].key)
   258  			}
   259  
   260  			if uniqFlag {
   261  				s.data32[i] = s.data32[i][0:itemsLen]
   262  			}
   263  		}
   264  	}
   265  }
   266  
   267  func (s *HashIndex) unique64Internal() {
   268  	if s.size <= s.itemOffset || s.length <= 0 || len(s.data64) <= 0 {
   269  		return
   270  	}
   271  
   272  	for i := uint32(0); i < s.header.shards; i++ {
   273  		itemsLen := uint32(len(s.data64[i]))
   274  		if itemsLen > 1 {
   275  			sort.Sort(s.data64[i])
   276  
   277  			uniqFlag := false
   278  			prevItem := int32(-1)
   279  			for j := uint32(0); j < itemsLen; j++ {
   280  				if prevItem == int32(s.data64[i][j].key) {
   281  					itemsLen--
   282  					s.length--
   283  					s.size -= HashIndexItem64Size
   284  					copy(s.data64[i][j:], s.data64[i][j+1:])
   285  					uniqFlag = true
   286  					continue
   287  				}
   288  
   289  				prevItem = int32(s.data64[i][j].key)
   290  			}
   291  
   292  			if uniqFlag {
   293  				s.data64[i] = s.data64[i][0:itemsLen]
   294  			}
   295  		}
   296  	}
   297  }
   298  
   299  func (s *HashIndex) serialize32Internal() bool {
   300  	if s.size <= s.itemOffset || s.length <= 0 || len(s.data32) <= 0 {
   301  		return false
   302  	}
   303  
   304  	shardOffset := uint32(0)
   305  	itemOffset := s.itemOffset
   306  
   307  	if s.data == nil {
   308  		s.data = arena.MakeSlice[byte](s.arena, int(s.size), int(s.size))
   309  	}
   310  
   311  	s.writeHeader(s.data[shardOffset:], s.header)
   312  	shardOffset += SuccinctHeaderSize
   313  
   314  	totalCount := uint32(0)
   315  	for i := uint32(0); i < s.header.shards; i++ {
   316  		itemsLen := uint32(len(s.data32[i]))
   317  		totalCount += itemsLen
   318  		s.writeShard(s.data[shardOffset:], totalCount)
   319  		shardOffset += HashIndexShardSize
   320  
   321  		if itemsLen > 0 {
   322  			for j := uint32(0); j < itemsLen; j++ {
   323  				s.writeItem32(s.data[itemOffset:], s.data32[i][j])
   324  				itemOffset += HashIndexItem32Size
   325  			}
   326  		}
   327  	}
   328  
   329  	return true
   330  }
   331  
   332  func (s *HashIndex) serialize64Internal() bool {
   333  	if s.size <= s.itemOffset || s.length <= 0 || len(s.data64) <= 0 {
   334  		return false
   335  	}
   336  
   337  	shardOffset := uint32(0)
   338  	itemOffset := s.itemOffset
   339  
   340  	if s.data == nil {
   341  		s.data = arena.MakeSlice[byte](s.arena, int(s.size), int(s.size))
   342  	}
   343  
   344  	s.writeHeader(s.data[shardOffset:], s.header)
   345  	shardOffset += SuccinctHeaderSize
   346  
   347  	totalCount := uint32(0)
   348  	for i := uint32(0); i < s.header.shards; i++ {
   349  		itemsLen := uint32(len(s.data64[i]))
   350  		totalCount += itemsLen
   351  		s.writeShard(s.data[shardOffset:], totalCount)
   352  		shardOffset += HashIndexShardSize
   353  
   354  		if itemsLen > 0 {
   355  			for j := uint32(0); j < itemsLen; j++ {
   356  				s.writeItem64(s.data[itemOffset:], s.data64[i][j])
   357  				itemOffset += HashIndexItem64Size
   358  			}
   359  		}
   360  	}
   361  
   362  	return true
   363  }
   364  
   365  func (s *HashIndex) Get32(key uint32) (uint32, bool) {
   366  	if len(s.data) <= int(s.itemOffset) || s.header.shards <= 0 {
   367  		return 0, false
   368  	}
   369  
   370  	hid := s.highbits(key)
   371  	lid := s.lowbits(key)
   372  
   373  	originCount := uint32(0)
   374  	if hid > 0 {
   375  		originOffset := uint32(SuccinctHeaderSize) + uint32(hid-1)*HashIndexShardSize
   376  		originCount = s.readShard(s.data[originOffset:])
   377  	}
   378  
   379  	destOffset := uint32(SuccinctHeaderSize) + uint32(hid)*HashIndexShardSize
   380  	destCount := s.readShard(s.data[destOffset:])
   381  	if destCount <= originCount {
   382  		return 0, false
   383  	}
   384  
   385  	itemLength := destCount - originCount
   386  	curOffset := s.itemOffset + originCount*HashIndexItem32Size
   387  
   388  	ok, idx := s.findItem(lid, s.data[curOffset:], int(itemLength), HashIndexItem32Size)
   389  	if !ok {
   390  		return 0, false
   391  	}
   392  
   393  	curOffset += uint32(idx * HashIndexItem32Size)
   394  	value := s.readItem32Value(s.data[curOffset:])
   395  
   396  	return value, true
   397  }
   398  
   399  func (s *HashIndex) Get64(key uint32) (uint64, bool) {
   400  	if len(s.data) <= int(s.itemOffset) || s.header.shards <= 0 {
   401  		return 0, false
   402  	}
   403  
   404  	hid := s.highbits(key)
   405  	lid := s.lowbits(key)
   406  
   407  	originCount := uint32(0)
   408  	if hid > 0 {
   409  		originOffset := uint32(SuccinctHeaderSize) + uint32(hid-1)*HashIndexShardSize
   410  		originCount = s.readShard(s.data[originOffset:])
   411  	}
   412  
   413  	destOffset := uint32(SuccinctHeaderSize) + uint32(hid)*HashIndexShardSize
   414  	destCount := s.readShard(s.data[destOffset:])
   415  	if destCount <= originCount {
   416  		return 0, false
   417  	}
   418  
   419  	itemLength := destCount - originCount
   420  	curOffset := s.itemOffset + originCount*HashIndexItem64Size
   421  
   422  	ok, idx := s.findItem(lid, s.data[curOffset:], int(itemLength), HashIndexItem64Size)
   423  	if !ok {
   424  		return 0, false
   425  	}
   426  
   427  	curOffset += uint32(idx * HashIndexItem64Size)
   428  	value := s.readItem64Value(s.data[curOffset:])
   429  
   430  	return value, true
   431  }
   432  
   433  func (s *HashIndex) Finish() {
   434  	s.size = SuccinctHeaderSize
   435  	s.length = 0
   436  	s.uniq = false
   437  	s.data32 = nil
   438  	s.data64 = nil
   439  	if s.arena != nil {
   440  		s.arena.Free()
   441  		s.arena = nil
   442  	}
   443  }
   444  
   445  func (s *HashIndex) writeHeader(buf []byte, header Header) {
   446  	binary.BigEndian.PutUint16(buf[0:], header.version)
   447  	binary.BigEndian.PutUint16(buf[2:], header.reserved)
   448  	binary.BigEndian.PutUint32(buf[4:], header.shards)
   449  }
   450  
   451  func (s *HashIndex) writeShard(buf []byte, count uint32) {
   452  	binary.BigEndian.PutUint32(buf[0:], count)
   453  }
   454  
   455  func (s *HashIndex) writeItem32(buf []byte, item32 FItem32) {
   456  	binary.BigEndian.PutUint16(buf[0:], item32.key)
   457  	binary.BigEndian.PutUint32(buf[2:], item32.value)
   458  }
   459  
   460  func (s *HashIndex) writeItem64(buf []byte, item64 FItem64) {
   461  	binary.BigEndian.PutUint16(buf[0:], item64.key)
   462  	binary.BigEndian.PutUint64(buf[2:], item64.value)
   463  }
   464  
   465  func (s *HashIndex) readHeader(buf []byte) Header {
   466  	header := Header{
   467  		version:  binary.BigEndian.Uint16(buf[0:]),
   468  		reserved: binary.BigEndian.Uint16(buf[2:]),
   469  		shards:   binary.BigEndian.Uint32(buf[4:]),
   470  	}
   471  
   472  	return header
   473  }
   474  
   475  func (s *HashIndex) readShard(buf []byte) uint32 {
   476  	return binary.BigEndian.Uint32(buf[0:])
   477  }
   478  
   479  func (s *HashIndex) readItem32Value(buf []byte) uint32 {
   480  	return binary.BigEndian.Uint32(buf[2:])
   481  }
   482  
   483  func (s *HashIndex) readItem64Value(buf []byte) uint64 {
   484  	return binary.BigEndian.Uint64(buf[2:])
   485  }
   486  
   487  func (s *HashIndex) findItem(key uint16, buf []byte, n int, step int) (bool, int) {
   488  	i, j := 0, n
   489  	for i < j {
   490  		h := int(uint(i+j) >> 1)
   491  		if binary.BigEndian.Uint16(buf[step*h:]) < key {
   492  			i = h + 1
   493  		} else {
   494  			j = h
   495  		}
   496  	}
   497  
   498  	if i < n && binary.BigEndian.Uint16(buf[step*i:]) == key {
   499  		return true, i
   500  	}
   501  
   502  	return false, 0
   503  }
   504  
   505  func (s *HashIndex) highbits(x uint32) uint16 {
   506  	return uint16(x >> 16)
   507  }
   508  
   509  func (s *HashIndex) lowbits(x uint32) uint16 {
   510  	return uint16(x & MaxLowBit)
   511  }