github.com/zuoyebang/bitalosdb@v1.1.1-0.20240516111551-79a8c4d8ce20/internal/bindex/vector_index.go (about)

     1  // Copyright 2021 The Bitalosdb author(hustxrb@163.com) and other contributors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package bindex
    16  
    17  import (
    18  	"arena"
    19  	"encoding/binary"
    20  	"math"
    21  	"unsafe"
    22  )
    23  
    24  type kv64 struct {
    25  	key   uint32
    26  	value uint64
    27  }
    28  
    29  type kv32 struct {
    30  	key   uint32
    31  	value uint32
    32  }
    33  
    34  type metadata [groupSize]int8
    35  type group32 [groupSize]kv32
    36  type group64 [groupSize]kv64
    37  
    38  const (
    39  	VectorVersion        = 2
    40  	h1Mask        uint32 = 0xffff_ff80
    41  	h2Mask        uint32 = 0x0000_007f
    42  	empty         int8   = -128 // 0b1000_0000
    43  )
    44  
    45  type h1 uint32
    46  
    47  type h2 int8
    48  
    49  func splitHash(h uint32) (h1, h2) {
    50  	return h1((h & h1Mask) >> 7), h2(h & h2Mask)
    51  }
    52  
    53  func probeStart(hi h1, groups int) uint32 {
    54  	return uint32(hi) % uint32(groups)
    55  }
    56  
    57  type VectorValType uint8
    58  
    59  const (
    60  	VectorValTypeUint32 VectorValType = 4
    61  	VectorValTypeUint64 VectorValType = 8
    62  )
    63  
    64  type VectorHeader struct {
    65  	version uint16
    66  	vtype   VectorValType
    67  	shards  uint32
    68  }
    69  
    70  type VectorIndex struct {
    71  	header     VectorHeader
    72  	ctrl       []metadata
    73  	groups32   []group32
    74  	groups64   []group64
    75  	resident   uint32
    76  	limit      uint32
    77  	count      uint32
    78  	groupBytes uint32
    79  	saveGroupN uint32
    80  	data       []byte
    81  	arena      *arena.Arena
    82  }
    83  
    84  func NewVectorIndex() (m *VectorIndex) {
    85  	m = &VectorIndex{}
    86  	return
    87  }
    88  
    89  func (m *VectorIndex) InitWriter(sz uint32, vtype VectorValType) {
    90  	groups := numGroups(sz)
    91  	m.header = VectorHeader{
    92  		version: VectorVersion,
    93  		vtype:   vtype,
    94  		shards:  groups,
    95  	}
    96  	m.ctrl = make([]metadata, groups)
    97  	m.limit = groups * maxAvgGroupLoad
    98  	m.groupBytes = groupSize * (5 + uint32(vtype))
    99  	m.arena = arena.NewArena()
   100  
   101  	switch vtype {
   102  	case VectorValTypeUint32:
   103  		m.groups32 = make([]group32, groups)
   104  	case VectorValTypeUint64:
   105  		m.groups64 = make([]group64, groups)
   106  	}
   107  	for i := range m.ctrl {
   108  		m.ctrl[i] = newEmptyMetadata()
   109  	}
   110  }
   111  
   112  func (m *VectorIndex) SetReader(d []byte) bool {
   113  	if d == nil {
   114  		return false
   115  	}
   116  
   117  	m.data = d
   118  	m.header = readHeader(m.data)
   119  	if m.header.vtype != VectorValTypeUint32 && m.header.vtype != VectorValTypeUint64 {
   120  		return false
   121  	}
   122  	m.groupBytes = groupSize * (5 + uint32(m.header.vtype))
   123  	return m.readMetadata()
   124  }
   125  
   126  func (m *VectorIndex) Get(key uint32) (any, bool) {
   127  	if m.header.vtype == VectorValTypeUint32 {
   128  		return m.Get32(key)
   129  	} else {
   130  		return m.Get64(key)
   131  	}
   132  }
   133  
   134  func (m *VectorIndex) innerMemGet32(key uint32) (value uint32, ok bool) {
   135  	hi, lo := splitHash(key)
   136  	g := probeStart(hi, len(m.groups32))
   137  	for {
   138  		matches := metaMatchH2(&m.ctrl[g], lo)
   139  		for matches != 0 {
   140  			s := nextMatch(&matches)
   141  			if key == m.groups32[g][s].key {
   142  				value, ok = m.groups32[g][s].value, true
   143  				return
   144  			}
   145  		}
   146  		matches = metaMatchEmpty(&m.ctrl[g])
   147  		if matches != 0 {
   148  			ok = false
   149  			return
   150  		}
   151  		g += 1
   152  		if g >= uint32(len(m.groups32)) {
   153  			g = 0
   154  		}
   155  	}
   156  }
   157  
   158  func (m *VectorIndex) innerMemGet64(key uint32) (value uint64, ok bool) {
   159  	hi, lo := splitHash(key)
   160  	g := probeStart(hi, len(m.groups64))
   161  	for {
   162  		matches := metaMatchH2(&m.ctrl[g], lo)
   163  		for matches != 0 {
   164  			s := nextMatch(&matches)
   165  			if key == m.groups64[g][s].key {
   166  				value, ok = m.groups64[g][s].value, true
   167  				return
   168  			}
   169  		}
   170  		matches = metaMatchEmpty(&m.ctrl[g])
   171  		if matches != 0 {
   172  			ok = false
   173  			return
   174  		}
   175  		g += 1
   176  		if g >= uint32(len(m.groups64)) {
   177  			g = 0
   178  		}
   179  	}
   180  }
   181  
   182  func (m *VectorIndex) rehash32(n uint32) {
   183  	groups, ctrl := m.groups32, m.ctrl
   184  	m.groups32 = make([]group32, n)
   185  	m.ctrl = make([]metadata, n)
   186  	for i := range m.ctrl {
   187  		m.ctrl[i] = newEmptyMetadata()
   188  	}
   189  	m.limit = n * maxAvgGroupLoad
   190  	m.resident = 0
   191  	for g := range ctrl {
   192  		for s := range ctrl[g] {
   193  			c := ctrl[g][s]
   194  			if c == empty {
   195  				continue
   196  			}
   197  			m.add32rehash(groups[g][s].key, groups[g][s].value)
   198  		}
   199  	}
   200  }
   201  
   202  func (m *VectorIndex) rehash64(n uint32) {
   203  	groups, ctrl := m.groups64, m.ctrl
   204  	m.groups64 = make([]group64, n)
   205  	m.ctrl = make([]metadata, n)
   206  	for i := range m.ctrl {
   207  		m.ctrl[i] = newEmptyMetadata()
   208  	}
   209  	m.limit = n * maxAvgGroupLoad
   210  	m.resident = 0
   211  	for g := range ctrl {
   212  		for s := range ctrl[g] {
   213  			c := ctrl[g][s]
   214  			if c == empty {
   215  				continue
   216  			}
   217  			m.add64rehash(groups[g][s].key, groups[g][s].value)
   218  		}
   219  	}
   220  }
   221  
   222  func (m *VectorIndex) add32rehash(key uint32, value uint32) {
   223  	hi, lo := splitHash(key)
   224  	g := probeStart(hi, len(m.groups32))
   225  	for {
   226  		matches := metaMatchH2(&m.ctrl[g], lo)
   227  		for matches != 0 {
   228  			s := nextMatch(&matches)
   229  			if key == m.groups32[g][s].key {
   230  				return
   231  			}
   232  		}
   233  		matches = metaMatchEmpty(&m.ctrl[g])
   234  		if matches != 0 {
   235  			s := nextMatch(&matches)
   236  			m.groups32[g][s].key = key
   237  			m.groups32[g][s].value = value
   238  			m.ctrl[g][s] = int8(lo)
   239  			m.resident++
   240  			return
   241  		}
   242  		g += 1
   243  		if g >= uint32(len(m.groups32)) {
   244  			g = 0
   245  		}
   246  	}
   247  }
   248  
   249  func (m *VectorIndex) add64rehash(key uint32, value uint64) {
   250  	hi, lo := splitHash(key)
   251  	g := probeStart(hi, len(m.groups64))
   252  	for {
   253  		matches := metaMatchH2(&m.ctrl[g], lo)
   254  		for matches != 0 {
   255  			s := nextMatch(&matches)
   256  			if key == m.groups64[g][s].key {
   257  				return
   258  			}
   259  		}
   260  		matches = metaMatchEmpty(&m.ctrl[g])
   261  		if matches != 0 {
   262  			s := nextMatch(&matches)
   263  			m.groups64[g][s].key = key
   264  			m.groups64[g][s].value = value
   265  			m.ctrl[g][s] = int8(lo)
   266  			m.resident++
   267  			return
   268  		}
   269  		g += 1
   270  		if g >= uint32(len(m.groups64)) {
   271  			g = 0
   272  		}
   273  	}
   274  }
   275  
   276  func (m *VectorIndex) Add32(key uint32, value uint32) {
   277  	if m.header.vtype != VectorValTypeUint32 {
   278  		return
   279  	}
   280  	if m.resident >= m.limit {
   281  		m.rehash32(uint32(math.Ceil(float64(len(m.groups32)) * 1.5)))
   282  	}
   283  	hi, lo := splitHash(key)
   284  	g := probeStart(hi, len(m.groups32))
   285  	for {
   286  		matches := metaMatchH2(&m.ctrl[g], lo)
   287  		for matches != 0 {
   288  			s := nextMatch(&matches)
   289  			if key == m.groups32[g][s].key {
   290  				m.groups32[g][s].value = value
   291  				return
   292  			}
   293  		}
   294  		matches = metaMatchEmpty(&m.ctrl[g])
   295  		if matches != 0 {
   296  			s := nextMatch(&matches)
   297  			m.groups32[g][s].key = key
   298  			m.groups32[g][s].value = value
   299  			m.ctrl[g][s] = int8(lo)
   300  			m.resident++
   301  			return
   302  		}
   303  		g += 1
   304  		if g >= uint32(len(m.groups32)) {
   305  			g = 0
   306  		}
   307  	}
   308  }
   309  
   310  func (m *VectorIndex) Add64(key uint32, value uint64) {
   311  	if m.header.vtype != VectorValTypeUint64 {
   312  		return
   313  	}
   314  	if m.resident >= m.limit {
   315  		m.rehash64(uint32(math.Ceil(float64(len(m.groups64)) * 1.2)))
   316  	}
   317  	hi, lo := splitHash(key)
   318  	g := probeStart(hi, len(m.groups64))
   319  	for {
   320  		matches := metaMatchH2(&m.ctrl[g], lo)
   321  		for matches != 0 {
   322  			s := nextMatch(&matches)
   323  			if key == m.groups64[g][s].key {
   324  				m.groups64[g][s].value = value
   325  				return
   326  			}
   327  		}
   328  		matches = metaMatchEmpty(&m.ctrl[g])
   329  		if matches != 0 {
   330  			s := nextMatch(&matches)
   331  			m.groups64[g][s].key = key
   332  			m.groups64[g][s].value = value
   333  			m.ctrl[g][s] = int8(lo)
   334  			m.resident++
   335  			return
   336  		}
   337  		g += 1
   338  		if g >= uint32(len(m.groups64)) {
   339  			g = 0
   340  		}
   341  	}
   342  }
   343  
   344  func (m *VectorIndex) Length() uint32 {
   345  	return m.resident
   346  }
   347  
   348  func (m *VectorIndex) Size() uint32 {
   349  	n := m.saveGroups()
   350  	// header+count+ctrl+groups(k+v)
   351  	return 12 + n*m.groupBytes
   352  }
   353  
   354  //go:inline
   355  func (m *VectorIndex) calGroupHead(g uint32) uint32 {
   356  	return 12 + g*m.groupBytes
   357  }
   358  
   359  //go:inline
   360  func (m *VectorIndex) calGroups(size uint32) uint32 {
   361  	// header+count+ctrl+groups(k+v)
   362  	return (size - 12) / m.groupBytes
   363  }
   364  
   365  func (m *VectorIndex) GetData() []byte {
   366  	return m.data
   367  }
   368  
   369  func (m *VectorIndex) Capacity() uint32 {
   370  	return m.limit - m.resident
   371  }
   372  
   373  //go:inline
   374  func (m *VectorIndex) saveGroups() uint32 {
   375  	n := uint32(math.Ceil(float64(m.resident) / float64(maxAvgGroupLoad)))
   376  	cn := uint32(len(m.groups32))
   377  	sub := cn - n
   378  	if sub > 100 || float32(sub)/float32(cn) > 0.25 {
   379  		return n
   380  	}
   381  	return cn
   382  }
   383  
   384  func (m *VectorIndex) Serialize() bool {
   385  	switch m.header.vtype {
   386  	case VectorValTypeUint32:
   387  		return m.Serialize32()
   388  	case VectorValTypeUint64:
   389  		return m.Serialize64()
   390  	default:
   391  		return false
   392  	}
   393  }
   394  
   395  func (m *VectorIndex) Serialize32() bool {
   396  	if m.resident <= 0 {
   397  		return false
   398  	}
   399  	m.saveGroupN = m.saveGroups()
   400  	if m.saveGroupN != uint32(len(m.ctrl)) {
   401  		m.rehash32(m.saveGroupN)
   402  	}
   403  
   404  	size := int(m.Size())
   405  
   406  	if m.data == nil {
   407  		m.data = arena.MakeSlice[byte](m.arena, size, size)
   408  	}
   409  	writeHeader(m.data, m.header)
   410  	writeCount(m.data[8:], m.resident)
   411  	tail := 12
   412  	for g := range m.ctrl {
   413  		copy(m.data[tail:], (*[groupSize]byte)(unsafe.Pointer(&m.ctrl[g]))[:])
   414  		tail += groupSize
   415  		for s := range m.groups32[g] {
   416  			if m.ctrl[g][s] != empty {
   417  				writeKV32(m.data[tail:], m.groups32[g][s])
   418  			}
   419  			tail += 8
   420  		}
   421  	}
   422  	return true
   423  }
   424  
   425  func (m *VectorIndex) Serialize64() bool {
   426  	if m.resident <= 0 {
   427  		return false
   428  	}
   429  	m.saveGroupN = m.saveGroups()
   430  	if m.saveGroupN != uint32(len(m.ctrl)) {
   431  		m.rehash64(m.saveGroupN)
   432  	}
   433  
   434  	size := int(m.Size())
   435  
   436  	if m.data == nil {
   437  		m.data = arena.MakeSlice[byte](m.arena, size, size)
   438  	}
   439  	writeHeader(m.data, m.header)
   440  	writeCount(m.data[8:], m.resident)
   441  	tail := 12
   442  	for g := range m.ctrl {
   443  		copy(m.data[tail:], (*[groupSize]byte)(unsafe.Pointer(&m.ctrl[g]))[:])
   444  		tail += groupSize
   445  		for s := range m.groups64[g] {
   446  			if m.ctrl[g][s] != empty {
   447  				writeKV64(m.data[tail:], m.groups64[g][s])
   448  			}
   449  			tail += 12
   450  		}
   451  	}
   452  	return true
   453  }
   454  
   455  func (m *VectorIndex) SetWriter(d []byte) bool {
   456  	if d == nil || len(d) < int(m.Size()) {
   457  		return false
   458  	}
   459  
   460  	m.data = d
   461  
   462  	return true
   463  }
   464  
   465  func (m *VectorIndex) readMetadata() bool {
   466  	m.count = readUint32(m.data[8:])
   467  	if m.count == 0 {
   468  		return false
   469  	}
   470  	m.resident = m.count
   471  	m.limit = m.count
   472  	gs := m.calGroups(uint32(len(m.data)))
   473  	m.ctrl = make([]metadata, gs)
   474  	for i, _ := range m.ctrl {
   475  		m.ctrl[i] = *(*metadata)(unsafe.Pointer(&m.data[12+uint32(i)*m.groupBytes]))
   476  	}
   477  	return true
   478  }
   479  
   480  func (m *VectorIndex) Get32(key uint32) (value uint32, ok bool) {
   481  	hi, lo := splitHash(key)
   482  	g := probeStart(hi, len(m.ctrl))
   483  	for {
   484  		matches := metaMatchH2(&m.ctrl[g], lo)
   485  		for matches != 0 {
   486  			s := nextMatch(&matches)
   487  			kIdx := m.calGroupHead(g) + groupSize + s*(4+uint32(m.header.vtype))
   488  			k := readUint32(m.data[kIdx:])
   489  			if key == k {
   490  				value, ok = readKV32Value(m.data[kIdx:]), true
   491  				return
   492  			}
   493  		}
   494  		matches = metaMatchEmpty(&m.ctrl[g])
   495  		if matches != 0 {
   496  			ok = false
   497  			return
   498  		}
   499  		g += 1
   500  		if g >= uint32(len(m.ctrl)) {
   501  			g = 0
   502  		}
   503  	}
   504  }
   505  
   506  func (m *VectorIndex) Get64(key uint32) (value uint64, ok bool) {
   507  	hi, lo := splitHash(key)
   508  	g := probeStart(hi, len(m.ctrl))
   509  	for {
   510  		matches := metaMatchH2(&m.ctrl[g], lo)
   511  		for matches != 0 {
   512  			s := nextMatch(&matches)
   513  			kIdx := m.calGroupHead(g) + groupSize + s*(4+uint32(m.header.vtype))
   514  			k := readUint32(m.data[kIdx:])
   515  			if key == k {
   516  				value, ok = readKV64Value(m.data[kIdx:]), true
   517  				return
   518  			}
   519  		}
   520  		matches = metaMatchEmpty(&m.ctrl[g])
   521  		if matches != 0 {
   522  			ok = false
   523  			return
   524  		}
   525  		g += 1
   526  		if g >= uint32(len(m.ctrl)) {
   527  			g = 0
   528  		}
   529  	}
   530  }
   531  
   532  func (m *VectorIndex) Finish() {
   533  	m.groups64 = nil
   534  	m.groups32 = nil
   535  	if m.arena != nil {
   536  		m.arena.Free()
   537  		m.arena = nil
   538  	}
   539  }
   540  
   541  func numGroups(n uint32) (groups uint32) {
   542  	groups = (n + maxAvgGroupLoad - 1) / maxAvgGroupLoad
   543  	if groups == 0 {
   544  		groups = 1
   545  	}
   546  	return
   547  }
   548  
   549  func newEmptyMetadata() (meta metadata) {
   550  	for i := range meta {
   551  		meta[i] = empty
   552  	}
   553  	return
   554  }
   555  
   556  func writeHeader(buf []byte, header VectorHeader) {
   557  	binary.BigEndian.PutUint16(buf[0:], header.version)
   558  	binary.BigEndian.PutUint16(buf[2:], uint16(header.vtype))
   559  	binary.BigEndian.PutUint32(buf[4:], header.shards)
   560  }
   561  
   562  func writeCount(buf []byte, count uint32) {
   563  	binary.BigEndian.PutUint32(buf[0:], count)
   564  }
   565  
   566  func writeKV32(buf []byte, item32 kv32) {
   567  	binary.BigEndian.PutUint32(buf[0:], item32.key)
   568  	binary.BigEndian.PutUint32(buf[4:], item32.value)
   569  }
   570  
   571  func writeKV64(buf []byte, item64 kv64) {
   572  	binary.BigEndian.PutUint32(buf[0:], item64.key)
   573  	binary.BigEndian.PutUint64(buf[4:], item64.value)
   574  }
   575  
   576  func readHeader(buf []byte) VectorHeader {
   577  	header := VectorHeader{
   578  		version: binary.BigEndian.Uint16(buf[0:]),
   579  		vtype:   VectorValType(binary.BigEndian.Uint16(buf[2:])),
   580  		shards:  binary.BigEndian.Uint32(buf[4:]),
   581  	}
   582  
   583  	return header
   584  }
   585  
   586  func readUint32(buf []byte) uint32 {
   587  	return binary.BigEndian.Uint32(buf[0:])
   588  }
   589  
   590  func readKV32Value(buf []byte) uint32 {
   591  	return binary.BigEndian.Uint32(buf[4:])
   592  }
   593  
   594  func readKV64Value(buf []byte) uint64 {
   595  	return binary.BigEndian.Uint64(buf[4:])
   596  }