github.com/balzaczyy/golucene@v0.0.0-20151210033525-d0be9ee89713/core/util/bytesRefHash.go

github.com/balzaczyy/golucene@v0.0.0-20151210033525-d0be9ee89713/core/util/bytesRefHash.go (about)

     1  package util
     2  
     3  import (
     4  	"fmt"
     5  )
     6  
/*
BytesRefHash is a special purpose hash-map-like data structure
optimized for byte slices. It maintains a mapping from byte sequences
to integer ids (conceptually map[[]byte]int), storing the hashed bytes
efficiently in continuous storage (a ByteBlockPool). The id mapping is
encapsulated inside BytesRefHash: ids are handed out sequentially,
starting at 0 and increasing by one for each newly added byte
sequence.

Note: a byte sequence passed to Add() must not be longer than
BYTE_BLOCK_SIZE-2 (each entry is stored with a 1 or 2 byte length
prefix and must fit in a single block). The original documentation
also states that the internal storage is limited to 2GB total byte
storage.
*/
type BytesRefHash struct {
	pool       *ByteBlockPool // block pool that holds the length-prefixed bytes of every entry
	bytesStart []int          // bytesStart[id] is the offset into pool at which entry id begins

	scratch1        *BytesRef       // reusable reference used by equals() to view a stored entry
	hashSize        int             // number of slots in ids
	hashHalfSize    int             // hashSize/2; Add() grows the table when count reaches it
	hashMask        int             // hashSize-1; masks a hash code down to a slot index
	count           int             // number of entries currently stored (also the next id)
	lastCount       int             // count recorded by the last compact()/Clear(); -1 initially
	ids             []int           // open-addressing table of entry ids; -1 marks an empty slot
	bytesStartArray BytesStartArray // allocator that creates, grows and clears bytesStart
	bytesUsed       Counter         // memory accounting for the ids table
}
    33  
    34  func NewBytesRefHash(pool *ByteBlockPool, capacity int,
    35  	bytesStartArray BytesStartArray) *BytesRefHash {
    36  	ids := make([]int, capacity)
    37  	for i, _ := range ids {
    38  		ids[i] = -1
    39  	}
    40  	counter := bytesStartArray.BytesUsed()
    41  	if counter == nil {
    42  		counter = NewCounter()
    43  	}
    44  	counter.AddAndGet(int64(capacity) * NUM_BYTES_INT)
    45  	return &BytesRefHash{
    46  		scratch1:        NewEmptyBytesRef(),
    47  		hashSize:        capacity,
    48  		hashHalfSize:    capacity >> 1,
    49  		hashMask:        capacity - 1,
    50  		lastCount:       -1,
    51  		pool:            pool,
    52  		ids:             ids,
    53  		bytesStartArray: bytesStartArray,
    54  		bytesStart:      bytesStartArray.Init(),
    55  		bytesUsed:       counter,
    56  	}
    57  }
    58  
/*
Size returns the number of entries currently stored in this hash. Ids
handed out by Add() range from 0 to Size()-1.
*/
func (h *BytesRefHash) Size() int {
	return h.count
}
    63  
    64  /*
    65  Returns the ids array in arbitrary order. Valid ids start at offset
    66  of 0 and end at a limit of size() - 1
    67  
    68  Note: This is a destructive operation. clear() must be called in
    69  order to reuse this BytesRefHash instance.
    70  */
    71  func (h *BytesRefHash) compact() []int {
    72  	assert2(h.bytesStart != nil, "bytesStart is nil - not initialized")
    73  	upto := 0
    74  	for i := 0; i < h.hashSize; i++ {
    75  		if h.ids[i] != -1 {
    76  			if upto < i {
    77  				h.ids[upto] = h.ids[i]
    78  				h.ids[i] = -1
    79  			}
    80  			upto++
    81  		}
    82  	}
    83  
    84  	assert(upto == h.count)
    85  	h.lastCount = h.count
    86  	return h.ids
    87  }
    88  
/*
bytesRefIntroSorter sorts a slice of entry ids by the bytes each id
refers to in the owning BytesRefHash. It embeds an IntroSorter and
passes itself to NewIntroSorter for both of its arguments, supplying
the Len/Swap/Less and SetPivot/PivotLess callbacks.
*/
type bytesRefIntroSorter struct {
	*IntroSorter
	owner    *BytesRefHash              // hash whose pool and bytesStart resolve ids to bytes
	compact  []int                      // ids being sorted (the compacted slot table)
	comp     func([]byte, []byte) bool  // caller-supplied ordering predicate
	pivot    *BytesRef                  // view of the current pivot entry, set by SetPivot
	scratch1 *BytesRef                  // reusable view for the first operand of a comparison
	scratch2 *BytesRef                  // reusable view for the second operand of a comparison
}
    98  
    99  func newBytesRefIntroSorter(owner *BytesRefHash, v []int,
   100  	comp func([]byte, []byte) bool) *bytesRefIntroSorter {
   101  	ans := &bytesRefIntroSorter{
   102  		owner:    owner,
   103  		compact:  v,
   104  		comp:     comp,
   105  		pivot:    NewEmptyBytesRef(),
   106  		scratch1: NewEmptyBytesRef(),
   107  		scratch2: NewEmptyBytesRef(),
   108  	}
   109  	ans.IntroSorter = NewIntroSorter(ans, ans)
   110  	return ans
   111  }
   112  
   113  func (a *bytesRefIntroSorter) Len() int      { return len(a.compact) }
   114  func (a *bytesRefIntroSorter) Swap(i, j int) { a.compact[i], a.compact[j] = a.compact[j], a.compact[i] }
   115  func (a *bytesRefIntroSorter) Less(i, j int) bool {
   116  	id1, id2 := a.compact[i], a.compact[j]
   117  	assert(len(a.owner.bytesStart) > id1 && len(a.owner.bytesStart) > id2)
   118  	a.owner.pool.SetBytesRef(a.scratch1, a.owner.bytesStart[id1])
   119  	a.owner.pool.SetBytesRef(a.scratch2, a.owner.bytesStart[id2])
   120  	return a.comp(a.scratch1.ToBytes(), a.scratch2.ToBytes())
   121  }
   122  
   123  func (a *bytesRefIntroSorter) SetPivot(i int) {
   124  	id := a.compact[i]
   125  	assert(len(a.owner.bytesStart) > id)
   126  	a.owner.pool.SetBytesRef(a.pivot, a.owner.bytesStart[id])
   127  }
   128  
   129  func (a *bytesRefIntroSorter) PivotLess(j int) bool {
   130  	id := a.compact[j]
   131  	assert(len(a.owner.bytesStart) > id)
   132  	a.owner.pool.SetBytesRef(a.scratch2, a.owner.bytesStart[id])
   133  	return a.comp(a.pivot.ToBytes(), a.scratch2.ToBytes())
   134  }
   135  
   136  /*
   137  Returns the values array sorted by the referenced byte values.
   138  
   139  Note: this is a destructive operation. clear() must be called in
   140  order to reuse this BytesRefHash instance.
   141  */
   142  func (h *BytesRefHash) Sort(comp func(a, b []byte) bool) []int {
   143  	compact := h.compact()
   144  	s := newBytesRefIntroSorter(h, compact, comp)
   145  	s.Sort(0, h.count)
   146  	// TODO remove this
   147  	// for i, _ := range compact {
   148  	// 	if compact[i+1] == -1 {
   149  	// 		break
   150  	// 	}
   151  	// 	assert(!s.Less(i+1, i))
   152  	// 	if ok := !s.Less(i+1, i); !ok {
   153  	// 		fmt.Println("DEBUG1", compact)
   154  	// 		assert(ok)
   155  	// 	}
   156  	// }
   157  	return compact
   158  }
   159  
   160  func (h *BytesRefHash) equals(id int, b []byte) bool {
   161  	h.pool.SetBytesRef(h.scratch1, h.bytesStart[id])
   162  	return h.scratch1.bytesEquals(b)
   163  }
   164  
   165  func (h *BytesRefHash) shrink(targetSize int) bool {
   166  	// Cannot use util.Shrink because we require power of 2:
   167  	newSize := h.hashSize
   168  	for newSize >= 8 && newSize/4 > targetSize {
   169  		newSize /= 2
   170  	}
   171  	if newSize != h.hashSize {
   172  		h.bytesUsed.AddAndGet(NUM_BYTES_INT * -int64(h.hashSize-newSize))
   173  		h.hashSize = newSize
   174  		h.ids = make([]int, h.hashSize)
   175  		for i, _ := range h.ids {
   176  			h.ids[i] = -1
   177  		}
   178  		h.hashHalfSize = newSize / 2
   179  		h.hashMask = newSize - 1
   180  		return true
   181  	}
   182  	return false
   183  }
   184  
   185  /* Clears the BytesRef which maps to the given BytesRef */
   186  func (h *BytesRefHash) Clear(resetPool bool) {
   187  	h.lastCount = h.count
   188  	h.count = 0
   189  	if resetPool {
   190  		h.pool.Reset(false, false) // we don't need to 0-fill the bufferes
   191  	}
   192  	h.bytesStart = h.bytesStartArray.Clear()
   193  	if h.lastCount != -1 && h.shrink(h.lastCount) {
   194  		// shurnk clears the hash entries
   195  		return
   196  	}
   197  	for i, _ := range h.ids {
   198  		h.ids[i] = -1
   199  	}
   200  }
   201  
// MaxBytesLengthExceededError is the error returned by
// BytesRefHash.Add when the given bytes (plus the 2 bytes reserved for
// the length prefix) cannot fit into a single byte block. Its value is
// the human-readable message.
type MaxBytesLengthExceededError string

// Error returns the message carried by the error.
func (e MaxBytesLengthExceededError) Error() string {
	return string(e)
}
   207  
   208  /* Adds a new BytesRef. */
   209  func (h *BytesRefHash) Add(bytes []byte) (int, error) {
   210  	assert2(h.bytesStart != nil, "Bytesstart is null - not initialized")
   211  	length := len(bytes)
   212  	// final position
   213  	hashPos := h.findHash(bytes)
   214  	e := h.ids[hashPos]
   215  
   216  	if e == -1 {
   217  		// new entry
   218  		if len2 := 2 + len(bytes); len2+h.pool.ByteUpto > BYTE_BLOCK_SIZE {
   219  			if len2 > BYTE_BLOCK_SIZE {
   220  				return 0, MaxBytesLengthExceededError(fmt.Sprintf(
   221  					"bytes can be at most %v in length; got %v",
   222  					BYTE_BLOCK_SIZE-2, len(bytes)))
   223  			}
   224  			h.pool.NextBuffer()
   225  		}
   226  		buffer := h.pool.Buffer
   227  		bufferUpto := h.pool.ByteUpto
   228  		if h.count >= len(h.bytesStart) {
   229  			h.bytesStart = h.bytesStartArray.Grow()
   230  			assert2(h.count < len(h.bytesStart)+1, "count: %v len: %v", h.count, len(h.bytesStart))
   231  		}
   232  		e = h.count
   233  		h.count++
   234  
   235  		h.bytesStart[e] = bufferUpto + h.pool.ByteOffset
   236  
   237  		// We first encode the length, followed by the bytes. Length is
   238  		// encoded as vint, but will consume 1 or 2 bytes at most (we
   239  		// reject too-long terms, above).
   240  		if length < 128 {
   241  			// 1 byte to store length
   242  			buffer[bufferUpto] = byte(length)
   243  			h.pool.ByteUpto += length + 1
   244  			assert2(length >= 0, "Length must be positive: %v", length)
   245  			copy(buffer[bufferUpto+1:], bytes)
   246  		} else {
   247  			// 2 bytes to store length
   248  			buffer[bufferUpto] = byte(0x80 | (length & 0x7f))
   249  			buffer[bufferUpto+1] = byte((length >> 7) & 0xff)
   250  			h.pool.ByteUpto += length + 2
   251  			copy(buffer[bufferUpto+2:], bytes)
   252  		}
   253  		assert(h.ids[hashPos] == -1)
   254  		h.ids[hashPos] = e
   255  
   256  		if h.count == h.hashHalfSize {
   257  			h.rehash(2*h.hashSize, true)
   258  		}
   259  		return e, nil
   260  	}
   261  	return -(e + 1), nil
   262  }
   263  
   264  func (h *BytesRefHash) findHash(bytes []byte) int {
   265  	assert2(h.bytesStart != nil, "bytesStart is null - not initialized")
   266  	code := h.doHash(bytes)
   267  	// final position
   268  	hashPos := code & h.hashMask
   269  	if e := h.ids[hashPos]; e != -1 && !h.equals(e, bytes) {
   270  		// conflict; use linear probe to find an open slot
   271  		// (see LUCENE-5604):
   272  		for {
   273  			code++
   274  			hashPos = code & h.hashMask
   275  			e = h.ids[hashPos]
   276  			if e == -1 || h.equals(e, bytes) {
   277  				break
   278  			}
   279  		}
   280  	}
   281  	return hashPos
   282  }
   283  
   284  /* Called when has is too small (> 50% occupied) or too large (< 20% occupied). */
   285  func (h *BytesRefHash) rehash(newSize int, hashOnData bool) {
   286  	newMask := newSize - 1
   287  	h.bytesUsed.AddAndGet(NUM_BYTES_INT * int64(newSize))
   288  	newHash := make([]int, newSize)
   289  	for i, _ := range newHash {
   290  		newHash[i] = -1
   291  	}
   292  	for i := 0; i < h.hashSize; i++ {
   293  		if e0 := h.ids[i]; e0 != -1 {
   294  			var code int
   295  			if hashOnData {
   296  				off := h.bytesStart[e0]
   297  				start := off & BYTE_BLOCK_MASK
   298  				bytes := h.pool.Buffers[off>>BYTE_BLOCK_SHIFT]
   299  				var length int
   300  				var pos int
   301  				if bytes[start]&0x80 == 0 {
   302  					// length is 1 byte
   303  					length = int(bytes[start])
   304  					pos = start + 1
   305  				} else {
   306  					length = int(bytes[start]&0x7f) + (int(bytes[start+1]&0xff) << 7)
   307  					pos = start + 2
   308  				}
   309  				code = h.doHash(bytes[pos : pos+length])
   310  			} else {
   311  				code = h.bytesStart[e0]
   312  			}
   313  
   314  			hashPos := code & newMask
   315  			assert(hashPos >= 0)
   316  			if newHash[hashPos] != -1 {
   317  				// conflict; use linear probe to find an open slot
   318  				// (see LUCENE-5604)
   319  				for {
   320  					code++
   321  					hashPos = code & newMask
   322  					if newHash[hashPos] == -1 {
   323  						break
   324  					}
   325  				}
   326  			}
   327  			assert(newHash[hashPos] == -1)
   328  			newHash[hashPos] = e0
   329  		}
   330  	}
   331  
   332  	h.hashMask = newMask
   333  	h.bytesUsed.AddAndGet(NUM_BYTES_INT * int64(-len(h.ids)))
   334  	h.ids = newHash
   335  	h.hashSize = newSize
   336  	h.hashHalfSize = newSize / 2
   337  }
   338  
   339  func (h *BytesRefHash) doHash(p []byte) int {
   340  	return int(MurmurHash3_x86_32(p, GOOD_FAST_HASH_SEED))
   341  }
   342  
   343  /*
   344  reinitializes the BytesRefHash after a previous clear() call. If
   345  clear() has not been called previously this method has no effect.
   346  */
   347  func (h *BytesRefHash) Reinit() {
   348  	if h.bytesStart == nil {
   349  		h.bytesStart = h.bytesStartArray.Init()
   350  	}
   351  	if h.ids == nil {
   352  		h.ids = make([]int, h.hashSize)
   353  		h.bytesUsed.AddAndGet(NUM_BYTES_INT * int64(h.hashSize))
   354  	}
   355  }
   356  
   357  /*
   358  Returns the bytesStart offset into the internally used ByteBlockPool
   359  for the given bytesID.
   360  */
   361  func (h *BytesRefHash) ByteStart(bytesId int) int {
   362  	assert2(h.bytesStart != nil, "bytesStart is null - not initialized")
   363  	assert2(bytesId >= 0 && bytesId <= h.count, "%v", bytesId)
   364  	return h.bytesStart[bytesId]
   365  }
   366  
/*
BytesStartArray manages allocation of the per-entry start offset array
used by BytesRefHash. BytesRefHash replaces its own array with the
slice returned from each of Init, Grow and Clear.
*/
type BytesStartArray interface {
	// Init initializes the BytesStartArray and returns the array to
	// use. This call will allocate memory.
	Init() []int
	// BytesUsed returns a Counter reference holding the number of bytes
	// used by this BytesStartArray. The BytesRefHash uses this
	// reference to track its memory usage. It may return nil, in which
	// case BytesRefHash creates its own Counter.
	BytesUsed() Counter
	// Grow enlarges the BytesStartArray and returns the grown array.
	Grow() []int
	// Clear clears the BytesStartArray and returns the cleared
	// instance. NOTE(review): the result is presumably allowed to be
	// nil, since Reinit() checks for a nil array - confirm against the
	// implementations.
	Clear() []int
}