github.com/balzaczyy/golucene@v0.0.0-20151210033525-d0be9ee89713/core/util/bytesRefHash.go (about) 1 package util 2 3 import ( 4 "fmt" 5 ) 6 7 /* 8 BytesRefHash is a special purpose hash map like data structure 9 optimized for BytesRef instances. BytesRefHash maintains mappings of 10 byte arrays to ids (map[[]byte]int) sorting the hashed bytes 11 efficiently in continuous storage. The mapping to the id is 12 encapsulated inside BytesRefHash and is guaranteed to be increased 13 for each added BytesRef. 14 15 Note: The maximum capacity BytesRef instance passed to add() must not 16 be longer than BYTE_BLOCK_SIZE-2. The internal storage is limited to 17 2GB total byte storage. 18 */ 19 type BytesRefHash struct { 20 pool *ByteBlockPool 21 bytesStart []int 22 23 scratch1 *BytesRef 24 hashSize int 25 hashHalfSize int 26 hashMask int 27 count int 28 lastCount int 29 ids []int 30 bytesStartArray BytesStartArray 31 bytesUsed Counter 32 } 33 34 func NewBytesRefHash(pool *ByteBlockPool, capacity int, 35 bytesStartArray BytesStartArray) *BytesRefHash { 36 ids := make([]int, capacity) 37 for i, _ := range ids { 38 ids[i] = -1 39 } 40 counter := bytesStartArray.BytesUsed() 41 if counter == nil { 42 counter = NewCounter() 43 } 44 counter.AddAndGet(int64(capacity) * NUM_BYTES_INT) 45 return &BytesRefHash{ 46 scratch1: NewEmptyBytesRef(), 47 hashSize: capacity, 48 hashHalfSize: capacity >> 1, 49 hashMask: capacity - 1, 50 lastCount: -1, 51 pool: pool, 52 ids: ids, 53 bytesStartArray: bytesStartArray, 54 bytesStart: bytesStartArray.Init(), 55 bytesUsed: counter, 56 } 57 } 58 59 /* Returns the number of values in this hash. */ 60 func (h *BytesRefHash) Size() int { 61 return h.count 62 } 63 64 /* 65 Returns the ids array in arbitrary order. Valid ids start at offset 66 of 0 and end at a limit of size() - 1 67 68 Note: This is a destructive operation. clear() must be called in 69 order to reuse this BytesRefHash instance. 70 */ 71 func (h *BytesRefHash) compact() []int { 72 assert2(h.bytesStart != nil, "bytesStart is nil - not initialized") 73 upto := 0 74 for i := 0; i < h.hashSize; i++ { 75 if h.ids[i] != -1 { 76 if upto < i { 77 h.ids[upto] = h.ids[i] 78 h.ids[i] = -1 79 } 80 upto++ 81 } 82 } 83 84 assert(upto == h.count) 85 h.lastCount = h.count 86 return h.ids 87 } 88 89 type bytesRefIntroSorter struct { 90 *IntroSorter 91 owner *BytesRefHash 92 compact []int 93 comp func([]byte, []byte) bool 94 pivot *BytesRef 95 scratch1 *BytesRef 96 scratch2 *BytesRef 97 } 98 99 func newBytesRefIntroSorter(owner *BytesRefHash, v []int, 100 comp func([]byte, []byte) bool) *bytesRefIntroSorter { 101 ans := &bytesRefIntroSorter{ 102 owner: owner, 103 compact: v, 104 comp: comp, 105 pivot: NewEmptyBytesRef(), 106 scratch1: NewEmptyBytesRef(), 107 scratch2: NewEmptyBytesRef(), 108 } 109 ans.IntroSorter = NewIntroSorter(ans, ans) 110 return ans 111 } 112 113 func (a *bytesRefIntroSorter) Len() int { return len(a.compact) } 114 func (a *bytesRefIntroSorter) Swap(i, j int) { a.compact[i], a.compact[j] = a.compact[j], a.compact[i] } 115 func (a *bytesRefIntroSorter) Less(i, j int) bool { 116 id1, id2 := a.compact[i], a.compact[j] 117 assert(len(a.owner.bytesStart) > id1 && len(a.owner.bytesStart) > id2) 118 a.owner.pool.SetBytesRef(a.scratch1, a.owner.bytesStart[id1]) 119 a.owner.pool.SetBytesRef(a.scratch2, a.owner.bytesStart[id2]) 120 return a.comp(a.scratch1.ToBytes(), a.scratch2.ToBytes()) 121 } 122 123 func (a *bytesRefIntroSorter) SetPivot(i int) { 124 id := a.compact[i] 125 assert(len(a.owner.bytesStart) > id) 126 a.owner.pool.SetBytesRef(a.pivot, a.owner.bytesStart[id]) 127 } 128 129 func (a *bytesRefIntroSorter) PivotLess(j int) bool { 130 id := a.compact[j] 131 assert(len(a.owner.bytesStart) > id) 132 a.owner.pool.SetBytesRef(a.scratch2, a.owner.bytesStart[id]) 133 return a.comp(a.pivot.ToBytes(), a.scratch2.ToBytes()) 134 } 135 136 /* 137 Returns the values array sorted by the referenced byte values. 138 139 Note: this is a destructive operation. clear() must be called in 140 order to reuse this BytesRefHash instance. 141 */ 142 func (h *BytesRefHash) Sort(comp func(a, b []byte) bool) []int { 143 compact := h.compact() 144 s := newBytesRefIntroSorter(h, compact, comp) 145 s.Sort(0, h.count) 146 // TODO remove this 147 // for i, _ := range compact { 148 // if compact[i+1] == -1 { 149 // break 150 // } 151 // assert(!s.Less(i+1, i)) 152 // if ok := !s.Less(i+1, i); !ok { 153 // fmt.Println("DEBUG1", compact) 154 // assert(ok) 155 // } 156 // } 157 return compact 158 } 159 160 func (h *BytesRefHash) equals(id int, b []byte) bool { 161 h.pool.SetBytesRef(h.scratch1, h.bytesStart[id]) 162 return h.scratch1.bytesEquals(b) 163 } 164 165 func (h *BytesRefHash) shrink(targetSize int) bool { 166 // Cannot use util.Shrink because we require power of 2: 167 newSize := h.hashSize 168 for newSize >= 8 && newSize/4 > targetSize { 169 newSize /= 2 170 } 171 if newSize != h.hashSize { 172 h.bytesUsed.AddAndGet(NUM_BYTES_INT * -int64(h.hashSize-newSize)) 173 h.hashSize = newSize 174 h.ids = make([]int, h.hashSize) 175 for i, _ := range h.ids { 176 h.ids[i] = -1 177 } 178 h.hashHalfSize = newSize / 2 179 h.hashMask = newSize - 1 180 return true 181 } 182 return false 183 } 184 185 /* Clears the BytesRef which maps to the given BytesRef */ 186 func (h *BytesRefHash) Clear(resetPool bool) { 187 h.lastCount = h.count 188 h.count = 0 189 if resetPool { 190 h.pool.Reset(false, false) // we don't need to 0-fill the bufferes 191 } 192 h.bytesStart = h.bytesStartArray.Clear() 193 if h.lastCount != -1 && h.shrink(h.lastCount) { 194 // shurnk clears the hash entries 195 return 196 } 197 for i, _ := range h.ids { 198 h.ids[i] = -1 199 } 200 } 201 202 type MaxBytesLengthExceededError string 203 204 func (e MaxBytesLengthExceededError) Error() string { 205 return string(e) 206 } 207 208 /* Adds a new BytesRef. */ 209 func (h *BytesRefHash) Add(bytes []byte) (int, error) { 210 assert2(h.bytesStart != nil, "Bytesstart is null - not initialized") 211 length := len(bytes) 212 // final position 213 hashPos := h.findHash(bytes) 214 e := h.ids[hashPos] 215 216 if e == -1 { 217 // new entry 218 if len2 := 2 + len(bytes); len2+h.pool.ByteUpto > BYTE_BLOCK_SIZE { 219 if len2 > BYTE_BLOCK_SIZE { 220 return 0, MaxBytesLengthExceededError(fmt.Sprintf( 221 "bytes can be at most %v in length; got %v", 222 BYTE_BLOCK_SIZE-2, len(bytes))) 223 } 224 h.pool.NextBuffer() 225 } 226 buffer := h.pool.Buffer 227 bufferUpto := h.pool.ByteUpto 228 if h.count >= len(h.bytesStart) { 229 h.bytesStart = h.bytesStartArray.Grow() 230 assert2(h.count < len(h.bytesStart)+1, "count: %v len: %v", h.count, len(h.bytesStart)) 231 } 232 e = h.count 233 h.count++ 234 235 h.bytesStart[e] = bufferUpto + h.pool.ByteOffset 236 237 // We first encode the length, followed by the bytes. Length is 238 // encoded as vint, but will consume 1 or 2 bytes at most (we 239 // reject too-long terms, above). 240 if length < 128 { 241 // 1 byte to store length 242 buffer[bufferUpto] = byte(length) 243 h.pool.ByteUpto += length + 1 244 assert2(length >= 0, "Length must be positive: %v", length) 245 copy(buffer[bufferUpto+1:], bytes) 246 } else { 247 // 2 bytes to store length 248 buffer[bufferUpto] = byte(0x80 | (length & 0x7f)) 249 buffer[bufferUpto+1] = byte((length >> 7) & 0xff) 250 h.pool.ByteUpto += length + 2 251 copy(buffer[bufferUpto+2:], bytes) 252 } 253 assert(h.ids[hashPos] == -1) 254 h.ids[hashPos] = e 255 256 if h.count == h.hashHalfSize { 257 h.rehash(2*h.hashSize, true) 258 } 259 return e, nil 260 } 261 return -(e + 1), nil 262 } 263 264 func (h *BytesRefHash) findHash(bytes []byte) int { 265 assert2(h.bytesStart != nil, "bytesStart is null - not initialized") 266 code := h.doHash(bytes) 267 // final position 268 hashPos := code & h.hashMask 269 if e := h.ids[hashPos]; e != -1 && !h.equals(e, bytes) { 270 // conflict; use linear probe to find an open slot 271 // (see LUCENE-5604): 272 for { 273 code++ 274 hashPos = code & h.hashMask 275 e = h.ids[hashPos] 276 if e == -1 || h.equals(e, bytes) { 277 break 278 } 279 } 280 } 281 return hashPos 282 } 283 284 /* Called when has is too small (> 50% occupied) or too large (< 20% occupied). */ 285 func (h *BytesRefHash) rehash(newSize int, hashOnData bool) { 286 newMask := newSize - 1 287 h.bytesUsed.AddAndGet(NUM_BYTES_INT * int64(newSize)) 288 newHash := make([]int, newSize) 289 for i, _ := range newHash { 290 newHash[i] = -1 291 } 292 for i := 0; i < h.hashSize; i++ { 293 if e0 := h.ids[i]; e0 != -1 { 294 var code int 295 if hashOnData { 296 off := h.bytesStart[e0] 297 start := off & BYTE_BLOCK_MASK 298 bytes := h.pool.Buffers[off>>BYTE_BLOCK_SHIFT] 299 var length int 300 var pos int 301 if bytes[start]&0x80 == 0 { 302 // length is 1 byte 303 length = int(bytes[start]) 304 pos = start + 1 305 } else { 306 length = int(bytes[start]&0x7f) + (int(bytes[start+1]&0xff) << 7) 307 pos = start + 2 308 } 309 code = h.doHash(bytes[pos : pos+length]) 310 } else { 311 code = h.bytesStart[e0] 312 } 313 314 hashPos := code & newMask 315 assert(hashPos >= 0) 316 if newHash[hashPos] != -1 { 317 // conflict; use linear probe to find an open slot 318 // (see LUCENE-5604) 319 for { 320 code++ 321 hashPos = code & newMask 322 if newHash[hashPos] == -1 { 323 break 324 } 325 } 326 } 327 assert(newHash[hashPos] == -1) 328 newHash[hashPos] = e0 329 } 330 } 331 332 h.hashMask = newMask 333 h.bytesUsed.AddAndGet(NUM_BYTES_INT * int64(-len(h.ids))) 334 h.ids = newHash 335 h.hashSize = newSize 336 h.hashHalfSize = newSize / 2 337 } 338 339 func (h *BytesRefHash) doHash(p []byte) int { 340 return int(MurmurHash3_x86_32(p, GOOD_FAST_HASH_SEED)) 341 } 342 343 /* 344 reinitializes the BytesRefHash after a previous clear() call. If 345 clear() has not been called previously this method has no effect. 346 */ 347 func (h *BytesRefHash) Reinit() { 348 if h.bytesStart == nil { 349 h.bytesStart = h.bytesStartArray.Init() 350 } 351 if h.ids == nil { 352 h.ids = make([]int, h.hashSize) 353 h.bytesUsed.AddAndGet(NUM_BYTES_INT * int64(h.hashSize)) 354 } 355 } 356 357 /* 358 Returns the bytesStart offset into the internally used ByteBlockPool 359 for the given bytesID. 360 */ 361 func (h *BytesRefHash) ByteStart(bytesId int) int { 362 assert2(h.bytesStart != nil, "bytesStart is null - not initialized") 363 assert2(bytesId >= 0 && bytesId <= h.count, "%v", bytesId) 364 return h.bytesStart[bytesId] 365 } 366 367 /* Manages allocation of per-term addresses. */ 368 type BytesStartArray interface { 369 // Initializes the BytesStartArray. This call will allocate memory 370 Init() []int 371 // A Counter reference holding the number of bytes used by this 372 // BytesStartArray. The BytesRefHash uses this reference to track 373 // its memory usage 374 BytesUsed() Counter 375 // Grows the BytesStartArray 376 Grow() []int 377 // clears the BytesStartArray and returns the cleared instance. 378 Clear() []int 379 }