github.com/apache/arrow/go/v14@v14.0.1/parquet/internal/encoding/memo_table.go (about) 1 // Licensed to the Apache Software Foundation (ASF) under one 2 // or more contributor license agreements. See the NOTICE file 3 // distributed with this work for additional information 4 // regarding copyright ownership. The ASF licenses this file 5 // to you under the Apache License, Version 2.0 (the 6 // "License"); you may not use this file except in compliance 7 // with the License. You may obtain a copy of the License at 8 // 9 // http://www.apache.org/licenses/LICENSE-2.0 10 // 11 // Unless required by applicable law or agreed to in writing, software 12 // distributed under the License is distributed on an "AS IS" BASIS, 13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 // See the License for the specific language governing permissions and 15 // limitations under the License. 16 17 package encoding 18 19 import ( 20 "math" 21 "unsafe" 22 23 "github.com/apache/arrow/go/v14/arrow" 24 "github.com/apache/arrow/go/v14/arrow/array" 25 "github.com/apache/arrow/go/v14/arrow/memory" 26 "github.com/apache/arrow/go/v14/internal/hashing" 27 "github.com/apache/arrow/go/v14/parquet" 28 ) 29 30 //go:generate go run ../../../arrow/_tools/tmpl/main.go -i -data=physical_types.tmpldata memo_table_types.gen.go.tmpl 31 32 // MemoTable interface that can be used to swap out implementations of the hash table 33 // used for handling dictionary encoding. Dictionary encoding is built against this interface 34 // to make it easy for code generation and changing implementations. 35 // 36 // Values should remember the order they are inserted to generate a valid dictionary index 37 type MemoTable interface { 38 // Reset drops everything in the table allowing it to be reused 39 Reset() 40 // Size returns the current number of unique values stored in the table 41 // including whether or not a null value has been passed in using GetOrInsertNull 42 Size() int 43 // CopyValues populates out with the values currently in the table, out must 44 // be a slice of the appropriate type for the table type. 45 CopyValues(out interface{}) 46 // CopyValuesSubset is like CopyValues but only copies a subset of values starting 47 // at the indicated index. 48 CopyValuesSubset(start int, out interface{}) 49 50 WriteOut(out []byte) 51 WriteOutSubset(start int, out []byte) 52 // Get returns the index of the table the specified value is, and a boolean indicating 53 // whether or not the value was found in the table. Will panic if val is not the appropriate 54 // type for the underlying table. 55 Get(val interface{}) (int, bool) 56 // GetOrInsert is the same as Get, except if the value is not currently in the table it will 57 // be inserted into the table. 58 GetOrInsert(val interface{}) (idx int, existed bool, err error) 59 // GetNull returns the index of the null value and whether or not it was found in the table 60 GetNull() (int, bool) 61 // GetOrInsertNull returns the index of the null value, if it didn't already exist in the table, 62 // it is inserted. 63 GetOrInsertNull() (idx int, existed bool) 64 } 65 66 type NumericMemoTable interface { 67 MemoTable 68 // WriteOutLE writes the contents of the memo table out to the byteslice 69 // but ensures the values are little-endian before writing them (converting 70 // if on a big endian system). 71 WriteOutLE(out []byte) 72 // WriteOutSubsetLE writes the contents of the memo table out to the byteslice 73 // starting with the index indicated by start, but ensures the values are little 74 // endian before writing them (converting if on a big-endian system). 75 WriteOutSubsetLE(start int, out []byte) 76 } 77 78 // BinaryMemoTable is an extension of the MemoTable interface adding extra methods 79 // for handling byte arrays/strings/fixed length byte arrays. 80 type BinaryMemoTable interface { 81 MemoTable 82 // ValuesSize returns the total number of bytes needed to copy all of the values 83 // from this table. 84 ValuesSize() int 85 // CopyOffsets populates out with the start and end offsets of each value in the 86 // table data. Out should be sized to Size()+1 to accomodate all of the offsets. 87 CopyOffsets(out []int32) 88 // CopyOffsetsSubset is like CopyOffsets but only gets a subset of the offsets 89 // starting at the specified index. 90 CopyOffsetsSubset(start int, out []int32) 91 // CopyFixedWidthValues exists to cope with the fact that the table doesn't track 92 // the fixed width when inserting the null value into the databuffer populating 93 // a zero length byte slice for the null value (if found). 94 CopyFixedWidthValues(start int, width int, out []byte) 95 // VisitValues calls visitFn on each value in the table starting with the index specified 96 VisitValues(start int, visitFn func([]byte)) 97 // Retain increases the reference count of the separately stored binary data that is 98 // kept alongside the table which contains all of the values in the table. This is 99 // safe to call simultaneously across multiple goroutines. 100 Retain() 101 // Release decreases the reference count by 1 of the separately stored binary data 102 // kept alongside the table containing the values. When the reference count goes to 103 // 0, the memory is freed. This is safe to call across multiple goroutines simultaneoulsy. 104 Release() 105 } 106 107 // NewInt32Dictionary returns a memotable interface for use with Int32 values only 108 func NewInt32Dictionary() MemoTable { 109 return hashing.NewInt32MemoTable(0) 110 } 111 112 // NewInt64Dictionary returns a memotable interface for use with Int64 values only 113 func NewInt64Dictionary() MemoTable { 114 return hashing.NewInt64MemoTable(0) 115 } 116 117 // NewFloat32Dictionary returns a memotable interface for use with Float32 values only 118 func NewFloat32Dictionary() MemoTable { 119 return hashing.NewFloat32MemoTable(0) 120 } 121 122 // NewFloat64Dictionary returns a memotable interface for use with Float64 values only 123 func NewFloat64Dictionary() MemoTable { 124 return hashing.NewFloat64MemoTable(0) 125 } 126 127 // NewBinaryDictionary returns a memotable interface for use with strings, byte slices, 128 // parquet.ByteArray and parquet.FixedLengthByteArray only. 129 func NewBinaryDictionary(mem memory.Allocator) BinaryMemoTable { 130 return hashing.NewBinaryMemoTable(0, -1, array.NewBinaryBuilder(mem, arrow.BinaryTypes.Binary)) 131 } 132 133 const keyNotFound = hashing.KeyNotFound 134 135 // standard map based implementation of a binary memotable which is only kept around 136 // currently to be used as a benchmark against the memotables in the internal/hashing 137 // module as a baseline comparison. 138 139 func NewBinaryMemoTable(mem memory.Allocator) BinaryMemoTable { 140 return &binaryMemoTableImpl{ 141 table: make(map[string]int), 142 nullIndex: keyNotFound, 143 builder: array.NewBinaryBuilder(mem, arrow.BinaryTypes.Binary), 144 } 145 } 146 147 type binaryMemoTableImpl struct { 148 table map[string]int 149 builder *array.BinaryBuilder 150 nullIndex int 151 } 152 153 func (m *binaryMemoTableImpl) Reset() { 154 m.table = make(map[string]int) 155 m.nullIndex = keyNotFound 156 m.builder.NewArray().Release() 157 } 158 159 func (m *binaryMemoTableImpl) CopyValues(out interface{}) { 160 m.CopyValuesSubset(0, out) 161 } 162 163 func (m *binaryMemoTableImpl) GetNull() (int, bool) { 164 return m.nullIndex, m.nullIndex != keyNotFound 165 } 166 167 func (m *binaryMemoTableImpl) ValuesSize() int { 168 return m.builder.DataLen() 169 } 170 171 func (m *binaryMemoTableImpl) Size() int { 172 sz := len(m.table) 173 if _, ok := m.GetNull(); ok { 174 sz++ 175 } 176 return sz 177 } 178 179 func (m *binaryMemoTableImpl) valAsString(val interface{}) string { 180 switch v := val.(type) { 181 case string: 182 return v 183 case []byte: 184 return *(*string)(unsafe.Pointer(&v)) 185 case parquet.ByteArray: 186 return *(*string)(unsafe.Pointer(&v)) 187 case parquet.FixedLenByteArray: 188 return *(*string)(unsafe.Pointer(&v)) 189 default: 190 panic("invalid type for value in binarymemotable") 191 } 192 } 193 194 func (m *binaryMemoTableImpl) Get(val interface{}) (int, bool) { 195 key := m.valAsString(val) 196 if p, ok := m.table[key]; ok { 197 return p, true 198 } 199 return keyNotFound, false 200 } 201 202 func (m *binaryMemoTableImpl) GetOrInsert(val interface{}) (idx int, found bool, err error) { 203 key := m.valAsString(val) 204 idx, found = m.table[key] 205 if !found { 206 idx = m.Size() 207 m.builder.AppendString(key) 208 m.table[key] = idx 209 } 210 return 211 } 212 213 func (m *binaryMemoTableImpl) GetOrInsertNull() (idx int, found bool) { 214 idx, found = m.GetNull() 215 if !found { 216 idx = m.Size() 217 m.nullIndex = idx 218 m.builder.AppendNull() 219 } 220 return 221 } 222 223 func (m *binaryMemoTableImpl) findOffset(idx int) uintptr { 224 val := m.builder.Value(idx) 225 for len(val) == 0 { 226 idx++ 227 if idx >= m.builder.Len() { 228 break 229 } 230 val = m.builder.Value(idx) 231 } 232 if len(val) != 0 { 233 return uintptr(unsafe.Pointer(&val[0])) 234 } 235 return uintptr(m.builder.DataLen()) + m.findOffset(0) 236 } 237 238 func (m *binaryMemoTableImpl) CopyValuesSubset(start int, out interface{}) { 239 var ( 240 first = m.findOffset(0) 241 offset = m.findOffset(int(start)) 242 length = m.builder.DataLen() - int(offset-first) 243 ) 244 245 outval := out.([]byte) 246 copy(outval, m.builder.Value(start)[0:length]) 247 } 248 249 func (m *binaryMemoTableImpl) WriteOut(out []byte) { 250 m.CopyValues(out) 251 } 252 253 func (m *binaryMemoTableImpl) WriteOutSubset(start int, out []byte) { 254 m.CopyValuesSubset(start, out) 255 } 256 257 func (m *binaryMemoTableImpl) CopyFixedWidthValues(start, width int, out []byte) { 258 259 } 260 261 func (m *binaryMemoTableImpl) CopyOffsetsSubset(start int, out []int32) { 262 if m.builder.Len() <= start { 263 return 264 } 265 266 first := m.findOffset(0) 267 delta := m.findOffset(start) 268 for i := start; i < m.Size(); i++ { 269 offset := int32(m.findOffset(i) - delta) 270 out[i-start] = offset 271 } 272 273 out[m.Size()-start] = int32(m.builder.DataLen() - int(delta) - int(first)) 274 } 275 276 func (m *binaryMemoTableImpl) CopyOffsets(out []int32) { 277 m.CopyOffsetsSubset(0, out) 278 } 279 280 func (m *binaryMemoTableImpl) VisitValues(start int, visitFn func([]byte)) { 281 for i := int(start); i < m.Size(); i++ { 282 visitFn(m.builder.Value(i)) 283 } 284 } 285 286 func (m *binaryMemoTableImpl) Release() { 287 m.builder.Release() 288 } 289 290 func (m *binaryMemoTableImpl) Retain() { 291 m.builder.Retain() 292 } 293 294 // standard map based implementation of a float64 memotable which is only kept around 295 // currently to be used as a benchmark against the memotables in the internal/hashing 296 // module as a baseline comparison. 297 298 func NewFloat64MemoTable(memory.Allocator) MemoTable { 299 return &float64MemoTableImpl{ 300 table: make(map[float64]struct { 301 value float64 302 memoIndex int 303 }), 304 nullIndex: keyNotFound, 305 nanIndex: keyNotFound, 306 } 307 } 308 309 type float64MemoTableImpl struct { 310 table map[float64]struct { 311 value float64 312 memoIndex int 313 } 314 nullIndex int 315 nanIndex int 316 } 317 318 func (m *float64MemoTableImpl) Reset() { 319 m.table = make(map[float64]struct { 320 value float64 321 memoIndex int 322 }) 323 m.nullIndex = keyNotFound 324 m.nanIndex = keyNotFound 325 } 326 327 func (m *float64MemoTableImpl) GetNull() (int, bool) { 328 return m.nullIndex, m.nullIndex != keyNotFound 329 } 330 331 func (m *float64MemoTableImpl) Size() int { 332 sz := len(m.table) 333 if _, ok := m.GetNull(); ok { 334 sz++ 335 } 336 if m.nanIndex != keyNotFound { 337 sz++ 338 } 339 return sz 340 } 341 342 func (m *float64MemoTableImpl) GetOrInsertNull() (idx int, found bool) { 343 idx, found = m.GetNull() 344 if !found { 345 idx = m.Size() 346 m.nullIndex = idx 347 } 348 return 349 } 350 351 func (m *float64MemoTableImpl) Get(val interface{}) (int, bool) { 352 v := val.(float64) 353 if p, ok := m.table[v]; ok { 354 return p.memoIndex, true 355 } 356 if math.IsNaN(v) && m.nanIndex != keyNotFound { 357 return m.nanIndex, true 358 } 359 return keyNotFound, false 360 } 361 362 func (m *float64MemoTableImpl) GetOrInsert(val interface{}) (idx int, found bool, err error) { 363 v := val.(float64) 364 if math.IsNaN(v) { 365 if m.nanIndex == keyNotFound { 366 idx = m.Size() 367 m.nanIndex = idx 368 } else { 369 idx = m.nanIndex 370 found = true 371 } 372 return 373 } 374 375 p, ok := m.table[v] 376 if ok { 377 idx = p.memoIndex 378 } else { 379 idx = m.Size() 380 p.value = v 381 p.memoIndex = idx 382 m.table[v] = p 383 found = true 384 } 385 return 386 } 387 388 func (m *float64MemoTableImpl) CopyValues(out interface{}) { 389 m.CopyValuesSubset(0, out) 390 } 391 392 func (m *float64MemoTableImpl) CopyValuesSubset(start int, out interface{}) { 393 outval := out.([]float64) 394 for _, v := range m.table { 395 idx := v.memoIndex - start 396 if idx >= 0 { 397 outval[idx] = v.value 398 } 399 } 400 if m.nanIndex != keyNotFound { 401 outval[m.nanIndex] = math.NaN() 402 } 403 } 404 405 func (m *float64MemoTableImpl) WriteOut(out []byte) { 406 m.CopyValuesSubset(0, arrow.Float64Traits.CastFromBytes(out)) 407 } 408 409 func (m *float64MemoTableImpl) WriteOutSubset(start int, out []byte) { 410 m.CopyValuesSubset(start, arrow.Float64Traits.CastFromBytes(out)) 411 }