github.com/apache/arrow/go/v7@v7.0.1/parquet/internal/encoding/memo_table.go (about) 1 // Licensed to the Apache Software Foundation (ASF) under one 2 // or more contributor license agreements. See the NOTICE file 3 // distributed with this work for additional information 4 // regarding copyright ownership. The ASF licenses this file 5 // to you under the Apache License, Version 2.0 (the 6 // "License"); you may not use this file except in compliance 7 // with the License. You may obtain a copy of the License at 8 // 9 // http://www.apache.org/licenses/LICENSE-2.0 10 // 11 // Unless required by applicable law or agreed to in writing, software 12 // distributed under the License is distributed on an "AS IS" BASIS, 13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 // See the License for the specific language governing permissions and 15 // limitations under the License. 16 17 package encoding 18 19 import ( 20 "math" 21 "unsafe" 22 23 "github.com/apache/arrow/go/v7/arrow" 24 "github.com/apache/arrow/go/v7/arrow/array" 25 "github.com/apache/arrow/go/v7/arrow/memory" 26 "github.com/apache/arrow/go/v7/parquet" 27 "github.com/apache/arrow/go/v7/parquet/internal/hashing" 28 ) 29 30 //go:generate go run ../../../arrow/_tools/tmpl/main.go -i -data=physical_types.tmpldata memo_table_types.gen.go.tmpl 31 32 // MemoTable interface that can be used to swap out implementations of the hash table 33 // used for handling dictionary encoding. Dictionary encoding is built against this interface 34 // to make it easy for code generation and changing implementations. 
//
// Values should remember the order they are inserted to generate a valid dictionary index
type MemoTable interface {
	// Reset drops everything in the table allowing it to be reused
	Reset()
	// Size returns the current number of unique values stored in the table
	// including whether or not a null value has been passed in using GetOrInsertNull
	Size() int
	// CopyValues populates out with the values currently in the table, out must
	// be a slice of the appropriate type for the table type.
	CopyValues(out interface{})
	// CopyValuesSubset is like CopyValues but only copies a subset of values starting
	// at the indicated index.
	CopyValuesSubset(start int, out interface{})

	// WriteOut copies the values currently in the table into out as raw bytes.
	WriteOut(out []byte)
	// WriteOutSubset is like WriteOut but only copies the values starting at the
	// indicated index.
	WriteOutSubset(start int, out []byte)
	// Get returns the index of the table the specified value is, and a boolean indicating
	// whether or not the value was found in the table. Will panic if val is not the appropriate
	// type for the underlying table.
	Get(val interface{}) (int, bool)
	// GetOrInsert is the same as Get, except if the value is not currently in the table it will
	// be inserted into the table.
	GetOrInsert(val interface{}) (idx int, existed bool, err error)
	// GetNull returns the index of the null value and whether or not it was found in the table
	GetNull() (int, bool)
	// GetOrInsertNull returns the index of the null value, if it didn't already exist in the table,
	// it is inserted.
	GetOrInsertNull() (idx int, existed bool)
}

// BinaryMemoTable is an extension of the MemoTable interface adding extra methods
// for handling byte arrays/strings/fixed length byte arrays.
type BinaryMemoTable interface {
	MemoTable
	// ValuesSize returns the total number of bytes needed to copy all of the values
	// from this table.
	ValuesSize() int
	// CopyOffsets populates out with the start and end offsets of each value in the
	// table data. Out should be sized to Size()+1 to accommodate all of the offsets.
	//
	// NOTE(review): an int8 element can only represent offsets up to 127 bytes;
	// confirm this element type matches the implementations returned by the
	// hashing package before relying on it for larger dictionaries.
	CopyOffsets(out []int8)
	// CopyOffsetsSubset is like CopyOffsets but only gets a subset of the offsets
	// starting at the specified index.
	CopyOffsetsSubset(start int, out []int8)
	// CopyFixedWidthValues exists to cope with the fact that the table doesn't track
	// the fixed width when inserting the null value into the databuffer populating
	// a zero length byte slice for the null value (if found).
	CopyFixedWidthValues(start int, width int, out []byte)
	// VisitValues calls visitFn on each value in the table starting with the index specified
	VisitValues(start int, visitFn func([]byte))
	// Retain increases the reference count of the separately stored binary data that is
	// kept alongside the table which contains all of the values in the table. This is
	// safe to call simultaneously across multiple goroutines.
	Retain()
	// Release decreases the reference count by 1 of the separately stored binary data
	// kept alongside the table containing the values. When the reference count goes to
	// 0, the memory is freed. This is safe to call across multiple goroutines simultaneously.
	Release()
}

// NewInt32Dictionary returns a memotable interface for use with Int32 values only
func NewInt32Dictionary() MemoTable {
	return hashing.NewInt32MemoTable(0)
}

// NewInt64Dictionary returns a memotable interface for use with Int64 values only
func NewInt64Dictionary() MemoTable {
	return hashing.NewInt64MemoTable(0)
}

// NewFloat32Dictionary returns a memotable interface for use with Float32 values only
func NewFloat32Dictionary() MemoTable {
	return hashing.NewFloat32MemoTable(0)
}

// NewFloat64Dictionary returns a memotable interface for use with Float64 values only
func NewFloat64Dictionary() MemoTable {
	return hashing.NewFloat64MemoTable(0)
}

// NewBinaryDictionary returns a memotable interface for use with strings, byte slices,
// parquet.ByteArray and parquet.FixedLengthByteArray only.
func NewBinaryDictionary(mem memory.Allocator) BinaryMemoTable {
	return hashing.NewBinaryMemoTable(mem, 0, -1)
}

// keyNotFound is the sentinel index used by the tables in this file to mean
// "no such entry"; it mirrors hashing.KeyNotFound.
const keyNotFound = hashing.KeyNotFound

// standard map based implementation of a binary memotable which is only kept around
// currently to be used as a benchmark against the memotables in the internal/hashing
// module as a baseline comparison.
126 127 func NewBinaryMemoTable(mem memory.Allocator) BinaryMemoTable { 128 return &binaryMemoTableImpl{ 129 table: make(map[string]int), 130 nullIndex: keyNotFound, 131 builder: array.NewBinaryBuilder(mem, arrow.BinaryTypes.Binary), 132 } 133 } 134 135 type binaryMemoTableImpl struct { 136 table map[string]int 137 builder *array.BinaryBuilder 138 nullIndex int 139 } 140 141 func (m *binaryMemoTableImpl) Reset() { 142 m.table = make(map[string]int) 143 m.nullIndex = keyNotFound 144 m.builder.NewArray().Release() 145 } 146 147 func (m *binaryMemoTableImpl) CopyValues(out interface{}) { 148 m.CopyValuesSubset(0, out) 149 } 150 151 func (m *binaryMemoTableImpl) GetNull() (int, bool) { 152 return m.nullIndex, m.nullIndex != keyNotFound 153 } 154 155 func (m *binaryMemoTableImpl) ValuesSize() int { 156 return m.builder.DataLen() 157 } 158 159 func (m *binaryMemoTableImpl) Size() int { 160 sz := len(m.table) 161 if _, ok := m.GetNull(); ok { 162 sz++ 163 } 164 return sz 165 } 166 167 func (m *binaryMemoTableImpl) valAsString(val interface{}) string { 168 switch v := val.(type) { 169 case string: 170 return v 171 case []byte: 172 return *(*string)(unsafe.Pointer(&v)) 173 case parquet.ByteArray: 174 return *(*string)(unsafe.Pointer(&v)) 175 case parquet.FixedLenByteArray: 176 return *(*string)(unsafe.Pointer(&v)) 177 default: 178 panic("invalid type for value in binarymemotable") 179 } 180 } 181 182 func (m *binaryMemoTableImpl) Get(val interface{}) (int, bool) { 183 key := m.valAsString(val) 184 if p, ok := m.table[key]; ok { 185 return p, true 186 } 187 return keyNotFound, false 188 } 189 190 func (m *binaryMemoTableImpl) GetOrInsert(val interface{}) (idx int, found bool, err error) { 191 key := m.valAsString(val) 192 idx, found = m.table[key] 193 if !found { 194 idx = m.Size() 195 m.builder.AppendString(key) 196 m.table[key] = idx 197 } 198 return 199 } 200 201 func (m *binaryMemoTableImpl) GetOrInsertNull() (idx int, found bool) { 202 idx, found = m.GetNull() 203 if 
!found { 204 idx = m.Size() 205 m.nullIndex = idx 206 m.builder.AppendNull() 207 } 208 return 209 } 210 211 func (m *binaryMemoTableImpl) findOffset(idx int) uintptr { 212 val := m.builder.Value(idx) 213 for len(val) == 0 { 214 idx++ 215 if idx >= m.builder.Len() { 216 break 217 } 218 val = m.builder.Value(idx) 219 } 220 if len(val) != 0 { 221 return uintptr(unsafe.Pointer(&val[0])) 222 } 223 return uintptr(m.builder.DataLen()) + m.findOffset(0) 224 } 225 226 func (m *binaryMemoTableImpl) CopyValuesSubset(start int, out interface{}) { 227 var ( 228 first = m.findOffset(0) 229 offset = m.findOffset(int(start)) 230 length = m.builder.DataLen() - int(offset-first) 231 ) 232 233 outval := out.([]byte) 234 copy(outval, m.builder.Value(start)[0:length]) 235 } 236 237 func (m *binaryMemoTableImpl) WriteOut(out []byte) { 238 m.CopyValues(out) 239 } 240 241 func (m *binaryMemoTableImpl) WriteOutSubset(start int, out []byte) { 242 m.CopyValuesSubset(start, out) 243 } 244 245 func (m *binaryMemoTableImpl) CopyFixedWidthValues(start, width int, out []byte) { 246 247 } 248 249 func (m *binaryMemoTableImpl) CopyOffsetsSubset(start int, out []int8) { 250 if m.builder.Len() <= start { 251 return 252 } 253 254 first := m.findOffset(0) 255 delta := m.findOffset(start) 256 for i := start; i < m.Size(); i++ { 257 offset := int8(m.findOffset(i) - delta) 258 out[i-start] = offset 259 } 260 261 out[m.Size()-start] = int8(m.builder.DataLen() - int(delta) - int(first)) 262 } 263 264 func (m *binaryMemoTableImpl) CopyOffsets(out []int8) { 265 m.CopyOffsetsSubset(0, out) 266 } 267 268 func (m *binaryMemoTableImpl) VisitValues(start int, visitFn func([]byte)) { 269 for i := int(start); i < m.Size(); i++ { 270 visitFn(m.builder.Value(i)) 271 } 272 } 273 274 func (m *binaryMemoTableImpl) Release() { 275 m.builder.Release() 276 } 277 278 func (m *binaryMemoTableImpl) Retain() { 279 m.builder.Retain() 280 } 281 282 // standard map based implementation of a float64 memotable which is only kept 
around 283 // currently to be used as a benchmark against the memotables in the internal/hashing 284 // module as a baseline comparison. 285 286 func NewFloat64MemoTable(memory.Allocator) MemoTable { 287 return &float64MemoTableImpl{ 288 table: make(map[float64]struct { 289 value float64 290 memoIndex int 291 }), 292 nullIndex: keyNotFound, 293 nanIndex: keyNotFound, 294 } 295 } 296 297 type float64MemoTableImpl struct { 298 table map[float64]struct { 299 value float64 300 memoIndex int 301 } 302 nullIndex int 303 nanIndex int 304 } 305 306 func (m *float64MemoTableImpl) Reset() { 307 m.table = make(map[float64]struct { 308 value float64 309 memoIndex int 310 }) 311 m.nullIndex = keyNotFound 312 m.nanIndex = keyNotFound 313 } 314 315 func (m *float64MemoTableImpl) GetNull() (int, bool) { 316 return m.nullIndex, m.nullIndex != keyNotFound 317 } 318 319 func (m *float64MemoTableImpl) Size() int { 320 sz := len(m.table) 321 if _, ok := m.GetNull(); ok { 322 sz++ 323 } 324 if m.nanIndex != keyNotFound { 325 sz++ 326 } 327 return sz 328 } 329 330 func (m *float64MemoTableImpl) GetOrInsertNull() (idx int, found bool) { 331 idx, found = m.GetNull() 332 if !found { 333 idx = m.Size() 334 m.nullIndex = idx 335 } 336 return 337 } 338 339 func (m *float64MemoTableImpl) Get(val interface{}) (int, bool) { 340 v := val.(float64) 341 if p, ok := m.table[v]; ok { 342 return p.memoIndex, true 343 } 344 if math.IsNaN(v) && m.nanIndex != keyNotFound { 345 return m.nanIndex, true 346 } 347 return keyNotFound, false 348 } 349 350 func (m *float64MemoTableImpl) GetOrInsert(val interface{}) (idx int, found bool, err error) { 351 v := val.(float64) 352 if math.IsNaN(v) { 353 if m.nanIndex == keyNotFound { 354 idx = m.Size() 355 m.nanIndex = idx 356 } else { 357 idx = m.nanIndex 358 found = true 359 } 360 return 361 } 362 363 p, ok := m.table[v] 364 if ok { 365 idx = p.memoIndex 366 } else { 367 idx = m.Size() 368 p.value = v 369 p.memoIndex = idx 370 m.table[v] = p 371 found = true 372 } 
373 return 374 } 375 376 func (m *float64MemoTableImpl) CopyValues(out interface{}) { 377 m.CopyValuesSubset(0, out) 378 } 379 380 func (m *float64MemoTableImpl) CopyValuesSubset(start int, out interface{}) { 381 outval := out.([]float64) 382 for _, v := range m.table { 383 idx := v.memoIndex - start 384 if idx >= 0 { 385 outval[idx] = v.value 386 } 387 } 388 if m.nanIndex != keyNotFound { 389 outval[m.nanIndex] = math.NaN() 390 } 391 } 392 393 func (m *float64MemoTableImpl) WriteOut(out []byte) { 394 m.CopyValuesSubset(0, arrow.Float64Traits.CastFromBytes(out)) 395 } 396 397 func (m *float64MemoTableImpl) WriteOutSubset(start int, out []byte) { 398 m.CopyValuesSubset(start, arrow.Float64Traits.CastFromBytes(out)) 399 }