github.com/apache/arrow/go/v14@v14.0.1/parquet/internal/encoding/memo_table.go (about)

     1  // Licensed to the Apache Software Foundation (ASF) under one
     2  // or more contributor license agreements.  See the NOTICE file
     3  // distributed with this work for additional information
     4  // regarding copyright ownership.  The ASF licenses this file
     5  // to you under the Apache License, Version 2.0 (the
     6  // "License"); you may not use this file except in compliance
     7  // with the License.  You may obtain a copy of the License at
     8  //
     9  // http://www.apache.org/licenses/LICENSE-2.0
    10  //
    11  // Unless required by applicable law or agreed to in writing, software
    12  // distributed under the License is distributed on an "AS IS" BASIS,
    13  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  // See the License for the specific language governing permissions and
    15  // limitations under the License.
    16  
    17  package encoding
    18  
    19  import (
    20  	"math"
    21  	"unsafe"
    22  
    23  	"github.com/apache/arrow/go/v14/arrow"
    24  	"github.com/apache/arrow/go/v14/arrow/array"
    25  	"github.com/apache/arrow/go/v14/arrow/memory"
    26  	"github.com/apache/arrow/go/v14/internal/hashing"
    27  	"github.com/apache/arrow/go/v14/parquet"
    28  )
    29  
    30  //go:generate go run ../../../arrow/_tools/tmpl/main.go -i -data=physical_types.tmpldata memo_table_types.gen.go.tmpl
    31  
// MemoTable is the interface used to swap out implementations of the hash table
// that backs dictionary encoding. Dictionary encoding is built against this
// interface to make code generation and changing implementations easy.
//
// Implementations should remember the order in which values are inserted, so
// that the index returned for a value is a valid dictionary index.
type MemoTable interface {
	// Reset drops everything in the table allowing it to be reused.
	Reset()
	// Size returns the current number of unique values stored in the table,
	// counting the null value if one has been inserted using GetOrInsertNull.
	Size() int
	// CopyValues populates out with the values currently in the table. out must
	// be a slice of the appropriate type for the table type.
	CopyValues(out interface{})
	// CopyValuesSubset is like CopyValues but only copies the subset of values
	// starting at the indicated index.
	CopyValuesSubset(start int, out interface{})

	// WriteOut copies the values currently in the table into the byte slice out.
	WriteOut(out []byte)
	// WriteOutSubset is like WriteOut but only writes the subset of values
	// starting at the indicated index.
	WriteOutSubset(start int, out []byte)
	// Get returns the index in the table of the specified value, and a boolean
	// indicating whether or not the value was found in the table. Will panic if
	// val is not the appropriate type for the underlying table.
	Get(val interface{}) (int, bool)
	// GetOrInsert is the same as Get, except that if the value is not currently
	// in the table it will be inserted into the table.
	GetOrInsert(val interface{}) (idx int, existed bool, err error)
	// GetNull returns the index of the null value and whether or not it was
	// found in the table.
	GetNull() (int, bool)
	// GetOrInsertNull returns the index of the null value. If it didn't already
	// exist in the table, it is inserted.
	GetOrInsertNull() (idx int, existed bool)
}
    65  
// NumericMemoTable is an extension of the MemoTable interface adding methods
// that write the table contents out as little-endian bytes.
type NumericMemoTable interface {
	MemoTable
	// WriteOutLE writes the contents of the memo table out to the byte slice
	// but ensures the values are little-endian before writing them (converting
	// if on a big-endian system).
	WriteOutLE(out []byte)
	// WriteOutSubsetLE writes the contents of the memo table out to the byte
	// slice starting with the index indicated by start, but ensures the values
	// are little-endian before writing them (converting if on a big-endian system).
	WriteOutSubsetLE(start int, out []byte)
}
    77  
// BinaryMemoTable is an extension of the MemoTable interface adding extra methods
// for handling byte arrays/strings/fixed length byte arrays.
type BinaryMemoTable interface {
	MemoTable
	// ValuesSize returns the total number of bytes needed to copy all of the values
	// from this table.
	ValuesSize() int
	// CopyOffsets populates out with the start and end offsets of each value in the
	// table data. out should be sized to Size()+1 to accommodate all of the offsets.
	CopyOffsets(out []int32)
	// CopyOffsetsSubset is like CopyOffsets but only gets a subset of the offsets
	// starting at the specified index.
	CopyOffsetsSubset(start int, out []int32)
	// CopyFixedWidthValues exists to cope with the fact that the table doesn't track
	// the fixed width when inserting the null value into the data buffer: the null
	// value (if present) is stored as a zero length byte slice.
	CopyFixedWidthValues(start int, width int, out []byte)
	// VisitValues calls visitFn on each value in the table starting with the index specified.
	VisitValues(start int, visitFn func([]byte))
	// Retain increases the reference count of the separately stored binary data that is
	// kept alongside the table which contains all of the values in the table. This is
	// safe to call simultaneously across multiple goroutines.
	Retain()
	// Release decreases the reference count by 1 of the separately stored binary data
	// kept alongside the table containing the values. When the reference count goes to
	// 0, the memory is freed. This is safe to call across multiple goroutines simultaneously.
	Release()
}
   106  
// NewInt32Dictionary returns a MemoTable for use with int32 values only.
//
// The table is created by hashing.NewInt32MemoTable; the 0 argument is
// presumably an initial size hint -- confirm against the hashing package.
func NewInt32Dictionary() MemoTable {
	return hashing.NewInt32MemoTable(0)
}
   111  
// NewInt64Dictionary returns a MemoTable for use with int64 values only.
//
// The table is created by hashing.NewInt64MemoTable; the 0 argument is
// presumably an initial size hint -- confirm against the hashing package.
func NewInt64Dictionary() MemoTable {
	return hashing.NewInt64MemoTable(0)
}
   116  
// NewFloat32Dictionary returns a MemoTable for use with float32 values only.
//
// The table is created by hashing.NewFloat32MemoTable; the 0 argument is
// presumably an initial size hint -- confirm against the hashing package.
func NewFloat32Dictionary() MemoTable {
	return hashing.NewFloat32MemoTable(0)
}
   121  
// NewFloat64Dictionary returns a MemoTable for use with float64 values only.
//
// The table is created by hashing.NewFloat64MemoTable; the 0 argument is
// presumably an initial size hint -- confirm against the hashing package.
func NewFloat64Dictionary() MemoTable {
	return hashing.NewFloat64MemoTable(0)
}
   126  
// NewBinaryDictionary returns a BinaryMemoTable for use with strings, byte slices,
// parquet.ByteArray and parquet.FixedLenByteArray only.
//
// The value bytes are accumulated in a binary builder allocated from mem.
// NOTE(review): the meaning of the 0 and -1 arguments passed to
// hashing.NewBinaryMemoTable is not visible here -- confirm against that package.
func NewBinaryDictionary(mem memory.Allocator) BinaryMemoTable {
	return hashing.NewBinaryMemoTable(0, -1, array.NewBinaryBuilder(mem, arrow.BinaryTypes.Binary))
}
   132  
// keyNotFound is the sentinel index used by the map based memo tables in this
// file: it is returned by Get when a value is absent and stored in the
// null/NaN index fields while no such value has been inserted.
const keyNotFound = hashing.KeyNotFound
   134  
   135  // standard map based implementation of a binary memotable which is only kept around
   136  // currently to be used as a benchmark against the memotables in the internal/hashing
   137  // module as a baseline comparison.
   138  
   139  func NewBinaryMemoTable(mem memory.Allocator) BinaryMemoTable {
   140  	return &binaryMemoTableImpl{
   141  		table:     make(map[string]int),
   142  		nullIndex: keyNotFound,
   143  		builder:   array.NewBinaryBuilder(mem, arrow.BinaryTypes.Binary),
   144  	}
   145  }
   146  
// binaryMemoTableImpl is a map based BinaryMemoTable.
type binaryMemoTableImpl struct {
	// table maps each inserted value (viewed as a string) to its memo index.
	table     map[string]int
	// builder accumulates the bytes of every inserted value, and an entry for
	// the null value, in insertion order.
	builder   *array.BinaryBuilder
	// nullIndex is the memo index of the null value, or keyNotFound if null
	// has not been inserted.
	nullIndex int
}
   152  
   153  func (m *binaryMemoTableImpl) Reset() {
   154  	m.table = make(map[string]int)
   155  	m.nullIndex = keyNotFound
   156  	m.builder.NewArray().Release()
   157  }
   158  
// CopyValues copies the bytes of every value in the table into out, which must
// be a []byte. It is CopyValuesSubset starting at index 0.
func (m *binaryMemoTableImpl) CopyValues(out interface{}) {
	m.CopyValuesSubset(0, out)
}
   162  
   163  func (m *binaryMemoTableImpl) GetNull() (int, bool) {
   164  	return m.nullIndex, m.nullIndex != keyNotFound
   165  }
   166  
// ValuesSize returns the number of value bytes currently held by the builder,
// as reported by its DataLen.
func (m *binaryMemoTableImpl) ValuesSize() int {
	return m.builder.DataLen()
}
   170  
   171  func (m *binaryMemoTableImpl) Size() int {
   172  	sz := len(m.table)
   173  	if _, ok := m.GetNull(); ok {
   174  		sz++
   175  	}
   176  	return sz
   177  }
   178  
   179  func (m *binaryMemoTableImpl) valAsString(val interface{}) string {
   180  	switch v := val.(type) {
   181  	case string:
   182  		return v
   183  	case []byte:
   184  		return *(*string)(unsafe.Pointer(&v))
   185  	case parquet.ByteArray:
   186  		return *(*string)(unsafe.Pointer(&v))
   187  	case parquet.FixedLenByteArray:
   188  		return *(*string)(unsafe.Pointer(&v))
   189  	default:
   190  		panic("invalid type for value in binarymemotable")
   191  	}
   192  }
   193  
   194  func (m *binaryMemoTableImpl) Get(val interface{}) (int, bool) {
   195  	key := m.valAsString(val)
   196  	if p, ok := m.table[key]; ok {
   197  		return p, true
   198  	}
   199  	return keyNotFound, false
   200  }
   201  
   202  func (m *binaryMemoTableImpl) GetOrInsert(val interface{}) (idx int, found bool, err error) {
   203  	key := m.valAsString(val)
   204  	idx, found = m.table[key]
   205  	if !found {
   206  		idx = m.Size()
   207  		m.builder.AppendString(key)
   208  		m.table[key] = idx
   209  	}
   210  	return
   211  }
   212  
   213  func (m *binaryMemoTableImpl) GetOrInsertNull() (idx int, found bool) {
   214  	idx, found = m.GetNull()
   215  	if !found {
   216  		idx = m.Size()
   217  		m.nullIndex = idx
   218  		m.builder.AppendNull()
   219  	}
   220  	return
   221  }
   222  
   223  func (m *binaryMemoTableImpl) findOffset(idx int) uintptr {
   224  	val := m.builder.Value(idx)
   225  	for len(val) == 0 {
   226  		idx++
   227  		if idx >= m.builder.Len() {
   228  			break
   229  		}
   230  		val = m.builder.Value(idx)
   231  	}
   232  	if len(val) != 0 {
   233  		return uintptr(unsafe.Pointer(&val[0]))
   234  	}
   235  	return uintptr(m.builder.DataLen()) + m.findOffset(0)
   236  }
   237  
   238  func (m *binaryMemoTableImpl) CopyValuesSubset(start int, out interface{}) {
   239  	var (
   240  		first  = m.findOffset(0)
   241  		offset = m.findOffset(int(start))
   242  		length = m.builder.DataLen() - int(offset-first)
   243  	)
   244  
   245  	outval := out.([]byte)
   246  	copy(outval, m.builder.Value(start)[0:length])
   247  }
   248  
// WriteOut copies the bytes of every value in the table into out by
// delegating to CopyValues.
func (m *binaryMemoTableImpl) WriteOut(out []byte) {
	m.CopyValues(out)
}
   252  
// WriteOutSubset copies the bytes of the values from index start onwards into
// out by delegating to CopyValuesSubset.
func (m *binaryMemoTableImpl) WriteOutSubset(start int, out []byte) {
	m.CopyValuesSubset(start, out)
}
   256  
   257  func (m *binaryMemoTableImpl) CopyFixedWidthValues(start, width int, out []byte) {
   258  
   259  }
   260  
   261  func (m *binaryMemoTableImpl) CopyOffsetsSubset(start int, out []int32) {
   262  	if m.builder.Len() <= start {
   263  		return
   264  	}
   265  
   266  	first := m.findOffset(0)
   267  	delta := m.findOffset(start)
   268  	for i := start; i < m.Size(); i++ {
   269  		offset := int32(m.findOffset(i) - delta)
   270  		out[i-start] = offset
   271  	}
   272  
   273  	out[m.Size()-start] = int32(m.builder.DataLen() - int(delta) - int(first))
   274  }
   275  
// CopyOffsets populates out with the offsets of every value in the table. It
// is CopyOffsetsSubset starting at index 0.
func (m *binaryMemoTableImpl) CopyOffsets(out []int32) {
	m.CopyOffsetsSubset(0, out)
}
   279  
   280  func (m *binaryMemoTableImpl) VisitValues(start int, visitFn func([]byte)) {
   281  	for i := int(start); i < m.Size(); i++ {
   282  		visitFn(m.builder.Value(i))
   283  	}
   284  }
   285  
// Release forwards to Release on the builder that holds the table's value bytes.
func (m *binaryMemoTableImpl) Release() {
	m.builder.Release()
}
   289  
// Retain forwards to Retain on the builder that holds the table's value bytes.
func (m *binaryMemoTableImpl) Retain() {
	m.builder.Retain()
}
   293  
   294  // standard map based implementation of a float64 memotable which is only kept around
   295  // currently to be used as a benchmark against the memotables in the internal/hashing
   296  // module as a baseline comparison.
   297  
   298  func NewFloat64MemoTable(memory.Allocator) MemoTable {
   299  	return &float64MemoTableImpl{
   300  		table: make(map[float64]struct {
   301  			value     float64
   302  			memoIndex int
   303  		}),
   304  		nullIndex: keyNotFound,
   305  		nanIndex:  keyNotFound,
   306  	}
   307  }
   308  
// float64MemoTableImpl is a map based MemoTable for float64 values.
type float64MemoTableImpl struct {
	// table maps each inserted non-NaN value to an entry holding the value
	// itself and the memo index it was assigned.
	table map[float64]struct {
		value     float64
		memoIndex int
	}
	// nullIndex is the memo index of the null value, or keyNotFound if null
	// has not been inserted.
	nullIndex int
	// nanIndex is the memo index of NaN, or keyNotFound if NaN has not been
	// inserted. NaN is tracked outside the map because NaN != NaN, so it could
	// never be found again as a map key.
	nanIndex  int
}
   317  
   318  func (m *float64MemoTableImpl) Reset() {
   319  	m.table = make(map[float64]struct {
   320  		value     float64
   321  		memoIndex int
   322  	})
   323  	m.nullIndex = keyNotFound
   324  	m.nanIndex = keyNotFound
   325  }
   326  
   327  func (m *float64MemoTableImpl) GetNull() (int, bool) {
   328  	return m.nullIndex, m.nullIndex != keyNotFound
   329  }
   330  
   331  func (m *float64MemoTableImpl) Size() int {
   332  	sz := len(m.table)
   333  	if _, ok := m.GetNull(); ok {
   334  		sz++
   335  	}
   336  	if m.nanIndex != keyNotFound {
   337  		sz++
   338  	}
   339  	return sz
   340  }
   341  
   342  func (m *float64MemoTableImpl) GetOrInsertNull() (idx int, found bool) {
   343  	idx, found = m.GetNull()
   344  	if !found {
   345  		idx = m.Size()
   346  		m.nullIndex = idx
   347  	}
   348  	return
   349  }
   350  
   351  func (m *float64MemoTableImpl) Get(val interface{}) (int, bool) {
   352  	v := val.(float64)
   353  	if p, ok := m.table[v]; ok {
   354  		return p.memoIndex, true
   355  	}
   356  	if math.IsNaN(v) && m.nanIndex != keyNotFound {
   357  		return m.nanIndex, true
   358  	}
   359  	return keyNotFound, false
   360  }
   361  
   362  func (m *float64MemoTableImpl) GetOrInsert(val interface{}) (idx int, found bool, err error) {
   363  	v := val.(float64)
   364  	if math.IsNaN(v) {
   365  		if m.nanIndex == keyNotFound {
   366  			idx = m.Size()
   367  			m.nanIndex = idx
   368  		} else {
   369  			idx = m.nanIndex
   370  			found = true
   371  		}
   372  		return
   373  	}
   374  
   375  	p, ok := m.table[v]
   376  	if ok {
   377  		idx = p.memoIndex
   378  	} else {
   379  		idx = m.Size()
   380  		p.value = v
   381  		p.memoIndex = idx
   382  		m.table[v] = p
   383  		found = true
   384  	}
   385  	return
   386  }
   387  
// CopyValues copies every value in the table into out, which must be a
// []float64. It is CopyValuesSubset starting at index 0.
func (m *float64MemoTableImpl) CopyValues(out interface{}) {
	m.CopyValuesSubset(0, out)
}
   391  
   392  func (m *float64MemoTableImpl) CopyValuesSubset(start int, out interface{}) {
   393  	outval := out.([]float64)
   394  	for _, v := range m.table {
   395  		idx := v.memoIndex - start
   396  		if idx >= 0 {
   397  			outval[idx] = v.value
   398  		}
   399  	}
   400  	if m.nanIndex != keyNotFound {
   401  		outval[m.nanIndex] = math.NaN()
   402  	}
   403  }
   404  
   405  func (m *float64MemoTableImpl) WriteOut(out []byte) {
   406  	m.CopyValuesSubset(0, arrow.Float64Traits.CastFromBytes(out))
   407  }
   408  
   409  func (m *float64MemoTableImpl) WriteOutSubset(start int, out []byte) {
   410  	m.CopyValuesSubset(start, arrow.Float64Traits.CastFromBytes(out))
   411  }