github.com/apache/arrow/go/v7@v7.0.1/parquet/internal/encoding/memo_table.go (about)

     1  // Licensed to the Apache Software Foundation (ASF) under one
     2  // or more contributor license agreements.  See the NOTICE file
     3  // distributed with this work for additional information
     4  // regarding copyright ownership.  The ASF licenses this file
     5  // to you under the Apache License, Version 2.0 (the
     6  // "License"); you may not use this file except in compliance
     7  // with the License.  You may obtain a copy of the License at
     8  //
     9  // http://www.apache.org/licenses/LICENSE-2.0
    10  //
    11  // Unless required by applicable law or agreed to in writing, software
    12  // distributed under the License is distributed on an "AS IS" BASIS,
    13  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  // See the License for the specific language governing permissions and
    15  // limitations under the License.
    16  
    17  package encoding
    18  
    19  import (
    20  	"math"
    21  	"unsafe"
    22  
    23  	"github.com/apache/arrow/go/v7/arrow"
    24  	"github.com/apache/arrow/go/v7/arrow/array"
    25  	"github.com/apache/arrow/go/v7/arrow/memory"
    26  	"github.com/apache/arrow/go/v7/parquet"
    27  	"github.com/apache/arrow/go/v7/parquet/internal/hashing"
    28  )
    29  
    30  //go:generate go run ../../../arrow/_tools/tmpl/main.go -i -data=physical_types.tmpldata memo_table_types.gen.go.tmpl
    31  
// MemoTable is the interface that can be used to swap out implementations of the hash table
// used for handling dictionary encoding. Dictionary encoding is built against this interface
// to make it easy for code generation and changing implementations.
//
// Implementations should remember the order in which values are inserted so that the
// indexes they return form a valid dictionary index.
type MemoTable interface {
	// Reset drops everything in the table allowing it to be reused
	Reset()
	// Size returns the current number of unique values stored in the table,
	// counting the null value if one has been passed in using GetOrInsertNull
	Size() int
	// CopyValues populates out with the values currently in the table, out must
	// be a slice of the appropriate type for the table type.
	CopyValues(out interface{})
	// CopyValuesSubset is like CopyValues but only copies a subset of values starting
	// at the indicated index.
	CopyValuesSubset(start int, out interface{})

	// WriteOut is like CopyValues but takes the destination as a raw byte slice
	// rather than a slice of the table's value type.
	WriteOut(out []byte)
	// WriteOutSubset is like WriteOut but only writes the values starting at the
	// indicated index.
	WriteOutSubset(start int, out []byte)
	// Get returns the index of the specified value in the table, and a boolean indicating
	// whether or not the value was found in the table. Will panic if val is not the appropriate
	// type for the underlying table.
	Get(val interface{}) (int, bool)
	// GetOrInsert is the same as Get, except if the value is not currently in the table it will
	// be inserted into the table.
	GetOrInsert(val interface{}) (idx int, existed bool, err error)
	// GetNull returns the index of the null value and whether or not it was found in the table
	GetNull() (int, bool)
	// GetOrInsertNull returns the index of the null value, if it didn't already exist in the table,
	// it is inserted.
	GetOrInsertNull() (idx int, existed bool)
}
    65  
// BinaryMemoTable is an extension of the MemoTable interface adding extra methods
// for handling byte arrays/strings/fixed length byte arrays.
type BinaryMemoTable interface {
	MemoTable
	// ValuesSize returns the total number of bytes needed to copy all of the values
	// from this table.
	ValuesSize() int
	// CopyOffsets populates out with the start and end offsets of each value in the
	// table data. Out should be sized to Size()+1 to accommodate all of the offsets.
	//
	// NOTE(review): offsets are delivered as int8, so an offset above 127 cannot be
	// represented - confirm this width is intended.
	CopyOffsets(out []int8)
	// CopyOffsetsSubset is like CopyOffsets but only gets a subset of the offsets
	// starting at the specified index.
	CopyOffsetsSubset(start int, out []int8)
	// CopyFixedWidthValues exists to cope with the fact that the table doesn't track
	// the fixed width when inserting the null value into the databuffer, populating
	// a zero length byte slice for the null value (if found).
	CopyFixedWidthValues(start int, width int, out []byte)
	// VisitValues calls visitFn on each value in the table starting with the index specified
	VisitValues(start int, visitFn func([]byte))
	// Retain increases the reference count of the separately stored binary data that is
	// kept alongside the table which contains all of the values in the table. This is
	// safe to call simultaneously across multiple goroutines.
	Retain()
	// Release decreases the reference count by 1 of the separately stored binary data
	// kept alongside the table containing the values. When the reference count goes to
	// 0, the memory is freed. This is safe to call across multiple goroutines simultaneously.
	Release()
}
    94  
    95  // NewInt32Dictionary returns a memotable interface for use with Int32 values only
    96  func NewInt32Dictionary() MemoTable {
    97  	return hashing.NewInt32MemoTable(0)
    98  }
    99  
   100  // NewInt64Dictionary returns a memotable interface for use with Int64 values only
   101  func NewInt64Dictionary() MemoTable {
   102  	return hashing.NewInt64MemoTable(0)
   103  }
   104  
   105  // NewFloat32Dictionary returns a memotable interface for use with Float32 values only
   106  func NewFloat32Dictionary() MemoTable {
   107  	return hashing.NewFloat32MemoTable(0)
   108  }
   109  
   110  // NewFloat64Dictionary returns a memotable interface for use with Float64 values only
   111  func NewFloat64Dictionary() MemoTable {
   112  	return hashing.NewFloat64MemoTable(0)
   113  }
   114  
   115  // NewBinaryDictionary returns a memotable interface for use with strings, byte slices,
   116  // parquet.ByteArray and parquet.FixedLengthByteArray only.
   117  func NewBinaryDictionary(mem memory.Allocator) BinaryMemoTable {
   118  	return hashing.NewBinaryMemoTable(mem, 0, -1)
   119  }
   120  
// keyNotFound mirrors hashing.KeyNotFound. Within this file it is the index returned
// by Get when a value is absent, and the value held by nullIndex/nanIndex while no
// null/NaN has been inserted.
const keyNotFound = hashing.KeyNotFound
   122  
   123  // standard map based implementation of a binary memotable which is only kept around
   124  // currently to be used as a benchmark against the memotables in the internal/hashing
   125  // module as a baseline comparison.
   126  
   127  func NewBinaryMemoTable(mem memory.Allocator) BinaryMemoTable {
   128  	return &binaryMemoTableImpl{
   129  		table:     make(map[string]int),
   130  		nullIndex: keyNotFound,
   131  		builder:   array.NewBinaryBuilder(mem, arrow.BinaryTypes.Binary),
   132  	}
   133  }
   134  
// binaryMemoTableImpl is the map based BinaryMemoTable returned by NewBinaryMemoTable.
type binaryMemoTableImpl struct {
	table     map[string]int       // inserted value (as a string key) -> memo index
	builder   *array.BinaryBuilder // receives every inserted value, and a null entry for the null value
	nullIndex int                  // memo index of the null value, keyNotFound when there is none
}
   140  
   141  func (m *binaryMemoTableImpl) Reset() {
   142  	m.table = make(map[string]int)
   143  	m.nullIndex = keyNotFound
   144  	m.builder.NewArray().Release()
   145  }
   146  
   147  func (m *binaryMemoTableImpl) CopyValues(out interface{}) {
   148  	m.CopyValuesSubset(0, out)
   149  }
   150  
   151  func (m *binaryMemoTableImpl) GetNull() (int, bool) {
   152  	return m.nullIndex, m.nullIndex != keyNotFound
   153  }
   154  
   155  func (m *binaryMemoTableImpl) ValuesSize() int {
   156  	return m.builder.DataLen()
   157  }
   158  
   159  func (m *binaryMemoTableImpl) Size() int {
   160  	sz := len(m.table)
   161  	if _, ok := m.GetNull(); ok {
   162  		sz++
   163  	}
   164  	return sz
   165  }
   166  
   167  func (m *binaryMemoTableImpl) valAsString(val interface{}) string {
   168  	switch v := val.(type) {
   169  	case string:
   170  		return v
   171  	case []byte:
   172  		return *(*string)(unsafe.Pointer(&v))
   173  	case parquet.ByteArray:
   174  		return *(*string)(unsafe.Pointer(&v))
   175  	case parquet.FixedLenByteArray:
   176  		return *(*string)(unsafe.Pointer(&v))
   177  	default:
   178  		panic("invalid type for value in binarymemotable")
   179  	}
   180  }
   181  
   182  func (m *binaryMemoTableImpl) Get(val interface{}) (int, bool) {
   183  	key := m.valAsString(val)
   184  	if p, ok := m.table[key]; ok {
   185  		return p, true
   186  	}
   187  	return keyNotFound, false
   188  }
   189  
   190  func (m *binaryMemoTableImpl) GetOrInsert(val interface{}) (idx int, found bool, err error) {
   191  	key := m.valAsString(val)
   192  	idx, found = m.table[key]
   193  	if !found {
   194  		idx = m.Size()
   195  		m.builder.AppendString(key)
   196  		m.table[key] = idx
   197  	}
   198  	return
   199  }
   200  
   201  func (m *binaryMemoTableImpl) GetOrInsertNull() (idx int, found bool) {
   202  	idx, found = m.GetNull()
   203  	if !found {
   204  		idx = m.Size()
   205  		m.nullIndex = idx
   206  		m.builder.AppendNull()
   207  	}
   208  	return
   209  }
   210  
   211  func (m *binaryMemoTableImpl) findOffset(idx int) uintptr {
   212  	val := m.builder.Value(idx)
   213  	for len(val) == 0 {
   214  		idx++
   215  		if idx >= m.builder.Len() {
   216  			break
   217  		}
   218  		val = m.builder.Value(idx)
   219  	}
   220  	if len(val) != 0 {
   221  		return uintptr(unsafe.Pointer(&val[0]))
   222  	}
   223  	return uintptr(m.builder.DataLen()) + m.findOffset(0)
   224  }
   225  
   226  func (m *binaryMemoTableImpl) CopyValuesSubset(start int, out interface{}) {
   227  	var (
   228  		first  = m.findOffset(0)
   229  		offset = m.findOffset(int(start))
   230  		length = m.builder.DataLen() - int(offset-first)
   231  	)
   232  
   233  	outval := out.([]byte)
   234  	copy(outval, m.builder.Value(start)[0:length])
   235  }
   236  
   237  func (m *binaryMemoTableImpl) WriteOut(out []byte) {
   238  	m.CopyValues(out)
   239  }
   240  
   241  func (m *binaryMemoTableImpl) WriteOutSubset(start int, out []byte) {
   242  	m.CopyValuesSubset(start, out)
   243  }
   244  
   245  func (m *binaryMemoTableImpl) CopyFixedWidthValues(start, width int, out []byte) {
   246  
   247  }
   248  
   249  func (m *binaryMemoTableImpl) CopyOffsetsSubset(start int, out []int8) {
   250  	if m.builder.Len() <= start {
   251  		return
   252  	}
   253  
   254  	first := m.findOffset(0)
   255  	delta := m.findOffset(start)
   256  	for i := start; i < m.Size(); i++ {
   257  		offset := int8(m.findOffset(i) - delta)
   258  		out[i-start] = offset
   259  	}
   260  
   261  	out[m.Size()-start] = int8(m.builder.DataLen() - int(delta) - int(first))
   262  }
   263  
   264  func (m *binaryMemoTableImpl) CopyOffsets(out []int8) {
   265  	m.CopyOffsetsSubset(0, out)
   266  }
   267  
   268  func (m *binaryMemoTableImpl) VisitValues(start int, visitFn func([]byte)) {
   269  	for i := int(start); i < m.Size(); i++ {
   270  		visitFn(m.builder.Value(i))
   271  	}
   272  }
   273  
   274  func (m *binaryMemoTableImpl) Release() {
   275  	m.builder.Release()
   276  }
   277  
   278  func (m *binaryMemoTableImpl) Retain() {
   279  	m.builder.Retain()
   280  }
   281  
   282  // standard map based implementation of a float64 memotable which is only kept around
   283  // currently to be used as a benchmark against the memotables in the internal/hashing
   284  // module as a baseline comparison.
   285  
   286  func NewFloat64MemoTable(memory.Allocator) MemoTable {
   287  	return &float64MemoTableImpl{
   288  		table: make(map[float64]struct {
   289  			value     float64
   290  			memoIndex int
   291  		}),
   292  		nullIndex: keyNotFound,
   293  		nanIndex:  keyNotFound,
   294  	}
   295  }
   296  
// float64MemoTableImpl is the map based MemoTable for float64 values returned by
// NewFloat64MemoTable. NaN is tracked through nanIndex instead of through the map.
type float64MemoTableImpl struct {
	// table is keyed by the inserted value; each entry repeats that value next to
	// the memo index it was assigned.
	table map[float64]struct {
		value     float64
		memoIndex int
	}
	nullIndex int // memo index of the null value, keyNotFound when there is none
	nanIndex  int // memo index of NaN, keyNotFound when NaN has not been inserted
}
   305  
   306  func (m *float64MemoTableImpl) Reset() {
   307  	m.table = make(map[float64]struct {
   308  		value     float64
   309  		memoIndex int
   310  	})
   311  	m.nullIndex = keyNotFound
   312  	m.nanIndex = keyNotFound
   313  }
   314  
   315  func (m *float64MemoTableImpl) GetNull() (int, bool) {
   316  	return m.nullIndex, m.nullIndex != keyNotFound
   317  }
   318  
   319  func (m *float64MemoTableImpl) Size() int {
   320  	sz := len(m.table)
   321  	if _, ok := m.GetNull(); ok {
   322  		sz++
   323  	}
   324  	if m.nanIndex != keyNotFound {
   325  		sz++
   326  	}
   327  	return sz
   328  }
   329  
   330  func (m *float64MemoTableImpl) GetOrInsertNull() (idx int, found bool) {
   331  	idx, found = m.GetNull()
   332  	if !found {
   333  		idx = m.Size()
   334  		m.nullIndex = idx
   335  	}
   336  	return
   337  }
   338  
   339  func (m *float64MemoTableImpl) Get(val interface{}) (int, bool) {
   340  	v := val.(float64)
   341  	if p, ok := m.table[v]; ok {
   342  		return p.memoIndex, true
   343  	}
   344  	if math.IsNaN(v) && m.nanIndex != keyNotFound {
   345  		return m.nanIndex, true
   346  	}
   347  	return keyNotFound, false
   348  }
   349  
   350  func (m *float64MemoTableImpl) GetOrInsert(val interface{}) (idx int, found bool, err error) {
   351  	v := val.(float64)
   352  	if math.IsNaN(v) {
   353  		if m.nanIndex == keyNotFound {
   354  			idx = m.Size()
   355  			m.nanIndex = idx
   356  		} else {
   357  			idx = m.nanIndex
   358  			found = true
   359  		}
   360  		return
   361  	}
   362  
   363  	p, ok := m.table[v]
   364  	if ok {
   365  		idx = p.memoIndex
   366  	} else {
   367  		idx = m.Size()
   368  		p.value = v
   369  		p.memoIndex = idx
   370  		m.table[v] = p
   371  		found = true
   372  	}
   373  	return
   374  }
   375  
   376  func (m *float64MemoTableImpl) CopyValues(out interface{}) {
   377  	m.CopyValuesSubset(0, out)
   378  }
   379  
   380  func (m *float64MemoTableImpl) CopyValuesSubset(start int, out interface{}) {
   381  	outval := out.([]float64)
   382  	for _, v := range m.table {
   383  		idx := v.memoIndex - start
   384  		if idx >= 0 {
   385  			outval[idx] = v.value
   386  		}
   387  	}
   388  	if m.nanIndex != keyNotFound {
   389  		outval[m.nanIndex] = math.NaN()
   390  	}
   391  }
   392  
   393  func (m *float64MemoTableImpl) WriteOut(out []byte) {
   394  	m.CopyValuesSubset(0, arrow.Float64Traits.CastFromBytes(out))
   395  }
   396  
   397  func (m *float64MemoTableImpl) WriteOutSubset(start int, out []byte) {
   398  	m.CopyValuesSubset(start, arrow.Float64Traits.CastFromBytes(out))
   399  }