github.com/apache/arrow/go/v10@v10.0.1/parquet/internal/encoding/encoder.go

// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package encoding

import (
	"math/bits"
	"reflect"

	"github.com/apache/arrow/go/v10/arrow"
	"github.com/apache/arrow/go/v10/arrow/bitutil"
	"github.com/apache/arrow/go/v10/arrow/memory"
	"github.com/apache/arrow/go/v10/internal/bitutils"
	"github.com/apache/arrow/go/v10/parquet"
	format "github.com/apache/arrow/go/v10/parquet/internal/gen-go/parquet"
	"github.com/apache/arrow/go/v10/parquet/internal/utils"
	"github.com/apache/arrow/go/v10/parquet/schema"
)

//go:generate go run ../../../arrow/_tools/tmpl/main.go -i -data=physical_types.tmpldata plain_encoder_types.gen.go.tmpl typed_encoder.gen.go.tmpl

// EncoderTraits is an interface for the different types to make it more
// convenient to construct encoders for specific types.
type EncoderTraits interface {
	Encoder(format.Encoding, bool, *schema.Column, memory.Allocator) TypedEncoder
}

// NewEncoder will return the appropriately typed encoder for the requested physical type
// and encoding.
//
// If mem is nil, memory.DefaultAllocator will be used.
func NewEncoder(t parquet.Type, e parquet.Encoding, useDict bool, descr *schema.Column, mem memory.Allocator) TypedEncoder {
	traits := getEncodingTraits(t)
	if traits == nil {
		return nil
	}

	if mem == nil {
		mem = memory.DefaultAllocator
	}
	return traits.Encoder(format.Encoding(e), useDict, descr, mem)
}
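
// A minimal usage sketch (editor's illustration, not part of the library):
// construct a plain int32 encoder and flush its buffered values. The column
// descriptor "col" is assumed to come from elsewhere, and Int32Encoder is
// assumed to be the typed encoder interface generated by typed_encoder.gen.go.
//
//	enc := NewEncoder(parquet.Types.Int32, parquet.Encodings.Plain, false, col, memory.DefaultAllocator)
//	enc.(Int32Encoder).Put([]int32{1, 2, 3})
//	buf, err := enc.FlushValues()
//	if err == nil {
//		defer buf.Release()
//	}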

type encoder struct {
	descr    *schema.Column
	encoding format.Encoding
	typeLen  int
	mem      memory.Allocator

	sink *PooledBufferWriter
}

// newEncoderBase constructs a new base encoder for embedding in the typed encoders,
// encapsulating the common functionality.
func newEncoderBase(e format.Encoding, descr *schema.Column, mem memory.Allocator) encoder {
	typelen := -1
	if descr != nil && descr.PhysicalType() == parquet.Types.FixedLenByteArray {
		typelen = int(descr.TypeLength())
	}
	return encoder{
		descr:    descr,
		encoding: e,
		mem:      mem,
		typeLen:  typelen,
		sink:     NewPooledBufferWriter(1024),
	}
}

// ReserveForWrite allocates n bytes so that the next n bytes written do not require new allocations.
func (e *encoder) ReserveForWrite(n int)           { e.sink.Reserve(n) }
func (e *encoder) EstimatedDataEncodedSize() int64 { return int64(e.sink.Len()) }
func (e *encoder) Encoding() parquet.Encoding      { return parquet.Encoding(e.encoding) }
func (e *encoder) Allocator() memory.Allocator     { return e.mem }
func (e *encoder) append(data []byte)              { e.sink.Write(data) }
// FlushValues flushes any unwritten data to the buffer and returns the finished encoded buffer of data.
// This also clears the encoder; ownership of the data belongs to whoever called FlushValues, and Release
// should be called on the resulting Buffer when done.
func (e *encoder) FlushValues() (Buffer, error) { return e.sink.Finish(), nil }

// Bytes returns the current bytes that have been written to the encoder's buffer but doesn't transfer ownership.
func (e *encoder) Bytes() []byte { return e.sink.Bytes() }

// Reset drops the data currently in the encoder and resets for new use.
func (e *encoder) Reset() { e.sink.Reset(0) }
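
// A short lifecycle sketch (editor's illustration): values accumulate in the
// pooled sink, FlushValues hands the encoded buffer to the caller, and the
// caller is responsible for releasing it. The typed Put call is hypothetical
// and stands in for whichever generated typed encoder is in use.
//
//	enc.(Int32Encoder).Put(vals) // buffer the values
//	buf, err := enc.FlushValues() // take ownership of the encoded bytes
//	if err != nil {
//		return err
//	}
//	defer buf.Release() // caller must release when done
//	page := buf.Bytes() // encoded data page payload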

type dictEncoder struct {
	encoder

	dictEncodedSize int
	idxBuffer       *memory.Buffer
	idxValues       []int32
	memo            MemoTable
}

// newDictEncoderBase constructs and returns a dictionary encoder for the appropriate type using the passed
// in memo table for constructing the index.
func newDictEncoderBase(descr *schema.Column, memo MemoTable, mem memory.Allocator) dictEncoder {
	return dictEncoder{
		encoder:   newEncoderBase(format.Encoding_PLAIN_DICTIONARY, descr, mem),
		idxBuffer: memory.NewResizableBuffer(mem),
		memo:      memo,
	}
}
// Reset drops all of the currently buffered indexes and dictionary values so that
// the encoding process can be restarted.
func (d *dictEncoder) Reset() {
	d.encoder.Reset()
	d.dictEncodedSize = 0
	d.idxValues = d.idxValues[:0]
	d.idxBuffer.ResizeNoShrink(0)
	d.memo.Reset()
}

// addIndex appends the passed index to the index buffer, growing the underlying
// buffer when it is already at capacity.
func (d *dictEncoder) addIndex(idx int) {
	if len(d.idxValues) == cap(d.idxValues) {
		curLen := len(d.idxValues)
		d.idxBuffer.ResizeNoShrink(arrow.Int32Traits.BytesRequired(bitutil.NextPowerOf2(curLen + 1)))
		d.idxValues = arrow.Int32Traits.CastFromBytes(d.idxBuffer.Buf())[: curLen : d.idxBuffer.Len()/arrow.Int32SizeBytes]
	}
	d.idxValues = append(d.idxValues, int32(idx))
}
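
// Growth sketch (editor's illustration): the index buffer grows to the next
// power of two, so the int32 capacity progresses roughly 1 -> 2 -> 4 -> 8 ...,
// keeping the amortized cost of addIndex constant.
//
//	// e.g. with len == cap == 4, appending one more index resizes the buffer
//	// to bitutil.NextPowerOf2(5) == 8 int32 slots before the append proceeds.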

// FlushValues writes out all of the currently buffered indexes that would become the data page
// to a buffer and returns it, or returns nil and any error encountered.
func (d *dictEncoder) FlushValues() (Buffer, error) {
	buf := bufferPool.Get().(*memory.Buffer)
	buf.Reserve(int(d.EstimatedDataEncodedSize()))
	size, err := d.WriteIndices(buf.Buf())
	if err != nil {
		poolBuffer{buf}.Release()
		return nil, err
	}
	buf.ResizeNoShrink(size)
	return poolBuffer{buf}, nil
}

// EstimatedDataEncodedSize returns the maximum number of bytes needed to store the RLE-encoded indexes,
// not including the dictionary index in the computation.
func (d *dictEncoder) EstimatedDataEncodedSize() int64 {
	return 1 + int64(utils.MaxBufferSize(d.BitWidth(), len(d.idxValues))+utils.MinBufferSize(d.BitWidth()))
}

// NumEntries returns the number of entries in the dictionary index for this encoder.
func (d *dictEncoder) NumEntries() int {
	return d.memo.Size()
}

// BitWidth returns the max bitwidth that would be necessary for encoding the index values currently
// in the dictionary based on the size of the dictionary index.
func (d *dictEncoder) BitWidth() int {
	switch d.NumEntries() {
	case 0:
		return 0
	case 1:
		return 1
	default:
		return bits.Len32(uint32(d.NumEntries() - 1))
	}
}
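
// Worked example (editor's illustration): with 1000 distinct dictionary
// entries the largest index is 999, so the required width is 10 bits; a
// single-entry dictionary still uses 1 bit so the RLE runs have a nonzero width.
//
//	bits.Len32(uint32(1000 - 1)) // == 10
//	bits.Len32(uint32(2 - 1))    // == 1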

// WriteDict writes the dictionary index to the given byte slice.
func (d *dictEncoder) WriteDict(out []byte) {
	d.memo.WriteOut(out)
}

// WriteIndices performs run-length encoding on the indexes and then writes the encoded
// index value data to the provided byte slice, returning the number of bytes actually written.
// If any error is encountered, it will return -1 and the error.
func (d *dictEncoder) WriteIndices(out []byte) (int, error) {
	out[0] = byte(d.BitWidth())

	enc := utils.NewRleEncoder(utils.NewWriterAtBuffer(out[1:]), d.BitWidth())
	for _, idx := range d.idxValues {
		if err := enc.Put(uint64(idx)); err != nil {
			return -1, err
		}
	}
	nbytes := enc.Flush()

	d.idxValues = d.idxValues[:0]
	return nbytes + 1, nil
}
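
// Layout sketch of the bytes produced by WriteIndices (editor's illustration,
// following the Parquet dictionary-encoded data page format): a single byte
// holding the bit width, followed by the RLE/bit-packed hybrid encoded indexes.
//
//	out[0]  = bit width (e.g. 10 for a 1000-entry dictionary)
//	out[1:] = RLE/bit-packed runs of the dictionary indexes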

// Put adds a value to the dictionary data column, inserting the value if it
// didn't already exist in the dictionary.
func (d *dictEncoder) Put(v interface{}) {
	memoIdx, found, err := d.memo.GetOrInsert(v)
	if err != nil {
		panic(err)
	}
	if !found {
		d.dictEncodedSize += int(reflect.TypeOf(v).Size())
	}
	d.addIndex(memoIdx)
}

// DictEncodedSize returns the current size of the encoded dictionary.
func (d *dictEncoder) DictEncodedSize() int {
	return d.dictEncodedSize
}
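
// End-to-end sketch of the dictionary encoding flow (editor's illustration;
// the typed dictionary encoders generated from typed_encoder.gen.go wrap this
// base): values are Put into the memo table, the dictionary page is produced
// with WriteDict, and the buffered indexes become the data page via WriteIndices.
//
//	d.Put(int32(7))  // index 0 recorded for value 7
//	d.Put(int32(7))  // repeated value reuses index 0
//	dict := make([]byte, d.DictEncodedSize())
//	d.WriteDict(dict) // dictionary page payload
//	idx := make([]byte, d.EstimatedDataEncodedSize())
//	n, err := d.WriteIndices(idx) // bit width byte + RLE indexes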

// spacedCompress is a helper function for encoders that copies the non-null values of src, as
// indicated by the validity bitmap, into out so that the output slice is densely packed with no
// slots left for nulls. It returns the number of valid values copied.
func spacedCompress(src, out interface{}, validBits []byte, validBitsOffset int64) int {
	nvalid := 0

	// for efficiency we use a type switch because the copy runs significantly faster when typed
	// than calling reflect.Copy
	switch s := src.(type) {
	case []int32:
		o := out.([]int32)
		reader := bitutils.NewSetBitRunReader(validBits, validBitsOffset, int64(len(s)))
		for {
			run := reader.NextRun()
			if run.Length == 0 {
				break
			}
			copy(o[nvalid:], s[int(run.Pos):int(run.Pos+run.Length)])
			nvalid += int(run.Length)
		}
	case []int64:
		o := out.([]int64)
		reader := bitutils.NewSetBitRunReader(validBits, validBitsOffset, int64(len(s)))
		for {
			run := reader.NextRun()
			if run.Length == 0 {
				break
			}
			copy(o[nvalid:], s[int(run.Pos):int(run.Pos+run.Length)])
			nvalid += int(run.Length)
		}
	case []float32:
		o := out.([]float32)
		reader := bitutils.NewSetBitRunReader(validBits, validBitsOffset, int64(len(s)))
		for {
			run := reader.NextRun()
			if run.Length == 0 {
				break
			}
			copy(o[nvalid:], s[int(run.Pos):int(run.Pos+run.Length)])
			nvalid += int(run.Length)
		}
	case []float64:
		o := out.([]float64)
		reader := bitutils.NewSetBitRunReader(validBits, validBitsOffset, int64(len(s)))
		for {
			run := reader.NextRun()
			if run.Length == 0 {
				break
			}
			copy(o[nvalid:], s[int(run.Pos):int(run.Pos+run.Length)])
			nvalid += int(run.Length)
		}
	case []parquet.ByteArray:
		o := out.([]parquet.ByteArray)
		reader := bitutils.NewSetBitRunReader(validBits, validBitsOffset, int64(len(s)))
		for {
			run := reader.NextRun()
			if run.Length == 0 {
				break
			}
			copy(o[nvalid:], s[int(run.Pos):int(run.Pos+run.Length)])
			nvalid += int(run.Length)
		}
	case []parquet.FixedLenByteArray:
		o := out.([]parquet.FixedLenByteArray)
		reader := bitutils.NewSetBitRunReader(validBits, validBitsOffset, int64(len(s)))
		for {
			run := reader.NextRun()
			if run.Length == 0 {
				break
			}
			copy(o[nvalid:], s[int(run.Pos):int(run.Pos+run.Length)])
			nvalid += int(run.Length)
		}
	case []bool:
		o := out.([]bool)
		reader := bitutils.NewSetBitRunReader(validBits, validBitsOffset, int64(len(s)))
		for {
			run := reader.NextRun()
			if run.Length == 0 {
				break
			}
			copy(o[nvalid:], s[int(run.Pos):int(run.Pos+run.Length)])
			nvalid += int(run.Length)
		}
	}

	return nvalid
}
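
// Usage sketch for spacedCompress (editor's illustration): given a spaced
// slice where positions 1 and 3 are null per the validity bitmap, the valid
// values are packed to the front of out and the count of valid values returned.
//
//	src := []int32{10, 0, 30, 0, 50}
//	out := make([]int32, len(src))
//	valid := []byte{0b00010101} // bits 0, 2, 4 set -> values 10, 30, 50 valid
//	n := spacedCompress(src, out, valid, 0)
//	// n == 3, out[:n] == []int32{10, 30, 50}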