github.com/apache/arrow/go/v7@v7.0.1/parquet/internal/encoding/encoder.go (about)

     1  // Licensed to the Apache Software Foundation (ASF) under one
     2  // or more contributor license agreements.  See the NOTICE file
     3  // distributed with this work for additional information
     4  // regarding copyright ownership.  The ASF licenses this file
     5  // to you under the Apache License, Version 2.0 (the
     6  // "License"); you may not use this file except in compliance
     7  // with the License.  You may obtain a copy of the License at
     8  //
     9  // http://www.apache.org/licenses/LICENSE-2.0
    10  //
    11  // Unless required by applicable law or agreed to in writing, software
    12  // distributed under the License is distributed on an "AS IS" BASIS,
    13  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  // See the License for the specific language governing permissions and
    15  // limitations under the License.
    16  
    17  package encoding
    18  
    19  import (
    20  	"math/bits"
    21  	"reflect"
    22  
    23  	"github.com/apache/arrow/go/v7/arrow"
    24  	"github.com/apache/arrow/go/v7/arrow/bitutil"
    25  	"github.com/apache/arrow/go/v7/arrow/memory"
    26  	"github.com/apache/arrow/go/v7/parquet"
    27  	format "github.com/apache/arrow/go/v7/parquet/internal/gen-go/parquet"
    28  	"github.com/apache/arrow/go/v7/parquet/internal/utils"
    29  	"github.com/apache/arrow/go/v7/parquet/schema"
    30  )
    31  
    32  //go:generate go run ../../../arrow/_tools/tmpl/main.go -i -data=physical_types.tmpldata plain_encoder_types.gen.go.tmpl typed_encoder.gen.go.tmpl
    33  
    34  // EncoderTraits is an interface for the different types to make it more
    35  // convenient to construct encoders for specific types.
    36  type EncoderTraits interface {
    37  	Encoder(format.Encoding, bool, *schema.Column, memory.Allocator) TypedEncoder
    38  }
    39  
    40  // NewEncoder will return the appropriately typed encoder for the requested physical type
    41  // and encoding.
    42  //
    43  // If mem is nil, memory.DefaultAllocator will be used.
    44  func NewEncoder(t parquet.Type, e parquet.Encoding, useDict bool, descr *schema.Column, mem memory.Allocator) TypedEncoder {
    45  	traits := getEncodingTraits(t)
    46  	if traits == nil {
    47  		return nil
    48  	}
    49  
    50  	if mem == nil {
    51  		mem = memory.DefaultAllocator
    52  	}
    53  	return traits.Encoder(format.Encoding(e), useDict, descr, mem)
    54  }
    55  
// encoder carries the state common to the typed encoders: the column being
// encoded, the encoding in use, the allocator, and the sink that collects the
// encoded bytes.
type encoder struct {
	// descr is the column descriptor supplied at construction; it may be nil.
	descr    *schema.Column
	// encoding is the format-level encoding reported back through Encoding().
	encoding format.Encoding
	// typeLen is the column's type length for FixedLenByteArray columns and
	// -1 for everything else (see newEncoderBase).
	typeLen  int
	// mem is the allocator handed back by Allocator().
	mem      memory.Allocator

	// sink accumulates written bytes until FlushValues or Reset is called.
	sink *PooledBufferWriter
}
    64  
    65  // newEncoderBase constructs a new base encoder for embedding on the typed encoders
    66  // encapsulating the common functionality.
    67  func newEncoderBase(e format.Encoding, descr *schema.Column, mem memory.Allocator) encoder {
    68  	typelen := -1
    69  	if descr != nil && descr.PhysicalType() == parquet.Types.FixedLenByteArray {
    70  		typelen = int(descr.TypeLength())
    71  	}
    72  	return encoder{
    73  		descr:    descr,
    74  		encoding: e,
    75  		mem:      mem,
    76  		typeLen:  typelen,
    77  		sink:     NewPooledBufferWriter(1024),
    78  	}
    79  }
    80  
    81  // ReserveForWrite allocates n bytes so that the next n bytes written do not require new allocations.
    82  func (e *encoder) ReserveForWrite(n int)           { e.sink.Reserve(n) }
    83  func (e *encoder) EstimatedDataEncodedSize() int64 { return int64(e.sink.Len()) }
    84  func (e *encoder) Encoding() parquet.Encoding      { return parquet.Encoding(e.encoding) }
    85  func (e *encoder) Allocator() memory.Allocator     { return e.mem }
    86  func (e *encoder) append(data []byte)              { e.sink.Write(data) }
    87  
    88  // FlushValues flushes any unwritten data to the buffer and returns the finished encoded buffer of data.
    89  // This also clears the encoder, ownership of the data belongs to whomever called FlushValues, Release
    90  // should be called on the resulting Buffer when done.
    91  func (e *encoder) FlushValues() (Buffer, error) { return e.sink.Finish(), nil }
    92  
    93  // Bytes returns the current bytes that have been written to the encoder's buffer but doesn't transfer ownership.
    94  func (e *encoder) Bytes() []byte { return e.sink.Bytes() }
    95  
    96  // Reset drops the data currently in the encoder and resets for new use.
    97  func (e *encoder) Reset() { e.sink.Reset(0) }
    98  
// dictEncoder is the shared base for dictionary encoders. It embeds encoder
// and adds the memo table that assigns dictionary indexes plus the buffered
// indexes that have not yet been written out.
type dictEncoder struct {
	encoder

	// dictEncodedSize is the running size total that Put increases for each
	// value not previously present in memo; DictEncodedSize returns it.
	dictEncodedSize int
	// idxBuffer supplies the backing memory that idxValues is cast from.
	idxBuffer       *memory.Buffer
	// idxValues holds the dictionary indexes buffered since the last
	// WriteIndices or Reset.
	idxValues       []int32
	// memo maps each value to its dictionary index.
	memo            MemoTable
}
   107  
   108  // newDictEncoderBase constructs and returns a dictionary encoder for the appropriate type using the passed
   109  // in memo table for constructing the index.
   110  func newDictEncoderBase(descr *schema.Column, memo MemoTable, mem memory.Allocator) dictEncoder {
   111  	return dictEncoder{
   112  		encoder:   newEncoderBase(format.Encoding_PLAIN_DICTIONARY, descr, mem),
   113  		idxBuffer: memory.NewResizableBuffer(mem),
   114  		memo:      memo,
   115  	}
   116  }
   117  
   118  // Reset drops all the currently encoded values from the index and indexes from the data to allow
   119  // restarting the encoding process.
   120  func (d *dictEncoder) Reset() {
   121  	d.encoder.Reset()
   122  	d.dictEncodedSize = 0
   123  	d.idxValues = d.idxValues[:0]
   124  	d.idxBuffer.ResizeNoShrink(0)
   125  	d.memo.Reset()
   126  }
   127  
   128  // append the passed index to the indexbuffer
   129  func (d *dictEncoder) addIndex(idx int) {
   130  	if len(d.idxValues) == cap(d.idxValues) {
   131  		curLen := len(d.idxValues)
   132  		d.idxBuffer.ResizeNoShrink(arrow.Int32Traits.BytesRequired(bitutil.NextPowerOf2(curLen + 1)))
   133  		d.idxValues = arrow.Int32Traits.CastFromBytes(d.idxBuffer.Buf())[: curLen : d.idxBuffer.Len()/arrow.Int32SizeBytes]
   134  	}
   135  	d.idxValues = append(d.idxValues, int32(idx))
   136  }
   137  
   138  // FlushValues dumps all the currently buffered indexes that would become the data page to a buffer and
   139  // returns it or returns nil and any error encountered.
   140  func (d *dictEncoder) FlushValues() (Buffer, error) {
   141  	buf := bufferPool.Get().(*memory.Buffer)
   142  	buf.Reserve(int(d.EstimatedDataEncodedSize()))
   143  	size, err := d.WriteIndices(buf.Buf())
   144  	if err != nil {
   145  		poolBuffer{buf}.Release()
   146  		return nil, err
   147  	}
   148  	buf.ResizeNoShrink(size)
   149  	return poolBuffer{buf}, nil
   150  }
   151  
   152  // EstimatedDataEncodedSize returns the maximum number of bytes needed to store the RLE encoded indexes, not including the
   153  // dictionary index in the computation.
   154  func (d *dictEncoder) EstimatedDataEncodedSize() int64 {
   155  	return 1 + int64(utils.MaxBufferSize(d.BitWidth(), len(d.idxValues))+utils.MinBufferSize(d.BitWidth()))
   156  }
   157  
   158  // NumEntries returns the number of entires in the dictionary index for this encoder.
   159  func (d *dictEncoder) NumEntries() int {
   160  	return d.memo.Size()
   161  }
   162  
   163  // BitWidth returns the max bitwidth that would be necessary for encoding the index values currently
   164  // in the dictionary based on the size of the dictionary index.
   165  func (d *dictEncoder) BitWidth() int {
   166  	switch d.NumEntries() {
   167  	case 0:
   168  		return 0
   169  	case 1:
   170  		return 1
   171  	default:
   172  		return bits.Len32(uint32(d.NumEntries() - 1))
   173  	}
   174  }
   175  
   176  // WriteDict writes the dictionary index to the given byte slice.
   177  func (d *dictEncoder) WriteDict(out []byte) {
   178  	d.memo.WriteOut(out)
   179  }
   180  
   181  // WriteIndices performs Run Length encoding on the indexes and the writes the encoded
   182  // index value data to the provided byte slice, returning the number of bytes actually written.
   183  // If any error is encountered, it will return -1 and the error.
   184  func (d *dictEncoder) WriteIndices(out []byte) (int, error) {
   185  	out[0] = byte(d.BitWidth())
   186  
   187  	enc := utils.NewRleEncoder(utils.NewWriterAtBuffer(out[1:]), d.BitWidth())
   188  	for _, idx := range d.idxValues {
   189  		if err := enc.Put(uint64(idx)); err != nil {
   190  			return -1, err
   191  		}
   192  	}
   193  	nbytes := enc.Flush()
   194  
   195  	d.idxValues = d.idxValues[:0]
   196  	return nbytes + 1, nil
   197  }
   198  
   199  // Put adds a value to the dictionary data column, inserting the value if it
   200  // didn't already exist in the dictionary.
   201  func (d *dictEncoder) Put(v interface{}) {
   202  	memoIdx, found, err := d.memo.GetOrInsert(v)
   203  	if err != nil {
   204  		panic(err)
   205  	}
   206  	if !found {
   207  		d.dictEncodedSize += int(reflect.TypeOf(v).Size())
   208  	}
   209  	d.addIndex(memoIdx)
   210  }
   211  
   212  // DictEncodedSize returns the current size of the encoded dictionary
   213  func (d *dictEncoder) DictEncodedSize() int {
   214  	return d.dictEncodedSize
   215  }
   216  
   217  // spacedCompress is a helper function for encoders to remove the slots in the slices passed in according
   218  // to the bitmap which are null into an output slice that is no longer spaced out with slots for nulls.
   219  func spacedCompress(src, out interface{}, validBits []byte, validBitsOffset int64) int {
   220  	nvalid := 0
   221  
   222  	// for efficiency we use a type switch because the copy runs significantly faster when typed
   223  	// than calling reflect.Copy
   224  	switch s := src.(type) {
   225  	case []int32:
   226  		o := out.([]int32)
   227  		reader := utils.NewSetBitRunReader(validBits, validBitsOffset, int64(len(s)))
   228  		for {
   229  			run := reader.NextRun()
   230  			if run.Length == 0 {
   231  				break
   232  			}
   233  			copy(o[nvalid:], s[int(run.Pos):int(run.Pos+run.Length)])
   234  			nvalid += int(run.Length)
   235  		}
   236  	case []int64:
   237  		o := out.([]int64)
   238  		reader := utils.NewSetBitRunReader(validBits, validBitsOffset, int64(len(s)))
   239  		for {
   240  			run := reader.NextRun()
   241  			if run.Length == 0 {
   242  				break
   243  			}
   244  			copy(o[nvalid:], s[int(run.Pos):int(run.Pos+run.Length)])
   245  			nvalid += int(run.Length)
   246  		}
   247  	case []float32:
   248  		o := out.([]float32)
   249  		reader := utils.NewSetBitRunReader(validBits, validBitsOffset, int64(len(s)))
   250  		for {
   251  			run := reader.NextRun()
   252  			if run.Length == 0 {
   253  				break
   254  			}
   255  			copy(o[nvalid:], s[int(run.Pos):int(run.Pos+run.Length)])
   256  			nvalid += int(run.Length)
   257  		}
   258  	case []float64:
   259  		o := out.([]float64)
   260  		reader := utils.NewSetBitRunReader(validBits, validBitsOffset, int64(len(s)))
   261  		for {
   262  			run := reader.NextRun()
   263  			if run.Length == 0 {
   264  				break
   265  			}
   266  			copy(o[nvalid:], s[int(run.Pos):int(run.Pos+run.Length)])
   267  			nvalid += int(run.Length)
   268  		}
   269  	case []parquet.ByteArray:
   270  		o := out.([]parquet.ByteArray)
   271  		reader := utils.NewSetBitRunReader(validBits, validBitsOffset, int64(len(s)))
   272  		for {
   273  			run := reader.NextRun()
   274  			if run.Length == 0 {
   275  				break
   276  			}
   277  			copy(o[nvalid:], s[int(run.Pos):int(run.Pos+run.Length)])
   278  			nvalid += int(run.Length)
   279  		}
   280  	case []parquet.FixedLenByteArray:
   281  		o := out.([]parquet.FixedLenByteArray)
   282  		reader := utils.NewSetBitRunReader(validBits, validBitsOffset, int64(len(s)))
   283  		for {
   284  			run := reader.NextRun()
   285  			if run.Length == 0 {
   286  				break
   287  			}
   288  			copy(o[nvalid:], s[int(run.Pos):int(run.Pos+run.Length)])
   289  			nvalid += int(run.Length)
   290  		}
   291  	case []bool:
   292  		o := out.([]bool)
   293  		reader := utils.NewSetBitRunReader(validBits, validBitsOffset, int64(len(s)))
   294  		for {
   295  			run := reader.NextRun()
   296  			if run.Length == 0 {
   297  				break
   298  			}
   299  			copy(o[nvalid:], s[int(run.Pos):int(run.Pos+run.Length)])
   300  			nvalid += int(run.Length)
   301  		}
   302  	}
   303  
   304  	return nvalid
   305  }