github.com/apache/arrow/go/v14@v14.0.2/parquet/internal/encoding/delta_byte_array.go (about)

     1  // Licensed to the Apache Software Foundation (ASF) under one
     2  // or more contributor license agreements.  See the NOTICE file
     3  // distributed with this work for additional information
     4  // regarding copyright ownership.  The ASF licenses this file
     5  // to you under the Apache License, Version 2.0 (the
     6  // "License"); you may not use this file except in compliance
     7  // with the License.  You may obtain a copy of the License at
     8  //
     9  // http://www.apache.org/licenses/LICENSE-2.0
    10  //
    11  // Unless required by applicable law or agreed to in writing, software
    12  // distributed under the License is distributed on an "AS IS" BASIS,
    13  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  // See the License for the specific language governing permissions and
    15  // limitations under the License.
    16  
    17  package encoding
    18  
import (
	"fmt"

	"github.com/apache/arrow/go/v14/arrow/memory"
	"github.com/apache/arrow/go/v14/internal/utils"
	"github.com/apache/arrow/go/v14/parquet"
	"golang.org/x/xerrors"
)
    25  
// DeltaByteArrayEncoder is an encoder for writing byte arrays which are delta encoded.
// This is also known as incremental encoding or front compression: for each element
// in a sequence of strings, we store the length of the prefix it shares with the
// previous entry, plus the remaining suffix.
// See https://en.wikipedia.org/wiki/Incremental_encoding for a longer description.
//
// The output is stored as a sequence of delta-encoded prefix lengths followed by the
// suffixes encoded as delta length byte arrays.
type DeltaByteArrayEncoder struct {
	encoder

	// prefixEncoder receives the shared-prefix length of every value written.
	// It stays nil until initEncoders has been called.
	prefixEncoder *DeltaBitPackInt32Encoder
	// suffixEncoder receives the bytes of every value that follow its shared prefix.
	suffixEncoder *DeltaLengthByteArrayEncoder

	// lastVal is the most recently written value; the common prefix of the
	// next value is computed against it.
	lastVal parquet.ByteArray
}
    41  
    42  func (enc *DeltaByteArrayEncoder) EstimatedDataEncodedSize() int64 {
    43  	return enc.prefixEncoder.EstimatedDataEncodedSize() + enc.suffixEncoder.EstimatedDataEncodedSize()
    44  }
    45  
    46  func (enc *DeltaByteArrayEncoder) initEncoders() {
    47  	enc.prefixEncoder = &DeltaBitPackInt32Encoder{
    48  		deltaBitPackEncoder: &deltaBitPackEncoder{encoder: newEncoderBase(enc.encoding, nil, enc.mem)}}
    49  	enc.suffixEncoder = &DeltaLengthByteArrayEncoder{
    50  		newEncoderBase(enc.encoding, nil, enc.mem),
    51  		&DeltaBitPackInt32Encoder{
    52  			deltaBitPackEncoder: &deltaBitPackEncoder{encoder: newEncoderBase(enc.encoding, nil, enc.mem)}}}
    53  }
    54  
    55  // Type returns the underlying physical type this operates on, in this case ByteArrays only
    56  func (DeltaByteArrayEncoder) Type() parquet.Type { return parquet.Types.ByteArray }
    57  
    58  // Put writes a slice of ByteArrays to the encoder
    59  func (enc *DeltaByteArrayEncoder) Put(in []parquet.ByteArray) {
    60  	if len(in) == 0 {
    61  		return
    62  	}
    63  
    64  	var suf parquet.ByteArray
    65  	if enc.prefixEncoder == nil { // initialize our encoders if we haven't yet
    66  		enc.initEncoders()
    67  		enc.prefixEncoder.Put([]int32{0})
    68  		suf = in[0]
    69  		enc.lastVal = in[0]
    70  		enc.suffixEncoder.Put([]parquet.ByteArray{suf})
    71  		in = in[1:]
    72  	}
    73  
    74  	// for each value, figure out the common prefix with the previous value
    75  	// and then write the prefix length and the suffix.
    76  	for _, val := range in {
    77  		l1 := enc.lastVal.Len()
    78  		l2 := val.Len()
    79  		j := 0
    80  		for j < l1 && j < l2 {
    81  			if enc.lastVal[j] != val[j] {
    82  				break
    83  			}
    84  			j++
    85  		}
    86  		enc.prefixEncoder.Put([]int32{int32(j)})
    87  		suf = val[j:]
    88  		enc.suffixEncoder.Put([]parquet.ByteArray{suf})
    89  		enc.lastVal = val
    90  	}
    91  
    92  	// do the memcpy after the loops to keep a copy of the lastVal
    93  	// we do a copy here so that we only copy and keep a reference
    94  	// to the suffix, and aren't forcing the *entire* value to stay
    95  	// in memory while we have this reference to just the suffix.
    96  	enc.lastVal = append([]byte{}, enc.lastVal...)
    97  }
    98  
    99  // PutSpaced is like Put, but assumes the data is already spaced for nulls and uses the bitmap provided and offset
   100  // to compress the data before writing it without the null slots.
   101  func (enc *DeltaByteArrayEncoder) PutSpaced(in []parquet.ByteArray, validBits []byte, validBitsOffset int64) {
   102  	if validBits != nil {
   103  		data := make([]parquet.ByteArray, len(in))
   104  		nvalid := spacedCompress(in, data, validBits, validBitsOffset)
   105  		enc.Put(data[:nvalid])
   106  	} else {
   107  		enc.Put(in)
   108  	}
   109  }
   110  
   111  // Flush flushes any remaining data out and returns the finished encoded buffer.
   112  // or returns nil and any error encountered during flushing.
   113  func (enc *DeltaByteArrayEncoder) FlushValues() (Buffer, error) {
   114  	if enc.prefixEncoder == nil {
   115  		enc.initEncoders()
   116  	}
   117  	prefixBuf, err := enc.prefixEncoder.FlushValues()
   118  	if err != nil {
   119  		return nil, err
   120  	}
   121  	defer prefixBuf.Release()
   122  
   123  	suffixBuf, err := enc.suffixEncoder.FlushValues()
   124  	if err != nil {
   125  		return nil, err
   126  	}
   127  	defer suffixBuf.Release()
   128  
   129  	ret := bufferPool.Get().(*memory.Buffer)
   130  	ret.ResizeNoShrink(prefixBuf.Len() + suffixBuf.Len())
   131  	copy(ret.Bytes(), prefixBuf.Bytes())
   132  	copy(ret.Bytes()[prefixBuf.Len():], suffixBuf.Bytes())
   133  	return poolBuffer{ret}, nil
   134  }
   135  
// DeltaByteArrayDecoder is a decoder for a column of data encoded using incremental or prefix encoding.
// Each value is reconstructed from a prefix of the previously decoded value plus
// a suffix read through the embedded DeltaLengthByteArrayDecoder.
type DeltaByteArrayDecoder struct {
	*DeltaLengthByteArrayDecoder

	// prefixLengths holds the prefix lengths read by SetData that Decode has
	// not consumed yet; Decode pops one entry per value it produces.
	prefixLengths []int32
	// lastVal is the most recently decoded value. Decode treats a nil lastVal
	// as "no value decoded yet" and takes the next value from its suffix alone.
	lastVal       parquet.ByteArray
}
   143  
   144  // Type returns the underlying physical type this decoder operates on, in this case ByteArrays only
   145  func (DeltaByteArrayDecoder) Type() parquet.Type {
   146  	return parquet.Types.ByteArray
   147  }
   148  
   149  func (d *DeltaByteArrayDecoder) Allocator() memory.Allocator { return d.mem }
   150  
   151  // SetData expects the data passed in to be the prefix lengths, followed by the
   152  // blocks of suffix data in order to initialize the decoder.
   153  func (d *DeltaByteArrayDecoder) SetData(nvalues int, data []byte) error {
   154  	prefixLenDec := DeltaBitPackInt32Decoder{
   155  		deltaBitPackDecoder: &deltaBitPackDecoder{
   156  			decoder: newDecoderBase(d.encoding, d.descr),
   157  			mem:     d.mem}}
   158  
   159  	if err := prefixLenDec.SetData(nvalues, data); err != nil {
   160  		return err
   161  	}
   162  
   163  	d.prefixLengths = make([]int32, nvalues)
   164  	// decode all the prefix lengths first so we know how many bytes it took to get the
   165  	// prefix lengths for nvalues
   166  	prefixLenDec.Decode(d.prefixLengths)
   167  
   168  	// now that we know how many bytes we needed for the prefix lengths, the rest are the
   169  	// delta length byte array encoding.
   170  	return d.DeltaLengthByteArrayDecoder.SetData(nvalues, data[int(prefixLenDec.bytesRead()):])
   171  }
   172  
   173  // Decode decodes byte arrays into the slice provided and returns the number of values actually decoded
   174  func (d *DeltaByteArrayDecoder) Decode(out []parquet.ByteArray) (int, error) {
   175  	max := utils.MinInt(len(out), d.nvals)
   176  	if max == 0 {
   177  		return 0, nil
   178  	}
   179  	out = out[:max]
   180  
   181  	var err error
   182  	if d.lastVal == nil {
   183  		_, err = d.DeltaLengthByteArrayDecoder.Decode(out[:1])
   184  		if err != nil {
   185  			return 0, err
   186  		}
   187  		d.lastVal = out[0]
   188  		out = out[1:]
   189  		d.prefixLengths = d.prefixLengths[1:]
   190  	}
   191  
   192  	var prefixLen int32
   193  	suffixHolder := make([]parquet.ByteArray, 1)
   194  	for len(out) > 0 {
   195  		prefixLen, d.prefixLengths = d.prefixLengths[0], d.prefixLengths[1:]
   196  
   197  		prefix := d.lastVal[:prefixLen:prefixLen]
   198  		_, err = d.DeltaLengthByteArrayDecoder.Decode(suffixHolder)
   199  		if err != nil {
   200  			return 0, err
   201  		}
   202  
   203  		if len(suffixHolder[0]) == 0 {
   204  			d.lastVal = prefix
   205  		} else {
   206  			d.lastVal = make([]byte, int(prefixLen)+len(suffixHolder[0]))
   207  			copy(d.lastVal, prefix)
   208  			copy(d.lastVal[prefixLen:], suffixHolder[0])
   209  		}
   210  		out[0], out = d.lastVal, out[1:]
   211  	}
   212  	return max, nil
   213  }
   214  
   215  // DecodeSpaced is like decode, but the result is spaced out based on the bitmap provided.
   216  func (d *DeltaByteArrayDecoder) DecodeSpaced(out []parquet.ByteArray, nullCount int, validBits []byte, validBitsOffset int64) (int, error) {
   217  	toread := len(out) - nullCount
   218  	values, err := d.Decode(out[:toread])
   219  	if err != nil {
   220  		return values, err
   221  	}
   222  	if values != toread {
   223  		return values, xerrors.New("parquet: number of values / definition levels read did not match")
   224  	}
   225  
   226  	return spacedExpand(out, nullCount, validBits, validBitsOffset), nil
   227  }