github.com/apache/arrow/go/v7@v7.0.1/parquet/internal/encoding/delta_byte_array.go (about)

     1  // Licensed to the Apache Software Foundation (ASF) under one
     2  // or more contributor license agreements.  See the NOTICE file
     3  // distributed with this work for additional information
     4  // regarding copyright ownership.  The ASF licenses this file
     5  // to you under the Apache License, Version 2.0 (the
     6  // "License"); you may not use this file except in compliance
     7  // with the License.  You may obtain a copy of the License at
     8  //
     9  // http://www.apache.org/licenses/LICENSE-2.0
    10  //
    11  // Unless required by applicable law or agreed to in writing, software
    12  // distributed under the License is distributed on an "AS IS" BASIS,
    13  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  // See the License for the specific language governing permissions and
    15  // limitations under the License.
    16  
    17  package encoding
    18  
    19  import (
    20  	"github.com/apache/arrow/go/v7/arrow/memory"
    21  	"github.com/apache/arrow/go/v7/parquet"
    22  	"github.com/apache/arrow/go/v7/parquet/internal/utils"
    23  	"golang.org/x/xerrors"
    24  )
    25  
// DeltaByteArrayEncoder is an encoder for writing bytearrays which are delta encoded.
// This is also known as incremental encoding or front compression: for each element
// in a sequence of strings, we store the length of the prefix it shares with the
// previous entry plus the remaining suffix.
// See https://en.wikipedia.org/wiki/Incremental_encoding for a longer description.
//
// This is stored as a sequence of delta-encoded prefix lengths followed by the suffixes
// encoded as delta length byte arrays.
type DeltaByteArrayEncoder struct {
	// embedded base encoder; its encoding and mem are reused to build the
	// child encoders below.
	encoder

	// prefixEncoder receives one shared-prefix length per value. It stays nil
	// until the first non-empty Put or a FlushValues call creates it.
	prefixEncoder *DeltaBitPackInt32Encoder
	// suffixEncoder receives the bytes of each value that follow its shared
	// prefix. It is created together with prefixEncoder.
	suffixEncoder *DeltaLengthByteArrayEncoder

	// lastVal is the encoder's own copy of the most recently written value,
	// which the next value is compared against to find the shared prefix.
	lastVal parquet.ByteArray
}
    41  
    42  func (enc *DeltaByteArrayEncoder) initEncoders() {
    43  	enc.prefixEncoder = &DeltaBitPackInt32Encoder{
    44  		deltaBitPackEncoder: &deltaBitPackEncoder{encoder: newEncoderBase(enc.encoding, nil, enc.mem)}}
    45  	enc.suffixEncoder = &DeltaLengthByteArrayEncoder{
    46  		newEncoderBase(enc.encoding, nil, enc.mem),
    47  		&DeltaBitPackInt32Encoder{
    48  			deltaBitPackEncoder: &deltaBitPackEncoder{encoder: newEncoderBase(enc.encoding, nil, enc.mem)}}}
    49  }
    50  
    51  // Type returns the underlying physical type this operates on, in this case ByteArrays only
    52  func (DeltaByteArrayEncoder) Type() parquet.Type { return parquet.Types.ByteArray }
    53  
    54  // Put writes a slice of ByteArrays to the encoder
    55  func (enc *DeltaByteArrayEncoder) Put(in []parquet.ByteArray) {
    56  	if len(in) == 0 {
    57  		return
    58  	}
    59  
    60  	var suf parquet.ByteArray
    61  	if enc.prefixEncoder == nil { // initialize our encoders if we haven't yet
    62  		enc.initEncoders()
    63  		enc.prefixEncoder.Put([]int32{0})
    64  		suf = in[0]
    65  		enc.lastVal = in[0]
    66  		enc.suffixEncoder.Put([]parquet.ByteArray{suf})
    67  		in = in[1:]
    68  	}
    69  
    70  	// for each value, figure out the common prefix with the previous value
    71  	// and then write the prefix length and the suffix.
    72  	for _, val := range in {
    73  		l1 := enc.lastVal.Len()
    74  		l2 := val.Len()
    75  		j := 0
    76  		for j < l1 && j < l2 {
    77  			if enc.lastVal[j] != val[j] {
    78  				break
    79  			}
    80  			j++
    81  		}
    82  		enc.prefixEncoder.Put([]int32{int32(j)})
    83  		suf = val[j:]
    84  		enc.suffixEncoder.Put([]parquet.ByteArray{suf})
    85  		enc.lastVal = val
    86  	}
    87  
    88  	// do the memcpy after the loops to keep a copy of the lastVal
    89  	// we do a copy here so that we only copy and keep a reference
    90  	// to the suffix, and aren't forcing the *entire* value to stay
    91  	// in memory while we have this reference to just the suffix.
    92  	enc.lastVal = append([]byte{}, enc.lastVal...)
    93  }
    94  
    95  // PutSpaced is like Put, but assumes the data is already spaced for nulls and uses the bitmap provided and offset
    96  // to compress the data before writing it without the null slots.
    97  func (enc *DeltaByteArrayEncoder) PutSpaced(in []parquet.ByteArray, validBits []byte, validBitsOffset int64) {
    98  	if validBits != nil {
    99  		data := make([]parquet.ByteArray, len(in))
   100  		nvalid := spacedCompress(in, data, validBits, validBitsOffset)
   101  		enc.Put(data[:nvalid])
   102  	} else {
   103  		enc.Put(in)
   104  	}
   105  }
   106  
   107  // Flush flushes any remaining data out and returns the finished encoded buffer.
   108  // or returns nil and any error encountered during flushing.
   109  func (enc *DeltaByteArrayEncoder) FlushValues() (Buffer, error) {
   110  	if enc.prefixEncoder == nil {
   111  		enc.initEncoders()
   112  	}
   113  	prefixBuf, err := enc.prefixEncoder.FlushValues()
   114  	if err != nil {
   115  		return nil, err
   116  	}
   117  	defer prefixBuf.Release()
   118  
   119  	suffixBuf, err := enc.suffixEncoder.FlushValues()
   120  	if err != nil {
   121  		return nil, err
   122  	}
   123  	defer suffixBuf.Release()
   124  
   125  	ret := bufferPool.Get().(*memory.Buffer)
   126  	ret.ResizeNoShrink(prefixBuf.Len() + suffixBuf.Len())
   127  	copy(ret.Bytes(), prefixBuf.Bytes())
   128  	copy(ret.Bytes()[prefixBuf.Len():], suffixBuf.Bytes())
   129  	return poolBuffer{ret}, nil
   130  }
   131  
// DeltaByteArrayDecoder is a decoder for a column of data encoded using incremental or prefix encoding.
// Each value is rebuilt from a prefix of the previously decoded value plus a suffix
// read through the embedded delta length byte array decoder.
type DeltaByteArrayDecoder struct {
	// embedded decoder used to read the first value and every suffix
	*DeltaLengthByteArrayDecoder

	// prefixLengths holds the prefix lengths decoded up front by SetData;
	// entries are dropped from the front as values are decoded.
	prefixLengths []int32
	// lastVal is the most recently decoded value; it is nil until the first
	// value has been decoded.
	lastVal       parquet.ByteArray
}
   139  
   140  // Type returns the underlying physical type this decoder operates on, in this case ByteArrays only
   141  func (DeltaByteArrayDecoder) Type() parquet.Type {
   142  	return parquet.Types.ByteArray
   143  }
   144  
   145  func (d *DeltaByteArrayDecoder) Allocator() memory.Allocator { return d.mem }
   146  
   147  // SetData expects the data passed in to be the prefix lengths, followed by the
   148  // blocks of suffix data in order to initialize the decoder.
   149  func (d *DeltaByteArrayDecoder) SetData(nvalues int, data []byte) error {
   150  	prefixLenDec := DeltaBitPackInt32Decoder{
   151  		deltaBitPackDecoder: &deltaBitPackDecoder{
   152  			decoder: newDecoderBase(d.encoding, d.descr),
   153  			mem:     d.mem}}
   154  
   155  	if err := prefixLenDec.SetData(nvalues, data); err != nil {
   156  		return err
   157  	}
   158  
   159  	d.prefixLengths = make([]int32, nvalues)
   160  	// decode all the prefix lengths first so we know how many bytes it took to get the
   161  	// prefix lengths for nvalues
   162  	prefixLenDec.Decode(d.prefixLengths)
   163  
   164  	// now that we know how many bytes we needed for the prefix lengths, the rest are the
   165  	// delta length byte array encoding.
   166  	return d.DeltaLengthByteArrayDecoder.SetData(nvalues, data[int(prefixLenDec.bytesRead()):])
   167  }
   168  
   169  // Decode decodes byte arrays into the slice provided and returns the number of values actually decoded
   170  func (d *DeltaByteArrayDecoder) Decode(out []parquet.ByteArray) (int, error) {
   171  	max := utils.MinInt(len(out), d.nvals)
   172  	if max == 0 {
   173  		return 0, nil
   174  	}
   175  	out = out[:max]
   176  
   177  	var err error
   178  	if d.lastVal == nil {
   179  		_, err = d.DeltaLengthByteArrayDecoder.Decode(out[:1])
   180  		if err != nil {
   181  			return 0, err
   182  		}
   183  		d.lastVal = out[0]
   184  		out = out[1:]
   185  		d.prefixLengths = d.prefixLengths[1:]
   186  	}
   187  
   188  	var prefixLen int32
   189  	suffixHolder := make([]parquet.ByteArray, 1)
   190  	for len(out) > 0 {
   191  		prefixLen, d.prefixLengths = d.prefixLengths[0], d.prefixLengths[1:]
   192  
   193  		prefix := d.lastVal[:prefixLen:prefixLen]
   194  		_, err = d.DeltaLengthByteArrayDecoder.Decode(suffixHolder)
   195  		if err != nil {
   196  			return 0, err
   197  		}
   198  
   199  		if len(suffixHolder[0]) == 0 {
   200  			d.lastVal = prefix
   201  		} else {
   202  			d.lastVal = make([]byte, int(prefixLen)+len(suffixHolder[0]))
   203  			copy(d.lastVal, prefix)
   204  			copy(d.lastVal[prefixLen:], suffixHolder[0])
   205  		}
   206  		out[0], out = d.lastVal, out[1:]
   207  	}
   208  	return max, nil
   209  }
   210  
   211  // DecodeSpaced is like decode, but the result is spaced out based on the bitmap provided.
   212  func (d *DeltaByteArrayDecoder) DecodeSpaced(out []parquet.ByteArray, nullCount int, validBits []byte, validBitsOffset int64) (int, error) {
   213  	toread := len(out) - nullCount
   214  	values, err := d.Decode(out[:toread])
   215  	if err != nil {
   216  		return values, err
   217  	}
   218  	if values != toread {
   219  		return values, xerrors.New("parquet: number of values / definition levels read did not match")
   220  	}
   221  
   222  	return spacedExpand(out, nullCount, validBits, validBitsOffset), nil
   223  }