github.com/apache/arrow/go/v10@v10.0.1/parquet/internal/encoding/levels.go (about)

     1  // Licensed to the Apache Software Foundation (ASF) under one
     2  // or more contributor license agreements.  See the NOTICE file
     3  // distributed with this work for additional information
     4  // regarding copyright ownership.  The ASF licenses this file
     5  // to you under the Apache License, Version 2.0 (the
     6  // "License"); you may not use this file except in compliance
     7  // with the License.  You may obtain a copy of the License at
     8  //
     9  // http://www.apache.org/licenses/LICENSE-2.0
    10  //
    11  // Unless required by applicable law or agreed to in writing, software
    12  // distributed under the License is distributed on an "AS IS" BASIS,
    13  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  // See the License for the specific language governing permissions and
    15  // limitations under the License.
    16  
    17  package encoding
    18  
    19  import (
    20  	"bytes"
    21  	"encoding/binary"
    22  	"fmt"
    23  	"io"
    24  	"math/bits"
    25  
    26  	"github.com/JohnCGriffin/overflow"
    27  	"github.com/apache/arrow/go/v10/arrow/bitutil"
    28  	shared_utils "github.com/apache/arrow/go/v10/internal/utils"
    29  	"github.com/apache/arrow/go/v10/parquet"
    30  	format "github.com/apache/arrow/go/v10/parquet/internal/gen-go/parquet"
    31  	"github.com/apache/arrow/go/v10/parquet/internal/utils"
    32  	"golang.org/x/xerrors"
    33  )
    34  
// LevelEncoder is for handling the encoding of Definition and Repetition levels
// to parquet files.
type LevelEncoder struct {
	// bitWidth is the number of bits used per level; Init and Reset derive it
	// from the max level via bits.Len64.
	bitWidth int
	// rleLen holds the value returned by the most recent flush of the RLE
	// encoder and is what Len reports.
	rleLen int
	// encoding selects which of the two writers below is used.
	encoding format.Encoding
	// rle is created by Init when the encoding is RLE.
	rle *utils.RleEncoder
	// bit is created by Init when the encoding is BIT_PACKED.
	bit *utils.BitWriter
}
    44  
    45  // LevelEncodingMaxBufferSize estimates the max number of bytes needed to encode data with the
    46  // specified encoding given the max level and number of buffered values provided.
    47  func LevelEncodingMaxBufferSize(encoding parquet.Encoding, maxLvl int16, nbuffered int) int {
    48  	bitWidth := bits.Len64(uint64(maxLvl))
    49  	nbytes := 0
    50  	switch encoding {
    51  	case parquet.Encodings.RLE:
    52  		nbytes = utils.MaxBufferSize(bitWidth, nbuffered) + utils.MinBufferSize(bitWidth)
    53  	case parquet.Encodings.BitPacked:
    54  		nbytes = int(bitutil.BytesForBits(int64(nbuffered * bitWidth)))
    55  	default:
    56  		panic("parquet: unknown encoding type for levels")
    57  	}
    58  	return nbytes
    59  }
    60  
    61  // Reset resets the encoder allowing it to be reused and updating the maxlevel to the new
    62  // specified value.
    63  func (l *LevelEncoder) Reset(maxLvl int16) {
    64  	l.bitWidth = bits.Len64(uint64(maxLvl))
    65  	switch l.encoding {
    66  	case format.Encoding_RLE:
    67  		l.rle.Clear()
    68  		l.rle.BitWidth = l.bitWidth
    69  	case format.Encoding_BIT_PACKED:
    70  		l.bit.Clear()
    71  	default:
    72  		panic("parquet: unknown encoding type")
    73  	}
    74  }
    75  
    76  // Init is called to set up the desired encoding type, max level and underlying writer for a
    77  // level encoder to control where the resulting encoded buffer will end up.
    78  func (l *LevelEncoder) Init(encoding parquet.Encoding, maxLvl int16, w io.WriterAt) {
    79  	l.bitWidth = bits.Len64(uint64(maxLvl))
    80  	l.encoding = format.Encoding(encoding)
    81  	switch l.encoding {
    82  	case format.Encoding_RLE:
    83  		l.rle = utils.NewRleEncoder(w, l.bitWidth)
    84  	case format.Encoding_BIT_PACKED:
    85  		l.bit = utils.NewBitWriter(w)
    86  	default:
    87  		panic("parquet: unknown encoding type for levels")
    88  	}
    89  }
    90  
    91  // EncodeNoFlush encodes the provided levels in the encoder, but doesn't flush
    92  // the buffer and return it yet, appending these encoded values. Returns the number
    93  // of values encoded and any error encountered or nil. If err is not nil, nencoded
    94  // will be the number of values encoded before the error was encountered
    95  func (l *LevelEncoder) EncodeNoFlush(lvls []int16) (nencoded int, err error) {
    96  	if l.rle == nil && l.bit == nil {
    97  		panic("parquet: level encoders are not initialized")
    98  	}
    99  
   100  	switch l.encoding {
   101  	case format.Encoding_RLE:
   102  		for _, level := range lvls {
   103  			if err = l.rle.Put(uint64(level)); err != nil {
   104  				return
   105  			}
   106  			nencoded++
   107  		}
   108  	default:
   109  		for _, level := range lvls {
   110  			if err = l.bit.WriteValue(uint64(level), uint(l.bitWidth)); err != nil {
   111  				return
   112  			}
   113  			nencoded++
   114  		}
   115  	}
   116  	return
   117  }
   118  
   119  // Flush flushes out any encoded data to the underlying writer.
   120  func (l *LevelEncoder) Flush() {
   121  	if l.rle == nil && l.bit == nil {
   122  		panic("parquet: level encoders are not initialized")
   123  	}
   124  
   125  	switch l.encoding {
   126  	case format.Encoding_RLE:
   127  		l.rleLen = l.rle.Flush()
   128  	default:
   129  		l.bit.Flush(false)
   130  	}
   131  }
   132  
   133  // Encode encodes the slice of definition or repetition levels based on
   134  // the currently configured encoding type and returns the number of
   135  // values that were encoded.
   136  func (l *LevelEncoder) Encode(lvls []int16) (nencoded int, err error) {
   137  	if l.rle == nil && l.bit == nil {
   138  		panic("parquet: level encoders are not initialized")
   139  	}
   140  
   141  	switch l.encoding {
   142  	case format.Encoding_RLE:
   143  		defer func() { l.rleLen = l.rle.Flush() }()
   144  		for _, level := range lvls {
   145  			if err = l.rle.Put(uint64(level)); err != nil {
   146  				return
   147  			}
   148  			nencoded++
   149  		}
   150  
   151  	default:
   152  		defer l.bit.Flush(false)
   153  		for _, level := range lvls {
   154  			if err = l.bit.WriteValue(uint64(level), uint(l.bitWidth)); err != nil {
   155  				return
   156  			}
   157  			nencoded++
   158  		}
   159  	}
   160  	return
   161  }
   162  
   163  // Len returns the number of bytes that were written as Run Length encoded
   164  // levels, this is only valid for run length encoding and will panic if using
   165  // deprecated bit packed encoding.
   166  func (l *LevelEncoder) Len() int {
   167  	if l.encoding != format.Encoding_RLE {
   168  		panic("parquet: level encoder, only implemented for RLE")
   169  	}
   170  	return l.rleLen
   171  }
   172  
// LevelDecoder handles the decoding of repetition and definition levels from a
// parquet file supporting bit packed and run length encoded values.
type LevelDecoder struct {
	// bitWidth is the number of bits per level, computed as bits.Len64 of the
	// max level when data is set.
	bitWidth  int
	remaining int // the number of values left to be decoded in the input data
	// maxLvl is the level that Decode compares against when counting values.
	maxLvl int16
	// encoding selects which of the readers below Decode pulls from.
	encoding format.Encoding
	// only one of the following should ever be set at a time based on the
	// encoding format.
	rle *utils.RleDecoder
	bit *utils.BitReader
}
   185  
   186  // SetData sets in the data to be decoded by subsequent calls by specifying the encoding type
   187  // the maximum level (which is what determines the bit width), the number of values expected
   188  // and the raw bytes to decode. Returns the number of bytes expected to be decoded.
   189  func (l *LevelDecoder) SetData(encoding parquet.Encoding, maxLvl int16, nbuffered int, data []byte) (int, error) {
   190  	l.maxLvl = maxLvl
   191  	l.encoding = format.Encoding(encoding)
   192  	l.remaining = nbuffered
   193  	l.bitWidth = bits.Len64(uint64(maxLvl))
   194  
   195  	switch encoding {
   196  	case parquet.Encodings.RLE:
   197  		if len(data) < 4 {
   198  			return 0, xerrors.New("parquet: received invalid levels (corrupt data page?)")
   199  		}
   200  
   201  		nbytes := int32(binary.LittleEndian.Uint32(data[:4]))
   202  		if nbytes < 0 || nbytes > int32(len(data)-4) {
   203  			return 0, xerrors.New("parquet: received invalid number of bytes (corrupt data page?)")
   204  		}
   205  
   206  		buf := data[4:]
   207  		if l.rle == nil {
   208  			l.rle = utils.NewRleDecoder(bytes.NewReader(buf), l.bitWidth)
   209  		} else {
   210  			l.rle.Reset(bytes.NewReader(buf), l.bitWidth)
   211  		}
   212  		return int(nbytes) + 4, nil
   213  	case parquet.Encodings.BitPacked:
   214  		nbits, ok := overflow.Mul(nbuffered, l.bitWidth)
   215  		if !ok {
   216  			return 0, xerrors.New("parquet: number of buffered values too large (corrupt data page?)")
   217  		}
   218  
   219  		nbytes := bitutil.BytesForBits(int64(nbits))
   220  		if nbytes < 0 || nbytes > int64(len(data)) {
   221  			return 0, xerrors.New("parquet: recieved invalid number of bytes (corrupt data page?)")
   222  		}
   223  		if l.bit == nil {
   224  			l.bit = utils.NewBitReader(bytes.NewReader(data))
   225  		} else {
   226  			l.bit.Reset(bytes.NewReader(data))
   227  		}
   228  		return int(nbytes), nil
   229  	default:
   230  		return 0, fmt.Errorf("parquet: unknown encoding type for levels '%s'", encoding)
   231  	}
   232  }
   233  
   234  // SetDataV2 is the same as SetData but only for DataPageV2 pages and only supports
   235  // run length encoding.
   236  func (l *LevelDecoder) SetDataV2(nbytes int32, maxLvl int16, nbuffered int, data []byte) error {
   237  	if nbytes < 0 {
   238  		return xerrors.New("parquet: invalid page header (corrupt data page?)")
   239  	}
   240  
   241  	l.maxLvl = maxLvl
   242  	l.encoding = format.Encoding_RLE
   243  	l.remaining = nbuffered
   244  	l.bitWidth = bits.Len64(uint64(maxLvl))
   245  
   246  	if l.rle == nil {
   247  		l.rle = utils.NewRleDecoder(bytes.NewReader(data), l.bitWidth)
   248  	} else {
   249  		l.rle.Reset(bytes.NewReader(data), l.bitWidth)
   250  	}
   251  	return nil
   252  }
   253  
   254  // Decode decodes the bytes that were set with SetData into the slice of levels
   255  // returning the total number of levels that were decoded and the number of
   256  // values which had a level equal to the max level, indicating how many physical
   257  // values exist to be read.
   258  func (l *LevelDecoder) Decode(levels []int16) (int, int64) {
   259  	var (
   260  		buf          [1024]uint64
   261  		totaldecoded int
   262  		decoded      int
   263  		valsToRead   int64
   264  	)
   265  
   266  	n := shared_utils.Min(int64(l.remaining), int64(len(levels)))
   267  	for n > 0 {
   268  		batch := shared_utils.Min(1024, n)
   269  		switch l.encoding {
   270  		case format.Encoding_RLE:
   271  			decoded = l.rle.GetBatch(buf[:batch])
   272  		case format.Encoding_BIT_PACKED:
   273  			decoded, _ = l.bit.GetBatch(uint(l.bitWidth), buf[:batch])
   274  		}
   275  		l.remaining -= decoded
   276  		totaldecoded += decoded
   277  		n -= batch
   278  
   279  		for idx, val := range buf[:decoded] {
   280  			lvl := int16(val)
   281  			levels[idx] = lvl
   282  			if lvl == l.maxLvl {
   283  				valsToRead++
   284  			}
   285  		}
   286  		levels = levels[decoded:]
   287  	}
   288  
   289  	return totaldecoded, valsToRead
   290  }