github.com/apache/arrow/go/v7@v7.0.1/parquet/internal/encoding/levels.go (about)

     1  // Licensed to the Apache Software Foundation (ASF) under one
     2  // or more contributor license agreements.  See the NOTICE file
     3  // distributed with this work for additional information
     4  // regarding copyright ownership.  The ASF licenses this file
     5  // to you under the Apache License, Version 2.0 (the
     6  // "License"); you may not use this file except in compliance
     7  // with the License.  You may obtain a copy of the License at
     8  //
     9  // http://www.apache.org/licenses/LICENSE-2.0
    10  //
    11  // Unless required by applicable law or agreed to in writing, software
    12  // distributed under the License is distributed on an "AS IS" BASIS,
    13  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  // See the License for the specific language governing permissions and
    15  // limitations under the License.
    16  
    17  package encoding
    18  
    19  import (
    20  	"bytes"
    21  	"encoding/binary"
    22  	"io"
    23  	"math/bits"
    24  
    25  	"github.com/JohnCGriffin/overflow"
    26  	"github.com/apache/arrow/go/v7/arrow/bitutil"
    27  	"github.com/apache/arrow/go/v7/parquet"
    28  	format "github.com/apache/arrow/go/v7/parquet/internal/gen-go/parquet"
    29  	"github.com/apache/arrow/go/v7/parquet/internal/utils"
    30  	"golang.org/x/xerrors"
    31  )
    32  
    33  // LevelEncoder is for handling the encoding of Definition and Repetition levels
    34  // to parquet files.
    35  type LevelEncoder struct {
    36  	bitWidth int
    37  	rleLen   int
    38  	encoding format.Encoding
    39  	rle      *utils.RleEncoder
    40  	bit      *utils.BitWriter
    41  }
    42  
    43  // LevelEncodingMaxBufferSize estimates the max number of bytes needed to encode data with the
    44  // specified encoding given the max level and number of buffered values provided.
    45  func LevelEncodingMaxBufferSize(encoding parquet.Encoding, maxLvl int16, nbuffered int) int {
    46  	bitWidth := bits.Len64(uint64(maxLvl))
    47  	nbytes := 0
    48  	switch encoding {
    49  	case parquet.Encodings.RLE:
    50  		nbytes = utils.MaxBufferSize(bitWidth, nbuffered) + utils.MinBufferSize(bitWidth)
    51  	case parquet.Encodings.BitPacked:
    52  		nbytes = int(bitutil.BytesForBits(int64(nbuffered * bitWidth)))
    53  	default:
    54  		panic("parquet: unknown encoding type for levels")
    55  	}
    56  	return nbytes
    57  }
    58  
    59  // Reset resets the encoder allowing it to be reused and updating the maxlevel to the new
    60  // specified value.
    61  func (l *LevelEncoder) Reset(maxLvl int16) {
    62  	l.bitWidth = bits.Len64(uint64(maxLvl))
    63  	switch l.encoding {
    64  	case format.Encoding_RLE:
    65  		l.rle.Clear()
    66  		l.rle.BitWidth = l.bitWidth
    67  	case format.Encoding_BIT_PACKED:
    68  		l.bit.Clear()
    69  	default:
    70  		panic("parquet: unknown encoding type")
    71  	}
    72  }
    73  
    74  // Init is called to set up the desired encoding type, max level and underlying writer for a
    75  // level encoder to control where the resulting encoded buffer will end up.
    76  func (l *LevelEncoder) Init(encoding parquet.Encoding, maxLvl int16, w io.WriterAt) {
    77  	l.bitWidth = bits.Len64(uint64(maxLvl))
    78  	l.encoding = format.Encoding(encoding)
    79  	switch l.encoding {
    80  	case format.Encoding_RLE:
    81  		l.rle = utils.NewRleEncoder(w, l.bitWidth)
    82  	case format.Encoding_BIT_PACKED:
    83  		l.bit = utils.NewBitWriter(w)
    84  	default:
    85  		panic("parquet: unknown encoding type for levels")
    86  	}
    87  }
    88  
    89  // EncodeNoFlush encodes the provided levels in the encoder, but doesn't flush
    90  // the buffer and return it yet, appending these encoded values. Returns the number
    91  // of values encoded and any error encountered or nil. If err is not nil, nencoded
    92  // will be the number of values encoded before the error was encountered
    93  func (l *LevelEncoder) EncodeNoFlush(lvls []int16) (nencoded int, err error) {
    94  	if l.rle == nil && l.bit == nil {
    95  		panic("parquet: level encoders are not initialized")
    96  	}
    97  
    98  	switch l.encoding {
    99  	case format.Encoding_RLE:
   100  		for _, level := range lvls {
   101  			if err = l.rle.Put(uint64(level)); err != nil {
   102  				return
   103  			}
   104  			nencoded++
   105  		}
   106  	default:
   107  		for _, level := range lvls {
   108  			if err = l.bit.WriteValue(uint64(level), uint(l.bitWidth)); err != nil {
   109  				return
   110  			}
   111  			nencoded++
   112  		}
   113  	}
   114  	return
   115  }
   116  
   117  // Flush flushes out any encoded data to the underlying writer.
   118  func (l *LevelEncoder) Flush() {
   119  	if l.rle == nil && l.bit == nil {
   120  		panic("parquet: level encoders are not initialized")
   121  	}
   122  
   123  	switch l.encoding {
   124  	case format.Encoding_RLE:
   125  		l.rleLen = l.rle.Flush()
   126  	default:
   127  		l.bit.Flush(false)
   128  	}
   129  }
   130  
   131  // Encode encodes the slice of definition or repetition levels based on
   132  // the currently configured encoding type and returns the number of
   133  // values that were encoded.
   134  func (l *LevelEncoder) Encode(lvls []int16) (nencoded int, err error) {
   135  	if l.rle == nil && l.bit == nil {
   136  		panic("parquet: level encoders are not initialized")
   137  	}
   138  
   139  	switch l.encoding {
   140  	case format.Encoding_RLE:
   141  		defer func() { l.rleLen = l.rle.Flush() }()
   142  		for _, level := range lvls {
   143  			if err = l.rle.Put(uint64(level)); err != nil {
   144  				return
   145  			}
   146  			nencoded++
   147  		}
   148  
   149  	default:
   150  		defer l.bit.Flush(false)
   151  		for _, level := range lvls {
   152  			if err = l.bit.WriteValue(uint64(level), uint(l.bitWidth)); err != nil {
   153  				return
   154  			}
   155  			nencoded++
   156  		}
   157  	}
   158  	return
   159  }
   160  
   161  // Len returns the number of bytes that were written as Run Length encoded
   162  // levels, this is only valid for run length encoding and will panic if using
   163  // deprecated bit packed encoding.
   164  func (l *LevelEncoder) Len() int {
   165  	if l.encoding != format.Encoding_RLE {
   166  		panic("parquet: level encoder, only implemented for RLE")
   167  	}
   168  	return l.rleLen
   169  }
   170  
   171  // LevelDecoder handles the decoding of repetition and definition levels from a
   172  // parquet file supporting bit packed and run length encoded values.
   173  type LevelDecoder struct {
   174  	bitWidth  int
   175  	remaining int // the number of values left to be decoded in the input data
   176  	maxLvl    int16
   177  	encoding  format.Encoding
   178  	// only one of the following should ever be set at a time based on the
   179  	// encoding format.
   180  	rle *utils.RleDecoder
   181  	bit *utils.BitReader
   182  }
   183  
   184  // SetData sets in the data to be decoded by subsequent calls by specifying the encoding type
   185  // the maximum level (which is what determines the bit width), the number of values expected
   186  // and the raw bytes to decode. Returns the number of bytes expected to be decoded.
   187  func (l *LevelDecoder) SetData(encoding parquet.Encoding, maxLvl int16, nbuffered int, data []byte) (int, error) {
   188  	l.maxLvl = maxLvl
   189  	l.encoding = format.Encoding(encoding)
   190  	l.remaining = nbuffered
   191  	l.bitWidth = bits.Len64(uint64(maxLvl))
   192  
   193  	switch encoding {
   194  	case parquet.Encodings.RLE:
   195  		if len(data) < 4 {
   196  			return 0, xerrors.New("parquet: received invalid levels (corrupt data page?)")
   197  		}
   198  
   199  		nbytes := int32(binary.LittleEndian.Uint32(data[:4]))
   200  		if nbytes < 0 || nbytes > int32(len(data)-4) {
   201  			return 0, xerrors.New("parquet: received invalid number of bytes (corrupt data page?)")
   202  		}
   203  
   204  		buf := data[4:]
   205  		if l.rle == nil {
   206  			l.rle = utils.NewRleDecoder(bytes.NewReader(buf), l.bitWidth)
   207  		} else {
   208  			l.rle.Reset(bytes.NewReader(buf), l.bitWidth)
   209  		}
   210  		return int(nbytes) + 4, nil
   211  	case parquet.Encodings.BitPacked:
   212  		nbits, ok := overflow.Mul(nbuffered, l.bitWidth)
   213  		if !ok {
   214  			return 0, xerrors.New("parquet: number of buffered values too large (corrupt data page?)")
   215  		}
   216  
   217  		nbytes := bitutil.BytesForBits(int64(nbits))
   218  		if nbytes < 0 || nbytes > int64(len(data)) {
   219  			return 0, xerrors.New("parquet: recieved invalid number of bytes (corrupt data page?)")
   220  		}
   221  		if l.bit == nil {
   222  			l.bit = utils.NewBitReader(bytes.NewReader(data))
   223  		} else {
   224  			l.bit.Reset(bytes.NewReader(data))
   225  		}
   226  		return int(nbytes), nil
   227  	default:
   228  		return 0, xerrors.Errorf("parquet: unknown encoding type for levels '%s'", encoding)
   229  	}
   230  }
   231  
   232  // SetDataV2 is the same as SetData but only for DataPageV2 pages and only supports
   233  // run length encoding.
   234  func (l *LevelDecoder) SetDataV2(nbytes int32, maxLvl int16, nbuffered int, data []byte) error {
   235  	if nbytes < 0 {
   236  		return xerrors.New("parquet: invalid page header (corrupt data page?)")
   237  	}
   238  
   239  	l.maxLvl = maxLvl
   240  	l.encoding = format.Encoding_RLE
   241  	l.remaining = nbuffered
   242  	l.bitWidth = bits.Len64(uint64(maxLvl))
   243  
   244  	if l.rle == nil {
   245  		l.rle = utils.NewRleDecoder(bytes.NewReader(data), l.bitWidth)
   246  	} else {
   247  		l.rle.Reset(bytes.NewReader(data), l.bitWidth)
   248  	}
   249  	return nil
   250  }
   251  
   252  // Decode decodes the bytes that were set with SetData into the slice of levels
   253  // returning the total number of levels that were decoded and the number of
   254  // values which had a level equal to the max level, indicating how many physical
   255  // values exist to be read.
   256  func (l *LevelDecoder) Decode(levels []int16) (int, int64) {
   257  	var (
   258  		buf          [1024]uint64
   259  		totaldecoded int
   260  		decoded      int
   261  		valsToRead   int64
   262  	)
   263  
   264  	n := utils.Min(int64(l.remaining), int64(len(levels)))
   265  	for n > 0 {
   266  		batch := utils.Min(1024, n)
   267  		switch l.encoding {
   268  		case format.Encoding_RLE:
   269  			decoded = l.rle.GetBatch(buf[:batch])
   270  		case format.Encoding_BIT_PACKED:
   271  			decoded, _ = l.bit.GetBatch(uint(l.bitWidth), buf[:batch])
   272  		}
   273  		l.remaining -= decoded
   274  		totaldecoded += decoded
   275  		n -= batch
   276  
   277  		for idx, val := range buf[:decoded] {
   278  			lvl := int16(val)
   279  			levels[idx] = lvl
   280  			if lvl == l.maxLvl {
   281  				valsToRead++
   282  			}
   283  		}
   284  		levels = levels[decoded:]
   285  	}
   286  
   287  	return totaldecoded, valsToRead
   288  }