github.com/fraugster/parquet-go@v0.12.0/page_v1.go (about)

     1  package goparquet
     2  
     3  import (
     4  	"bytes"
     5  	"context"
     6  	"errors"
     7  	"fmt"
     8  	"hash/crc32"
     9  	"io"
    10  	"math/bits"
    11  
    12  	"github.com/fraugster/parquet-go/parquet"
    13  )
    14  
// dataPageReaderV1 reads a single v1 DATA_PAGE from a parquet column chunk.
// Usage: init() wires up the level-decoder factories, read() consumes the
// page header and payload from the stream, then readValues() is called
// repeatedly to drain decoded values out of the page.
type dataPageReaderV1 struct {
	ph *parquet.PageHeader

	valuesCount        int32            // total values in the page (incl. nulls), from the page header
	encoding           parquet.Encoding // value encoding declared by the page header
	dDecoder, rDecoder levelDecoder     // definition / repetition level decoders
	valuesDecoder      valuesDecoder    // decoder for the values section, resolved via fn
	fn                 getValueDecoderFn // factory mapping an encoding to a value decoder

	position int // number of values already consumed by readValues

	alloc *allocTracker // memory accounting for buffers allocated while reading
}
    28  
// numValues returns the total number of values (including nulls) stored in
// this page, as declared by the page header.
func (dp *dataPageReaderV1) numValues() int32 {
	return dp.valuesCount
}
    32  
    33  func (dp *dataPageReaderV1) readValues(size int) (values []interface{}, dLevel *packedArray, rLevel *packedArray, err error) {
    34  	if rem := int(dp.valuesCount) - dp.position; rem < size {
    35  		size = rem
    36  	}
    37  
    38  	if size == 0 {
    39  		return nil, nil, nil, nil
    40  	}
    41  
    42  	rLevel, _, err = decodePackedArray(dp.rDecoder, size)
    43  	if err != nil {
    44  		return nil, nil, nil, fmt.Errorf("read repetition levels failed: %w", err)
    45  	}
    46  
    47  	var notNull int
    48  	dLevel, notNull, err = decodePackedArray(dp.dDecoder, size)
    49  	if err != nil {
    50  		return nil, nil, nil, fmt.Errorf("read definition levels failed: %w", err)
    51  	}
    52  
    53  	val := make([]interface{}, notNull)
    54  
    55  	if notNull != 0 {
    56  		if n, err := dp.valuesDecoder.decodeValues(val); err != nil {
    57  			return nil, nil, nil, fmt.Errorf("read values from page failed, need %d value read %d: %w", notNull, n, err)
    58  		}
    59  	}
    60  	dp.position += size
    61  
    62  	return val, dLevel, rLevel, nil
    63  }
    64  
    65  func (dp *dataPageReaderV1) init(dDecoder, rDecoder getLevelDecoder, values getValueDecoderFn) error {
    66  	if dp.ph.DataPageHeader == nil {
    67  		return errors.New("page header is missing data page header")
    68  	}
    69  
    70  	var err error
    71  	dp.rDecoder, err = rDecoder(dp.ph.DataPageHeader.RepetitionLevelEncoding)
    72  	if err != nil {
    73  		return err
    74  	}
    75  
    76  	dp.dDecoder, err = dDecoder(dp.ph.DataPageHeader.DefinitionLevelEncoding)
    77  	if err != nil {
    78  		return err
    79  	}
    80  
    81  	dp.fn = values
    82  	dp.position = 0
    83  
    84  	return nil
    85  }
    86  
// read consumes a v1 data page from r: it validates the page header,
// reads the raw page block (verifying the CRC when validateCRC is set),
// wraps it in a decompressing reader, and initializes the repetition-level,
// definition-level and value decoders to read from that stream in order.
// codec is the compression codec declared for this column chunk.
func (dp *dataPageReaderV1) read(r io.Reader, ph *parquet.PageHeader, codec parquet.CompressionCodec, validateCRC bool) (err error) {
	if ph.DataPageHeader == nil {
		return fmt.Errorf("null DataPageHeader in %+v", ph)
	}

	// A negative value count would corrupt all downstream size math.
	if dp.valuesCount = ph.DataPageHeader.NumValues; dp.valuesCount < 0 {
		return fmt.Errorf("negative NumValues in DATA_PAGE: %d", dp.valuesCount)
	}

	// Read the (possibly compressed) page payload; the optional CRC from
	// the header is checked against it when validateCRC is set.
	dataPageBlock, err := readPageBlock(r, codec, ph.GetCompressedPageSize(), ph.GetUncompressedPageSize(), validateCRC, ph.Crc, dp.alloc)
	if err != nil {
		return err
	}

	reader, err := newBlockReader(dataPageBlock, codec, ph.GetCompressedPageSize(), ph.GetUncompressedPageSize(), dp.alloc)
	if err != nil {
		return err
	}

	dp.encoding = ph.DataPageHeader.Encoding
	dp.ph = ph

	// Resolve the value decoder for the encoding declared by this page.
	if dp.valuesDecoder, err = dp.fn(dp.encoding); err != nil {
		return err
	}

	// In a v1 page the stream layout is: repetition levels, definition
	// levels, then the encoded values — initialize in that order.
	if err := dp.rDecoder.initSize(reader); err != nil {
		return err
	}

	if err := dp.dDecoder.initSize(reader); err != nil {
		return err
	}

	return dp.valuesDecoder.init(reader)
}
   123  
// dataPageWriterV1 serializes one v1 DATA_PAGE for a single column:
// init() stores the column and codec, write() encodes levels and values,
// compresses the payload and emits the thrift page header plus data.
type dataPageWriterV1 struct {
	dictValues []interface{} // dictionary entries; used for index width and value encoding
	col        *Column
	codec      parquet.CompressionCodec
	page       *dataPage // the buffered page data (levels, values, stats) to write

	dictionary bool // when set, values are written as dictionary indices (RLE_DICTIONARY)
	enableCRC  bool // when set, a CRC32 of the compressed payload goes into the header
}
   133  
// init stores the column description and the compression codec to use
// when this page is written. It currently always returns nil.
func (dp *dataPageWriterV1) init(col *Column, codec parquet.CompressionCodec) error {
	dp.col = col
	dp.codec = codec
	return nil
}
   139  
   140  func (dp *dataPageWriterV1) getHeader(comp, unComp int, pageStats *parquet.Statistics, crc32Checksum *int32) *parquet.PageHeader {
   141  	enc := dp.col.data.encoding()
   142  	if dp.dictionary {
   143  		enc = parquet.Encoding_RLE_DICTIONARY
   144  	}
   145  	ph := &parquet.PageHeader{
   146  		Type:                 parquet.PageType_DATA_PAGE,
   147  		UncompressedPageSize: int32(unComp),
   148  		CompressedPageSize:   int32(comp),
   149  		Crc:                  crc32Checksum,
   150  		DataPageHeader: &parquet.DataPageHeader{
   151  			NumValues: int32(dp.page.numValues) + int32(dp.page.nullValues),
   152  			Encoding:  enc,
   153  			// Only RLE supported for now, not sure if we need support for more encoding
   154  			DefinitionLevelEncoding: parquet.Encoding_RLE,
   155  			RepetitionLevelEncoding: parquet.Encoding_RLE,
   156  			Statistics:              pageStats,
   157  		},
   158  	}
   159  	return ph
   160  }
   161  
   162  func (dp *dataPageWriterV1) write(ctx context.Context, w io.Writer) (int, int, error) {
   163  	dataBuf := &bytes.Buffer{}
   164  	// Only write repetition value higher than zero
   165  	if dp.col.MaxRepetitionLevel() > 0 {
   166  		if err := encodeLevelsV1(dataBuf, dp.col.MaxRepetitionLevel(), dp.page.rL); err != nil {
   167  			return 0, 0, err
   168  		}
   169  	}
   170  
   171  	// Only write definition value higher than zero
   172  	if dp.col.MaxDefinitionLevel() > 0 {
   173  		if err := encodeLevelsV1(dataBuf, dp.col.MaxDefinitionLevel(), dp.page.dL); err != nil {
   174  			return 0, 0, err
   175  		}
   176  	}
   177  
   178  	enc := dp.col.data.encoding()
   179  
   180  	if dp.dictionary {
   181  		enc = parquet.Encoding_RLE_DICTIONARY
   182  	}
   183  
   184  	if dp.dictionary {
   185  		encoder := newDictEncoder(dataBuf, bits.Len(uint(len(dp.dictValues))))
   186  		if err := encoder.encodeIndices(dp.page.indexList); err != nil {
   187  			return 0, 0, err
   188  		}
   189  		if err := encoder.Close(); err != nil {
   190  			return 0, 0, err
   191  		}
   192  	} else {
   193  		encoder, err := getValuesEncoder(enc, dp.col.Element(), dp.dictValues)
   194  		if err != nil {
   195  			return 0, 0, err
   196  		}
   197  
   198  		err = encodeValue(dataBuf, encoder, dp.page.values)
   199  		if err != nil {
   200  			return 0, 0, err
   201  		}
   202  	}
   203  
   204  	comp, err := compressBlock(dataBuf.Bytes(), dp.codec)
   205  	if err != nil {
   206  		return 0, 0, fmt.Errorf("compressing data failed with %s method: %w", dp.codec, err)
   207  	}
   208  	compSize, unCompSize := len(comp), len(dataBuf.Bytes())
   209  
   210  	var crc32Checksum *int32
   211  	if dp.enableCRC {
   212  		v := int32(crc32.ChecksumIEEE(comp))
   213  		crc32Checksum = &v
   214  	}
   215  
   216  	header := dp.getHeader(compSize, unCompSize, dp.page.stats, crc32Checksum)
   217  	if err := writeThrift(ctx, header, w); err != nil {
   218  		return 0, 0, err
   219  	}
   220  
   221  	return compSize, unCompSize, writeFull(w, comp)
   222  }
   223  
   224  func newDataPageV1Writer(useDict bool, dictValues []interface{}, page *dataPage, enableCRC bool) pageWriter {
   225  	return &dataPageWriterV1{
   226  		dictionary: useDict,
   227  		dictValues: dictValues,
   228  		page:       page,
   229  		enableCRC:  enableCRC,
   230  	}
   231  }