github.com/fraugster/parquet-go@v0.12.0/page_v2.go (about)

     1  package goparquet
     2  
     3  import (
     4  	"bytes"
     5  	"context"
     6  	"fmt"
     7  	"hash/crc32"
     8  	"io"
     9  	"math/bits"
    10  
    11  	"github.com/fraugster/parquet-go/parquet"
    12  )
    13  
// dataPageReaderV2 reads values and levels from a DATA_PAGE_V2 page.
type dataPageReaderV2 struct {
	// ph is the page header this reader was initialized from.
	ph *parquet.PageHeader

	// valuesCount is the total number of values (including nulls) in the page.
	valuesCount int32
	// encoding is the value encoding declared in the page header.
	encoding      parquet.Encoding
	valuesDecoder valuesDecoder
	// dDecoder and rDecoder decode the definition and repetition level
	// streams; in page v2 these are always RLE-encoded.
	dDecoder, rDecoder levelDecoder
	// fn resolves a value decoder for the page's encoding at read time.
	fn getValueDecoderFn
	// position is the number of values consumed so far via readValues.
	position int

	alloc *allocTracker
}
    26  
// numValues returns the total number of values (including nulls) in this
// page, as declared by the page header.
func (dp *dataPageReaderV2) numValues() int32 {
	return dp.valuesCount
}
    30  
    31  func (dp *dataPageReaderV2) readValues(size int) (values []interface{}, dLevel *packedArray, rLevel *packedArray, err error) {
    32  	if rem := int(dp.valuesCount) - dp.position; rem < size {
    33  		size = rem
    34  	}
    35  
    36  	if size == 0 {
    37  		return nil, nil, nil, nil
    38  	}
    39  
    40  	rLevel, _, err = decodePackedArray(dp.rDecoder, size)
    41  	if err != nil {
    42  		return nil, nil, nil, fmt.Errorf("read repetition levels failed: %w", err)
    43  	}
    44  
    45  	var notNull int
    46  	dLevel, notNull, err = decodePackedArray(dp.dDecoder, size)
    47  	if err != nil {
    48  		return nil, nil, nil, fmt.Errorf("read definition levels failed: %w", err)
    49  	}
    50  
    51  	val := make([]interface{}, notNull)
    52  
    53  	if notNull != 0 {
    54  		if n, err := dp.valuesDecoder.decodeValues(val); err != nil {
    55  			return nil, nil, nil, fmt.Errorf("read values from page failed, need %d values but read %d: %w", notNull, n, err)
    56  		}
    57  	}
    58  	dp.position += size
    59  	return val, dLevel, rLevel, nil
    60  }
    61  
    62  func (dp *dataPageReaderV2) init(dDecoder, rDecoder getLevelDecoder, values getValueDecoderFn) error {
    63  	var err error
    64  	// Page v2 dose not have any encoding for the levels
    65  	dp.dDecoder, err = dDecoder(parquet.Encoding_RLE)
    66  	if err != nil {
    67  		return err
    68  	}
    69  	dp.rDecoder, err = rDecoder(parquet.Encoding_RLE)
    70  	if err != nil {
    71  		return err
    72  	}
    73  	dp.fn = values
    74  	dp.position = 0
    75  
    76  	return nil
    77  }
    78  
    79  func (dp *dataPageReaderV2) read(r io.Reader, ph *parquet.PageHeader, codec parquet.CompressionCodec, validateCRC bool) error {
    80  	// 1- Uncompressed size is affected by the level lens.
    81  	// 2- In page V2 the rle size is in header, not in level stream
    82  	if ph.DataPageHeaderV2 == nil {
    83  		return fmt.Errorf("null DataPageHeaderV2 in %+v", ph)
    84  	}
    85  
    86  	if dp.valuesCount = ph.DataPageHeaderV2.NumValues; dp.valuesCount < 0 {
    87  		return fmt.Errorf("negative NumValues in DATA_PAGE_V2: %d", dp.valuesCount)
    88  	}
    89  
    90  	if ph.DataPageHeaderV2.RepetitionLevelsByteLength < 0 {
    91  		return fmt.Errorf("invalid RepetitionLevelsByteLength %d", ph.DataPageHeaderV2.RepetitionLevelsByteLength)
    92  	}
    93  	if ph.DataPageHeaderV2.DefinitionLevelsByteLength < 0 {
    94  		return fmt.Errorf("invalid DefinitionLevelsByteLength %d", ph.DataPageHeaderV2.DefinitionLevelsByteLength)
    95  	}
    96  	dp.encoding = ph.DataPageHeaderV2.Encoding
    97  	dp.ph = ph
    98  
    99  	{ // to hide the govet shadow error
   100  		var err error
   101  		if dp.valuesDecoder, err = dp.fn(dp.encoding); err != nil {
   102  			return err
   103  		}
   104  	}
   105  
   106  	dataPageBlock, err := readPageBlock(r, codec, ph.GetCompressedPageSize(), ph.GetUncompressedPageSize(), validateCRC, ph.Crc, dp.alloc)
   107  	if err != nil {
   108  		return err
   109  	}
   110  
   111  	levelsSize := ph.DataPageHeaderV2.RepetitionLevelsByteLength + ph.DataPageHeaderV2.DefinitionLevelsByteLength
   112  
   113  	if ph.DataPageHeaderV2.RepetitionLevelsByteLength > 0 {
   114  		if err = dp.rDecoder.init(bytes.NewReader(dataPageBlock[:int(ph.DataPageHeaderV2.RepetitionLevelsByteLength)])); err != nil {
   115  			return fmt.Errorf("read repetition level failed: %w", err)
   116  		}
   117  	}
   118  
   119  	if ph.DataPageHeaderV2.DefinitionLevelsByteLength > 0 {
   120  		if err = dp.dDecoder.init(bytes.NewReader(dataPageBlock[int(ph.DataPageHeaderV2.RepetitionLevelsByteLength):levelsSize])); err != nil {
   121  			return fmt.Errorf("read definition level failed: %w", err)
   122  		}
   123  	}
   124  
   125  	reader, err := newBlockReader(dataPageBlock[levelsSize:], codec, ph.GetCompressedPageSize()-levelsSize, ph.GetUncompressedPageSize()-levelsSize, dp.alloc)
   126  	if err != nil {
   127  		return err
   128  	}
   129  
   130  	return dp.valuesDecoder.init(reader)
   131  }
   132  
// dataPageWriterV2 writes a single dataPage as a DATA_PAGE_V2 page.
type dataPageWriterV2 struct {
	// dictValues is the dictionary; used when dictionary is true, and also
	// passed to getValuesEncoder for plain encoding.
	dictValues []interface{}
	col        *Column
	codec      parquet.CompressionCodec
	page       *dataPage

	// dictionary selects RLE_DICTIONARY encoding of page.indexList.
	dictionary bool
	// enableCRC controls whether a CRC32 checksum is stored in the page header.
	enableCRC bool
}
   142  
// init records the column and compression codec this writer will use.
func (dp *dataPageWriterV2) init(col *Column, codec parquet.CompressionCodec) error {
	dp.col = col
	dp.codec = codec
	return nil
}
   148  
   149  func (dp *dataPageWriterV2) getHeader(comp, unComp, defSize, repSize int, isCompressed bool, pageStats *parquet.Statistics, numRows int32, crc32Checksum *int32) *parquet.PageHeader {
   150  	enc := dp.col.data.encoding()
   151  	if dp.dictionary {
   152  		enc = parquet.Encoding_RLE_DICTIONARY
   153  	}
   154  	ph := &parquet.PageHeader{
   155  		Type:                 parquet.PageType_DATA_PAGE_V2,
   156  		UncompressedPageSize: int32(unComp + defSize + repSize),
   157  		CompressedPageSize:   int32(comp + defSize + repSize),
   158  		Crc:                  crc32Checksum,
   159  		DataPageHeaderV2: &parquet.DataPageHeaderV2{
   160  			NumValues:                  int32(dp.page.numValues) + int32(dp.page.nullValues),
   161  			NumNulls:                   int32(dp.page.nullValues),
   162  			NumRows:                    numRows,
   163  			Encoding:                   enc,
   164  			DefinitionLevelsByteLength: int32(defSize),
   165  			RepetitionLevelsByteLength: int32(repSize),
   166  			IsCompressed:               isCompressed,
   167  			Statistics:                 pageStats,
   168  		},
   169  	}
   170  	return ph
   171  }
   172  
   173  func (dp *dataPageWriterV2) write(ctx context.Context, w io.Writer) (int, int, error) {
   174  	rep := &bytes.Buffer{}
   175  
   176  	// Only write repetition value higher than zero
   177  	if dp.col.MaxRepetitionLevel() > 0 {
   178  		if err := encodeLevelsV2(rep, dp.col.MaxRepetitionLevel(), dp.page.rL); err != nil {
   179  			return 0, 0, err
   180  		}
   181  	}
   182  
   183  	def := &bytes.Buffer{}
   184  
   185  	// Only write definition level higher than zero
   186  	if dp.col.MaxDefinitionLevel() > 0 {
   187  		if err := encodeLevelsV2(def, dp.col.MaxDefinitionLevel(), dp.page.dL); err != nil {
   188  			return 0, 0, err
   189  		}
   190  	}
   191  
   192  	dataBuf := &bytes.Buffer{}
   193  	enc := dp.col.data.encoding()
   194  
   195  	if dp.dictionary {
   196  		enc = parquet.Encoding_RLE_DICTIONARY
   197  	}
   198  
   199  	if dp.dictionary {
   200  		encoder := newDictEncoder(dataBuf, bits.Len(uint(len(dp.dictValues))))
   201  		if err := encoder.encodeIndices(dp.page.indexList); err != nil {
   202  			return 0, 0, err
   203  		}
   204  		if err := encoder.Close(); err != nil {
   205  			return 0, 0, err
   206  		}
   207  	} else {
   208  		encoder, err := getValuesEncoder(enc, dp.col.Element(), dp.dictValues)
   209  		if err != nil {
   210  			return 0, 0, err
   211  		}
   212  
   213  		err = encodeValue(dataBuf, encoder, dp.page.values)
   214  		if err != nil {
   215  			return 0, 0, err
   216  		}
   217  	}
   218  
   219  	comp, err := compressBlock(dataBuf.Bytes(), dp.codec)
   220  	if err != nil {
   221  		return 0, 0, fmt.Errorf("compressing data failed with %s method: %w", dp.codec, err)
   222  	}
   223  
   224  	var crc32Checksum *int32
   225  	if dp.enableCRC {
   226  		v := int32(crc32.ChecksumIEEE(append(append(rep.Bytes(), def.Bytes()...), comp...)))
   227  		crc32Checksum = &v
   228  	}
   229  
   230  	compSize, unCompSize := len(comp), len(dataBuf.Bytes())
   231  	defLen, repLen := def.Len(), rep.Len()
   232  	header := dp.getHeader(compSize, unCompSize, defLen, repLen, dp.codec != parquet.CompressionCodec_UNCOMPRESSED, dp.page.stats, int32(dp.page.numRows), crc32Checksum)
   233  	if err := writeThrift(ctx, header, w); err != nil {
   234  		return 0, 0, err
   235  	}
   236  
   237  	if err := writeFull(w, rep.Bytes()); err != nil {
   238  		return 0, 0, err
   239  	}
   240  
   241  	if err := writeFull(w, def.Bytes()); err != nil {
   242  		return 0, 0, err
   243  	}
   244  
   245  	return compSize + defLen + repLen, unCompSize + defLen + repLen, writeFull(w, comp)
   246  }
   247  
   248  func newDataPageV2Writer(useDict bool, dictValues []interface{}, page *dataPage, enableCRC bool) pageWriter {
   249  	return &dataPageWriterV2{
   250  		dictionary: useDict,
   251  		dictValues: dictValues,
   252  		page:       page,
   253  		enableCRC:  enableCRC,
   254  	}
   255  }