github.com/apache/arrow/go/v7@v7.0.1/parquet/file/page_writer.go

     1  // Licensed to the Apache Software Foundation (ASF) under one
     2  // or more contributor license agreements.  See the NOTICE file
     3  // distributed with this work for additional information
     4  // regarding copyright ownership.  The ASF licenses this file
     5  // to you under the Apache License, Version 2.0 (the
     6  // "License"); you may not use this file except in compliance
     7  // with the License.  You may obtain a copy of the License at
     8  //
     9  // http://www.apache.org/licenses/LICENSE-2.0
    10  //
    11  // Unless required by applicable law or agreed to in writing, software
    12  // distributed under the License is distributed on an "AS IS" BASIS,
    13  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  // See the License for the specific language governing permissions and
    15  // limitations under the License.
    16  
    17  package file
    18  
    19  import (
    20  	"bytes"
    21  	"sync"
    22  
    23  	"github.com/apache/arrow/go/v7/arrow/memory"
    24  	"github.com/apache/arrow/go/v7/parquet"
    25  	"github.com/apache/arrow/go/v7/parquet/compress"
    26  	"github.com/apache/arrow/go/v7/parquet/internal/encoding"
    27  	"github.com/apache/arrow/go/v7/parquet/internal/encryption"
    28  	format "github.com/apache/arrow/go/v7/parquet/internal/gen-go/parquet"
    29  	"github.com/apache/arrow/go/v7/parquet/internal/thrift"
    30  	"github.com/apache/arrow/go/v7/parquet/internal/utils"
    31  	"github.com/apache/arrow/go/v7/parquet/metadata"
    32  	libthrift "github.com/apache/thrift/lib/go/thrift"
    33  	"golang.org/x/xerrors"
    34  )
    35  
    36  // PageWriter is the interface for both serialized and buffered page writers
    37  type PageWriter interface {
     38  	// Close flushes any buffered data pages/dictionary pages and finalizes the
     39  	// column chunk metadata based on the input parameters. Subsequent calls have no effect.
    40  	Close(hasDict, fallback bool) error
     41  	// Write the provided data page out to the underlying writer
    42  	WriteDataPage(page DataPage) (int64, error)
    43  	// Write the provided dictionary page out to the underlying writer
    44  	WriteDictionaryPage(page *DictionaryPage) (int64, error)
    45  	// returns true if there is a configured compressor for the data
    46  	HasCompressor() bool
    47  	// use the configured compressor and writer properties to compress the data in src
     48  	// using the buffer buf. Returns the slice of compressed bytes, which may
     49  	// share the provided buffer's underlying memory.
    50  	Compress(buf *bytes.Buffer, src []byte) []byte
     51  	// Allow reuse of the page writer object by resetting it with these values
     52  	// instead of having to create a new object.
    53  	Reset(sink utils.WriterTell, codec compress.Compression, compressionLevel int, metadata *metadata.ColumnChunkMetaDataBuilder, rgOrdinal, columnOrdinal int16, metaEncryptor, dataEncryptor encryption.Encryptor) error
    54  }
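
         // A minimal, illustrative sketch of the intended PageWriter flow (writePages is
         // a hypothetical helper; the real callers live elsewhere in this package): write
         // an optional dictionary page first, then the data pages, then Close to finalize
         // the column chunk metadata.
         //
         //	func writePages(pw PageWriter, dict *DictionaryPage, pages []DataPage) error {
         //		if dict != nil {
         //			if _, err := pw.WriteDictionaryPage(dict); err != nil {
         //				return err
         //			}
         //		}
         //		for _, p := range pages {
         //			if _, err := pw.WriteDataPage(p); err != nil {
         //				return err
         //			}
         //		}
         //		return pw.Close(dict != nil /* hasDict */, false /* fallback */)
         //	}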
    55  
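         // serializedPageWriter writes pages directly to the supplied sink, tracking page
         // offsets, value counts, encoding statistics and compressed/uncompressed sizes
         // for the column chunk metadata that is written out on Close.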
    56  type serializedPageWriter struct {
    57  	mem      memory.Allocator
    58  	metaData *metadata.ColumnChunkMetaDataBuilder
    59  	sink     utils.WriterTell
    60  
    61  	nvalues           int64
    62  	dictPageOffset    int64
    63  	dataPageOffset    int64
    64  	totalUncompressed int64
    65  	totalCompressed   int64
    66  	pageOrdinal       int16
    67  	rgOrdinal         int16
    68  	columnOrdinal     int16
    69  
    70  	compressLevel int
    71  	compressor    compress.Codec
    72  	metaEncryptor encryption.Encryptor
    73  	dataEncryptor encryption.Encryptor
    74  	encryptionBuf bytes.Buffer
    75  
    76  	dataPageAAD       []byte
    77  	dataPageHeaderAAD []byte
    78  
    79  	dictEncodingStats map[parquet.Encoding]int32
    80  	dataEncodingStats map[parquet.Encoding]int32
    81  
    82  	thriftSerializer *thrift.Serializer
    83  }
    84  
    85  func createSerializedPageWriter(sink utils.WriterTell, codec compress.Compression, compressionLevel int, metadata *metadata.ColumnChunkMetaDataBuilder, rowGroupOrdinal, columnChunkOrdinal int16, mem memory.Allocator, metaEncryptor, dataEncryptor encryption.Encryptor) (PageWriter, error) {
    86  	var (
    87  		compressor compress.Codec
    88  		err        error
    89  	)
    90  	if codec != compress.Codecs.Uncompressed {
    91  		compressor, err = compress.GetCodec(codec)
    92  		if err != nil {
    93  			return nil, err
    94  		}
    95  	}
    96  
    97  	pgwriter := &serializedPageWriter{
    98  		sink:              sink,
    99  		compressor:        compressor,
   100  		compressLevel:     compressionLevel,
   101  		metaData:          metadata,
   102  		rgOrdinal:         rowGroupOrdinal,
   103  		columnOrdinal:     columnChunkOrdinal,
   104  		mem:               mem,
   105  		metaEncryptor:     metaEncryptor,
   106  		dataEncryptor:     dataEncryptor,
   107  		dictEncodingStats: make(map[parquet.Encoding]int32),
   108  		dataEncodingStats: make(map[parquet.Encoding]int32),
   109  		thriftSerializer:  thrift.NewThriftSerializer(),
   110  	}
   111  	if metaEncryptor != nil || dataEncryptor != nil {
   112  		pgwriter.initEncryption()
   113  	}
   114  	return pgwriter, nil
   115  }
   116  
    117  // NewPageWriter returns a page writer using either the buffered or the serialized implementation.
   118  func NewPageWriter(sink utils.WriterTell, codec compress.Compression, compressionLevel int, metadata *metadata.ColumnChunkMetaDataBuilder, rowGroupOrdinal, columnChunkOrdinal int16, mem memory.Allocator, buffered bool, metaEncryptor, dataEncryptor encryption.Encryptor) (PageWriter, error) {
   119  	if buffered {
   120  		return newBufferedPageWriter(sink, codec, compressionLevel, metadata, rowGroupOrdinal, columnChunkOrdinal, mem, metaEncryptor, dataEncryptor)
   121  	}
   122  	return createSerializedPageWriter(sink, codec, compressionLevel, metadata, rowGroupOrdinal, columnChunkOrdinal, mem, metaEncryptor, dataEncryptor)
   123  }
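
         // A hedged construction sketch; the argument values below are placeholders, and
         // in practice the sink, metadata builder and encryptors are supplied by the
         // column chunk writing machinery:
         //
         //	pw, err := NewPageWriter(sink, compress.Codecs.Snappy, compressionLevel,
         //		chunkMetaBuilder, rgOrdinal, colOrdinal, memory.DefaultAllocator,
         //		false /* buffered */, nil /* metaEncryptor */, nil /* dataEncryptor */)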
   124  
    125  // Reset allows reusing the page writer object instead of creating a new one.
   126  func (pw *serializedPageWriter) Reset(sink utils.WriterTell, codec compress.Compression, compressionLevel int, metadata *metadata.ColumnChunkMetaDataBuilder, rowGroupOrdinal, columnChunkOrdinal int16, metaEncryptor, dataEncryptor encryption.Encryptor) error {
   127  	var (
   128  		compressor compress.Codec
   129  		err        error
   130  	)
   131  	if codec != compress.Codecs.Uncompressed {
   132  		compressor, err = compress.GetCodec(codec)
   133  		if err != nil {
   134  			return err
   135  		}
   136  	}
   137  
   138  	pw.sink = sink
   139  	pw.compressor = compressor
   140  	pw.compressLevel = compressionLevel
   141  	pw.metaData = metadata
   142  	pw.rgOrdinal = rowGroupOrdinal
   143  	pw.columnOrdinal = columnChunkOrdinal
   144  	pw.metaEncryptor = metaEncryptor
   145  	pw.dataEncryptor = dataEncryptor
   146  	pw.dictEncodingStats = make(map[parquet.Encoding]int32)
   147  	pw.dataEncodingStats = make(map[parquet.Encoding]int32)
   148  
   149  	pw.nvalues = 0
   150  	pw.dictPageOffset = 0
   151  	pw.dataPageOffset = 0
   152  	pw.totalUncompressed = 0
   153  	pw.totalCompressed = 0
   154  	pw.pageOrdinal = 0
   155  
   156  	if metaEncryptor != nil || dataEncryptor != nil {
   157  		pw.initEncryption()
   158  	}
   159  	return nil
   160  }
   161  
   162  func (pw *serializedPageWriter) initEncryption() {
   163  	if pw.dataEncryptor != nil {
   164  		pw.dataPageAAD = []byte(encryption.CreateModuleAad(pw.dataEncryptor.FileAad(), encryption.DataPageModule, pw.rgOrdinal, pw.columnOrdinal, -1))
   165  	}
   166  	if pw.metaEncryptor != nil {
   167  		pw.dataPageHeaderAAD = []byte(encryption.CreateModuleAad(pw.metaEncryptor.FileAad(), encryption.DataPageHeaderModule, pw.rgOrdinal, pw.columnOrdinal, -1))
   168  	}
   169  }
   170  
   171  func (pw *serializedPageWriter) updateEncryption(moduleType int8) error {
   172  	switch moduleType {
   173  	case encryption.ColumnMetaModule:
   174  		pw.metaEncryptor.UpdateAad(encryption.CreateModuleAad(pw.metaEncryptor.FileAad(), moduleType, pw.rgOrdinal, pw.columnOrdinal, -1))
   175  	case encryption.DataPageModule:
   176  		encryption.QuickUpdatePageAad(pw.dataPageAAD, pw.pageOrdinal)
   177  		pw.dataEncryptor.UpdateAad(string(pw.dataPageAAD))
   178  	case encryption.DataPageHeaderModule:
   179  		encryption.QuickUpdatePageAad(pw.dataPageHeaderAAD, pw.pageOrdinal)
   180  		pw.metaEncryptor.UpdateAad(string(pw.dataPageHeaderAAD))
   181  	case encryption.DictPageHeaderModule:
   182  		pw.metaEncryptor.UpdateAad(encryption.CreateModuleAad(pw.metaEncryptor.FileAad(), moduleType, pw.rgOrdinal, pw.columnOrdinal, -1))
   183  	case encryption.DictPageModule:
   184  		pw.dataEncryptor.UpdateAad(encryption.CreateModuleAad(pw.dataEncryptor.FileAad(), moduleType, pw.rgOrdinal, pw.columnOrdinal, -1))
    185  	default:
    186  		return xerrors.New("unknown module type in updateEncryption")
    187  	}
         	return nil
         }
   188  
   189  func (pw *serializedPageWriter) Close(hasDict, fallback bool) error {
   190  	if pw.metaEncryptor != nil {
   191  		pw.updateEncryption(encryption.ColumnMetaModule)
   192  	}
   193  
   194  	chunkInfo := metadata.ChunkMetaInfo{
   195  		NumValues:        pw.nvalues,
   196  		DictPageOffset:   pw.dictPageOffset,
   197  		IndexPageOffset:  -1,
   198  		DataPageOffset:   pw.dataPageOffset,
   199  		CompressedSize:   pw.totalCompressed,
   200  		UncompressedSize: pw.totalUncompressed,
   201  	}
   202  	encodingStats := metadata.EncodingStats{
   203  		DictEncodingStats: pw.dictEncodingStats,
   204  		DataEncodingStats: pw.dataEncodingStats,
   205  	}
   206  	pw.metaData.Finish(chunkInfo, hasDict, fallback, encodingStats, pw.metaEncryptor)
   207  	_, err := pw.metaData.WriteTo(pw.sink)
   208  	return err
   209  }
   210  
   211  func (pw *serializedPageWriter) Compress(buf *bytes.Buffer, src []byte) []byte {
   212  	maxCompressed := pw.compressor.CompressBound(int64(len(src)))
   213  	buf.Grow(int(maxCompressed))
   214  	return pw.compressor.EncodeLevel(buf.Bytes(), src, pw.compressLevel)
   215  }
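
         // Typical pairing of HasCompressor and Compress, mirroring the dictionary page
         // path below (sketch only; pageData stands in for the raw page bytes):
         //
         //	data := pageData
         //	if pw.HasCompressor() {
         //		var buf bytes.Buffer
         //		data = pw.Compress(&buf, pageData)
         //	}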
   216  
   217  var dataPageV1HeaderPool = sync.Pool{
   218  	New: func() interface{} { return format.NewDataPageHeader() },
   219  }
   220  
   221  func (pw *serializedPageWriter) setDataPageHeader(pageHdr *format.PageHeader, page *DataPageV1) {
   222  	pageHdr.Type = format.PageType_DATA_PAGE
   223  	hdr := dataPageV1HeaderPool.Get().(*format.DataPageHeader)
   224  	hdr.NumValues = page.nvals
   225  	hdr.Encoding = page.encoding
   226  	hdr.DefinitionLevelEncoding = page.defLvlEncoding
   227  	hdr.RepetitionLevelEncoding = page.repLvlEncoding
   228  	hdr.Statistics = page.statistics.ToThrift()
   229  	pageHdr.DataPageHeader = hdr
   230  	pageHdr.DataPageHeaderV2 = nil
   231  	pageHdr.DictionaryPageHeader = nil
   232  }
   233  
   234  var dataPageV2HeaderPool = sync.Pool{
   235  	New: func() interface{} { return format.NewDataPageHeaderV2() },
   236  }
   237  
   238  func (pw *serializedPageWriter) setDataPageV2Header(pageHdr *format.PageHeader, page *DataPageV2) {
   239  	pageHdr.Type = format.PageType_DATA_PAGE_V2
   240  	hdr := dataPageV2HeaderPool.Get().(*format.DataPageHeaderV2)
   241  	hdr.NumValues = page.nvals
   242  	hdr.NumNulls = page.nulls
   243  	hdr.NumRows = page.nrows
   244  	hdr.Encoding = page.encoding
   245  	hdr.DefinitionLevelsByteLength = page.defLvlByteLen
   246  	hdr.RepetitionLevelsByteLength = page.repLvlByteLen
   247  	hdr.IsCompressed = page.compressed
   248  	hdr.Statistics = page.statistics.ToThrift()
   249  	pageHdr.DataPageHeaderV2 = hdr
   250  	pageHdr.DataPageHeader = nil
   251  	pageHdr.DictionaryPageHeader = nil
   252  }
   253  
   254  func (pw *serializedPageWriter) HasCompressor() bool          { return pw.compressor != nil }
   255  func (pw *serializedPageWriter) NumValues() int64             { return pw.nvalues }
   256  func (pw *serializedPageWriter) DictionaryPageOffset() int64  { return pw.dictPageOffset }
   257  func (pw *serializedPageWriter) DataPageoffset() int64        { return pw.dataPageOffset }
   258  func (pw *serializedPageWriter) TotalCompressedSize() int64   { return pw.totalCompressed }
   259  func (pw *serializedPageWriter) TotalUncompressedSize() int64 { return pw.totalUncompressed }
   260  
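         // WriteDictionaryPage compresses the dictionary page when a compressor is
         // configured, encrypts it when a data encryptor is present, and then writes the
         // serialized thrift page header followed by the page bytes to the sink. The
         // returned count covers both the header and the page bytes.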
   261  func (pw *serializedPageWriter) WriteDictionaryPage(page *DictionaryPage) (int64, error) {
   262  	uncompressed := len(page.Data())
   263  
   264  	var data []byte
   265  	if pw.HasCompressor() {
   266  		var buffer bytes.Buffer
   267  		data = pw.Compress(&buffer, page.Data())
   269  	} else {
   270  		data = page.Data()
   271  	}
   272  
   273  	dictPageHeader := &format.DictionaryPageHeader{
   274  		NumValues: page.NumValues(),
   275  		Encoding:  page.Encoding(),
   276  		IsSorted:  libthrift.BoolPtr(page.IsSorted()),
   277  	}
   278  
   279  	if pw.dataEncryptor != nil {
   280  		pw.updateEncryption(encryption.DictPageModule)
   281  		pw.encryptionBuf.Reset()
   282  		pw.encryptionBuf.Grow(pw.dataEncryptor.CiphertextSizeDelta() + len(data))
   283  		pw.dataEncryptor.Encrypt(&pw.encryptionBuf, data)
   284  		data = pw.encryptionBuf.Bytes()
   285  	}
   286  
   287  	pageHdr := pageHeaderPool.Get().(*format.PageHeader)
   288  	defer pageHeaderPool.Put(pageHdr)
   289  	pageHdr.Type = format.PageType_DICTIONARY_PAGE
   290  	pageHdr.UncompressedPageSize = int32(uncompressed)
   291  	pageHdr.CompressedPageSize = int32(len(data))
   292  	pageHdr.DictionaryPageHeader = dictPageHeader
   293  	pageHdr.DataPageHeader = nil
   294  	pageHdr.DataPageHeaderV2 = nil
   295  
   296  	startPos := pw.sink.Tell()
   297  	if pw.dictPageOffset == 0 {
   298  		pw.dictPageOffset = int64(startPos)
   299  	}
   300  
   301  	if pw.metaEncryptor != nil {
   302  		if err := pw.updateEncryption(encryption.DictPageHeaderModule); err != nil {
   303  			return 0, err
   304  		}
   305  	}
   306  	headerSize, err := pw.thriftSerializer.Serialize(pageHdr, pw.sink, pw.metaEncryptor)
   307  	if err != nil {
   308  		return 0, err
   309  	}
   310  	written, err := pw.sink.Write(data)
   311  	if err != nil {
   312  		return 0, err
   313  	}
   314  
   315  	written += headerSize
   316  
   317  	pw.totalUncompressed += int64(uncompressed + headerSize)
    318  	pw.totalCompressed += int64(written)
   319  	pw.dictEncodingStats[parquet.Encoding(page.encoding)]++
   320  	return int64(written), nil
   321  }
   322  
   323  var pageHeaderPool = sync.Pool{
   324  	New: func() interface{} {
   325  		return format.NewPageHeader()
   326  	},
   327  }
   328  
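         // WriteDataPage writes a v1 or v2 data page. The page bytes are written as-is
         // (any compression is expected to have been applied by the caller, e.g. via
         // Compress), optionally encrypted, and preceded by the serialized thrift page
         // header. The returned count covers both the header and the page bytes.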
   329  func (pw *serializedPageWriter) WriteDataPage(page DataPage) (int64, error) {
   330  	uncompressed := page.UncompressedSize()
   331  	data := page.Data()
   332  
   333  	if pw.dataEncryptor != nil {
   334  		if err := pw.updateEncryption(encryption.DataPageModule); err != nil {
   335  			return 0, err
   336  		}
   337  		pw.encryptionBuf.Reset()
   338  		pw.encryptionBuf.Grow(pw.dataEncryptor.CiphertextSizeDelta() + len(data))
   339  		pw.dataEncryptor.Encrypt(&pw.encryptionBuf, data)
   340  		data = pw.encryptionBuf.Bytes()
   341  	}
   342  
   343  	pageHdr := pageHeaderPool.Get().(*format.PageHeader)
   344  	defer pageHeaderPool.Put(pageHdr)
   345  	pageHdr.UncompressedPageSize = uncompressed
   346  	pageHdr.CompressedPageSize = int32(len(data))
   347  
   348  	switch dpage := page.(type) {
   349  	case *DataPageV1:
   350  		pw.setDataPageHeader(pageHdr, dpage)
   351  		defer dataPageV1HeaderPool.Put(pageHdr.DataPageHeader)
   352  	case *DataPageV2:
   353  		pw.setDataPageV2Header(pageHdr, dpage)
   354  		defer dataPageV2HeaderPool.Put(pageHdr.DataPageHeaderV2)
   355  	default:
   356  		return 0, xerrors.New("parquet: unexpected page type")
   357  	}
   358  
   359  	startPos := pw.sink.Tell()
   360  	if pw.pageOrdinal == 0 {
   361  		pw.dataPageOffset = int64(startPos)
   362  	}
   363  
   364  	if pw.metaEncryptor != nil {
   365  		if err := pw.updateEncryption(encryption.DataPageHeaderModule); err != nil {
   366  			return 0, err
   367  		}
   368  	}
   369  	headerSize, err := pw.thriftSerializer.Serialize(pageHdr, pw.sink, pw.metaEncryptor)
   370  	if err != nil {
   371  		return 0, err
   372  	}
   373  	written, err := pw.sink.Write(data)
   374  	if err != nil {
   375  		return int64(written), err
   376  	}
   377  	written += headerSize
   378  
   379  	pw.totalUncompressed += int64(uncompressed) + int64(headerSize)
   380  	pw.totalCompressed += int64(written)
   381  	pw.nvalues += int64(page.NumValues())
   382  	pw.dataEncodingStats[parquet.Encoding(page.Encoding())]++
   383  	pw.pageOrdinal++
   384  	return int64(written), nil
   385  }
   386  
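         // bufferedPageWriter accumulates pages (and, on Close, the column chunk
         // metadata) in an in-memory buffer and copies the whole buffer to the final
         // sink when Close is called, so the column chunk ends up contiguous in the file.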
   387  type bufferedPageWriter struct {
   388  	finalSink          utils.WriterTell
   389  	inMemSink          *encoding.BufferWriter
   390  	metadata           *metadata.ColumnChunkMetaDataBuilder
   391  	pager              *serializedPageWriter
   392  	hasDictionaryPages bool
   393  }
   394  
   395  func newBufferedPageWriter(sink utils.WriterTell, codec compress.Compression, compressionLevel int, metadata *metadata.ColumnChunkMetaDataBuilder, rgOrdinal, columnOrdinal int16, mem memory.Allocator, metaEncryptor, dataEncryptor encryption.Encryptor) (PageWriter, error) {
   396  	wr := &bufferedPageWriter{
   397  		finalSink:          sink,
   398  		metadata:           metadata,
   399  		hasDictionaryPages: false,
   400  		inMemSink:          encoding.NewBufferWriter(0, mem),
   401  	}
   402  	pager, err := createSerializedPageWriter(wr.inMemSink, codec, compressionLevel, metadata, rgOrdinal, columnOrdinal, mem, metaEncryptor, dataEncryptor)
   403  	if err != nil {
   404  		return nil, err
   405  	}
   406  	wr.pager = pager.(*serializedPageWriter)
   407  	return wr, nil
   408  }
   409  
   410  func (bw *bufferedPageWriter) Reset(sink utils.WriterTell, codec compress.Compression, compressionLevel int, metadata *metadata.ColumnChunkMetaDataBuilder, rgOrdinal, columnOrdinal int16, metaEncryptor, dataEncryptor encryption.Encryptor) error {
   411  	bw.finalSink = sink
   412  	bw.metadata = metadata
   413  	bw.hasDictionaryPages = false
   414  	bw.inMemSink.Reset(0)
   415  
   416  	return bw.pager.Reset(bw.inMemSink, codec, compressionLevel, metadata, rgOrdinal, columnOrdinal, metaEncryptor, dataEncryptor)
   417  }
   418  
   419  func (bw *bufferedPageWriter) WriteDictionaryPage(page *DictionaryPage) (int64, error) {
   420  	bw.hasDictionaryPages = true
   421  	return bw.pager.WriteDictionaryPage(page)
   422  }
   423  
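         // Close finalizes the buffered column chunk: the offsets recorded by the
         // in-memory pager are shifted by the final sink's current position, the chunk
         // metadata is appended to the buffer, and the buffered bytes are flushed to the
         // final sink with a single write.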
   424  func (bw *bufferedPageWriter) Close(hasDict, fallback bool) error {
   425  	if bw.pager.metaEncryptor != nil {
   426  		bw.pager.updateEncryption(encryption.ColumnMetaModule)
   427  	}
   428  
   429  	position := bw.finalSink.Tell()
   430  	dictOffset := int64(0)
   431  	if bw.hasDictionaryPages {
   432  		dictOffset = bw.pager.DictionaryPageOffset() + position
   433  	}
   434  
   435  	chunkInfo := metadata.ChunkMetaInfo{
   436  		NumValues:        bw.pager.NumValues(),
   437  		DictPageOffset:   dictOffset,
   438  		IndexPageOffset:  -1,
   439  		DataPageOffset:   bw.pager.DataPageoffset() + position,
   440  		CompressedSize:   bw.pager.TotalCompressedSize(),
   441  		UncompressedSize: bw.pager.TotalUncompressedSize(),
   442  	}
   443  	encodingStats := metadata.EncodingStats{
   444  		DictEncodingStats: bw.pager.dictEncodingStats,
   445  		DataEncodingStats: bw.pager.dataEncodingStats,
   446  	}
   447  	bw.metadata.Finish(chunkInfo, hasDict, fallback, encodingStats, bw.pager.metaEncryptor)
   448  	bw.metadata.WriteTo(bw.inMemSink)
   449  
   450  	buf := bw.inMemSink.Finish()
   451  	defer buf.Release()
   452  	_, err := bw.finalSink.Write(buf.Bytes())
   453  	return err
   454  }
   455  
   456  func (bw *bufferedPageWriter) WriteDataPage(page DataPage) (int64, error) {
   457  	return bw.pager.WriteDataPage(page)
   458  }
   459  
   460  func (bw *bufferedPageWriter) HasCompressor() bool {
   461  	return bw.pager.HasCompressor()
   462  }
   463  
   464  func (bw *bufferedPageWriter) Compress(buf *bytes.Buffer, src []byte) []byte {
   465  	return bw.pager.Compress(buf, src)
   466  }