github.com/apache/arrow/go/v7@v7.0.1/parquet/file/column_writer_test.go

// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package file_test

import (
	"bytes"
	"math"
	"reflect"
	"testing"

	"github.com/apache/arrow/go/v7/arrow/bitutil"
	"github.com/apache/arrow/go/v7/arrow/memory"
	"github.com/apache/arrow/go/v7/parquet"
	"github.com/apache/arrow/go/v7/parquet/compress"
	"github.com/apache/arrow/go/v7/parquet/file"
	"github.com/apache/arrow/go/v7/parquet/internal/encoding"
	"github.com/apache/arrow/go/v7/parquet/internal/encryption"
	format "github.com/apache/arrow/go/v7/parquet/internal/gen-go/parquet"
	"github.com/apache/arrow/go/v7/parquet/internal/testutils"
	"github.com/apache/arrow/go/v7/parquet/internal/utils"
	"github.com/apache/arrow/go/v7/parquet/metadata"
	"github.com/apache/arrow/go/v7/parquet/schema"
	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/mock"
	"github.com/stretchr/testify/suite"
)

const (
	SmallSize = 100
	// larger, to exercise some corner cases; only used by specific tests
	LargeSize = 100000
	// very large to test dictionary fallback
	VeryLargeSize = 400000
	// dictionary page size for testing fallback
	DictionaryPageSize = 1024 * 1024
)

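// mockpagewriter is a testify-based mock of the file.PageWriter interface,
// letting tests intercept and assert on the pages a column writer emits.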
type mockpagewriter struct {
	mock.Mock
}

func (m *mockpagewriter) Close(hasDict, fallBack bool) error {
	return m.Called(hasDict, fallBack).Error(0)
}
func (m *mockpagewriter) WriteDataPage(page file.DataPage) (int64, error) {
	args := m.Called(page)
	return int64(args.Int(0)), args.Error(1)
}
func (m *mockpagewriter) WriteDictionaryPage(page *file.DictionaryPage) (int64, error) {
	args := m.Called(page)
	return int64(args.Int(0)), args.Error(1)
}
func (m *mockpagewriter) HasCompressor() bool {
	return m.Called().Bool(0)
}
func (m *mockpagewriter) Compress(buf *bytes.Buffer, src []byte) []byte {
	return m.Called(buf, src).Get(0).([]byte)
}
func (m *mockpagewriter) Reset(sink utils.WriterTell, codec compress.Compression, compressionLevel int, metadata *metadata.ColumnChunkMetaDataBuilder, rgOrdinal, columnOrdinal int16, metaEncryptor, dataEncryptor encryption.Encryptor) error {
	return m.Called().Error(0)
}

func TestWriteDataPageV1NumValues(t *testing.T) {
	sc := schema.NewSchema(schema.MustGroup(schema.NewGroupNode("schema", parquet.Repetitions.Required, schema.FieldList{
		schema.Must(schema.ListOf(
			schema.Must(schema.NewPrimitiveNode("column", parquet.Repetitions.Optional, parquet.Types.Int32, -1, -1)),
			parquet.Repetitions.Optional, -1)),
	}, -1)))
	descr := sc.Column(0)
	props := parquet.NewWriterProperties(
		parquet.WithStats(true),
		parquet.WithVersion(parquet.V1_0),
		parquet.WithDataPageVersion(parquet.DataPageV1),
		parquet.WithDictionaryDefault(false))

	metadata := metadata.NewColumnChunkMetaDataBuilder(props, descr)
	pager := new(mockpagewriter)
	defer pager.AssertExpectations(t)
	pager.On("HasCompressor").Return(false)
	wr := file.NewColumnChunkWriter(metadata, pager, props).(*file.Int32ColumnChunkWriter)

	// write a list "[[0, 1], null, [2, null, 3]]"
	// should be 6 values, 2 nulls and 3 rows
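	// For this schema (an optional list of optional int32) the max definition
	// level is 3 and the max repetition level is 1: def=3 marks a present
	// value, def=2 a null element inside a list, def=0 a null list, and
	// rep=0 starts a new row.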
	wr.WriteBatch([]int32{0, 1, 2, 3},
		[]int16{3, 3, 0, 3, 2, 3},
		[]int16{0, 1, 0, 0, 1, 1})

	pager.On("WriteDataPage", mock.MatchedBy(func(page file.DataPage) bool {
		pagev1, ok := page.(*file.DataPageV1)
		if !ok {
			return false
		}

		encodedStats := pagev1.Statistics()
		// only match if the page being written has 6 values and 2 nulls
		return pagev1.NumValues() == 6 &&
			encodedStats.HasNullCount &&
			encodedStats.NullCount == 2
	})).Return(10, nil)

	wr.FlushBufferedDataPages()
	assert.EqualValues(t, 3, wr.RowsWritten())
}

func TestWriteDataPageV2NumRows(t *testing.T) {
	// test issue from PARQUET-2066
	sc := schema.NewSchema(schema.MustGroup(schema.NewGroupNode("schema", parquet.Repetitions.Required, schema.FieldList{
		schema.Must(schema.ListOf(
			schema.Must(schema.NewPrimitiveNode("column", parquet.Repetitions.Optional, parquet.Types.Int32, -1, -1)),
			parquet.Repetitions.Optional, -1)),
	}, -1)))
	descr := sc.Column(0)
	props := parquet.NewWriterProperties(
		parquet.WithStats(true),
		parquet.WithVersion(parquet.V2_LATEST),
		parquet.WithDataPageVersion(parquet.DataPageV2),
		parquet.WithDictionaryDefault(false))

	metadata := metadata.NewColumnChunkMetaDataBuilder(props, descr)
	pager := new(mockpagewriter)
	defer pager.AssertExpectations(t)
	pager.On("HasCompressor").Return(false)
	wr := file.NewColumnChunkWriter(metadata, pager, props).(*file.Int32ColumnChunkWriter)

	// write a list "[[0, 1], null, [2, null, 3]]"
	// should be 6 values, 2 nulls and 3 rows
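	// (the def/rep level encoding is explained in TestWriteDataPageV1NumValues)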
	wr.WriteBatch([]int32{0, 1, 2, 3},
		[]int16{3, 3, 0, 3, 2, 3},
		[]int16{0, 1, 0, 0, 1, 1})

	pager.On("WriteDataPage", mock.MatchedBy(func(page file.DataPage) bool {
		pagev2, ok := page.(*file.DataPageV2)
		if !ok {
			return false
		}

		encodedStats := pagev2.Statistics()
		// only match if the page being written has 2 nulls, 6 values and 3 rows
		return !pagev2.IsCompressed() &&
			pagev2.NumNulls() == 2 && encodedStats.NullCount == 2 &&
			pagev2.NumValues() == 6 &&
			pagev2.NumRows() == 3
	})).Return(10, nil)

	wr.FlushBufferedDataPages()
	assert.EqualValues(t, 3, wr.RowsWritten())
}

func TestDataPageV2RowBoundaries(t *testing.T) {
	sc := schema.NewSchema(schema.MustGroup(schema.NewGroupNode("schema", parquet.Repetitions.Required, schema.FieldList{
		schema.Must(schema.ListOf(
			schema.Must(schema.NewPrimitiveNode("column", parquet.Repetitions.Optional, parquet.Types.Int32, -1, -1)),
			parquet.Repetitions.Optional, -1)),
	}, -1)))
	descr := sc.Column(0)
	props := parquet.NewWriterProperties(
		parquet.WithBatchSize(128),
		parquet.WithDataPageSize(1024),
		parquet.WithVersion(parquet.V2_LATEST),
		parquet.WithDataPageVersion(parquet.DataPageV2),
		parquet.WithDictionaryDefault(false))

	metadata := metadata.NewColumnChunkMetaDataBuilder(props, descr)
	pager := new(mockpagewriter)
	defer pager.AssertExpectations(t)
	pager.On("HasCompressor").Return(false)
	wr := file.NewColumnChunkWriter(metadata, pager, props).(*file.Int32ColumnChunkWriter)

	pager.On("WriteDataPage", mock.MatchedBy(func(page file.DataPage) bool {
		pagev2, ok := page.(*file.DataPageV2)
		if !ok {
			return false
		}

		// only match if the page being written has no nulls, 378 values and 126 rows
		return !pagev2.IsCompressed() &&
			pagev2.NumNulls() == 0 &&
			pagev2.NumValues() == 378 &&
			pagev2.NumRows() == 126
	})).Return(10, nil)
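
	// A V2 data page must end on a row boundary (PARQUET-2066): with a
	// 1024-byte data page size and rows of three int32 values, each page
	// flushed below should hold exactly 126 complete rows (378 values).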

	// create rows, each of which is a list of 3 values
	values := make([]int32, 1024)
	defLevels := make([]int16, 1024)
	repLevels := make([]int16, 1024)
	for i := range values {
		values[i] = int32(i)
		defLevels[i] = 3

		switch i % 3 {
		case 0:
			repLevels[i] = 0
		case 1, 2:
			repLevels[i] = 1
		}
	}

	wr.WriteBatch(values, defLevels, repLevels)
}

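// PrimitiveWriterTestSuite round-trips a single primitive column through a
// column chunk writer backed by an in-memory sink. A typical flow, sketched:
//
//	writer := p.buildWriter(nrows, props, parquet.V1_0)
//	p.WriteBatchValues(writer, defLevels, repLevels)
//	writer.Close()
//	p.readAndCompare(compress.Codecs.Uncompressed, nrows)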
type PrimitiveWriterTestSuite struct {
	testutils.PrimitiveTypedTest
	suite.Suite

	props *parquet.WriterProperties
	descr *schema.Column

	metadata   *metadata.ColumnChunkMetaDataBuilder
	sink       *encoding.BufferWriter
	readbuffer *memory.Buffer
	reader     file.ColumnChunkReader
}

func (p *PrimitiveWriterTestSuite) SetupTest() {
	p.SetupValuesOut(SmallSize)
	p.props = parquet.NewWriterProperties()
	p.SetupSchema(parquet.Repetitions.Required, 1)
	p.descr = p.Schema.Column(0)
}

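// buildReader finalizes the sink and constructs a column chunk reader over
// its contents, so tests can read back exactly what was written.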
func (p *PrimitiveWriterTestSuite) buildReader(nrows int64, compression compress.Compression) file.ColumnChunkReader {
	p.readbuffer = p.sink.Finish()
	pagereader, _ := file.NewPageReader(bytes.NewReader(p.readbuffer.Bytes()), nrows, compression, mem, nil)
	return file.NewColumnReader(p.descr, pagereader, mem)
}

func (p *PrimitiveWriterTestSuite) buildWriter(_ int64, columnProps parquet.ColumnProperties, version parquet.Version) file.ColumnChunkWriter {
	p.sink = encoding.NewBufferWriter(0, mem)
	opts := make([]parquet.WriterProperty, 0)
	opts = append(opts, parquet.WithVersion(version))
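	// dictionary encoding is enabled via the writer properties rather than
	// being passed directly as the column encoding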
	if columnProps.Encoding == parquet.Encodings.PlainDict || columnProps.Encoding == parquet.Encodings.RLEDict {
		opts = append(opts, parquet.WithDictionaryDefault(true), parquet.WithDictionaryPageSizeLimit(DictionaryPageSize))
	} else {
		opts = append(opts, parquet.WithDictionaryDefault(false), parquet.WithEncoding(columnProps.Encoding))
	}
	opts = append(opts, parquet.WithMaxStatsSize(columnProps.MaxStatsSize), parquet.WithStats(columnProps.StatsEnabled))
	p.props = parquet.NewWriterProperties(opts...)

	p.metadata = metadata.NewColumnChunkMetaDataBuilder(p.props, p.descr)
	pager, _ := file.NewPageWriter(p.sink, columnProps.Codec, compress.DefaultCompressionLevel, p.metadata, -1, -1, memory.DefaultAllocator, false, nil, nil)
	return file.NewColumnChunkWriter(p.metadata, pager, p.props)
}

func (p *PrimitiveWriterTestSuite) readColumn(compression compress.Compression) int64 {
	totalValues := int64(len(p.DefLevelsOut))
	reader := p.buildReader(totalValues, compression)
	return p.ReadBatch(reader, totalValues, 0, p.DefLevelsOut, p.RepLevelsOut)
}

func (p *PrimitiveWriterTestSuite) readColumnFully(compression compress.Compression) int64 {
	totalValues := int64(len(p.DefLevelsOut))
	reader := p.buildReader(totalValues, compression)
	valuesRead := int64(0)
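	// ReadBatch may return fewer values than requested, so keep reading until
	// the entire column has been consumed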
	for valuesRead < totalValues {
		read := p.ReadBatch(reader, totalValues-valuesRead, valuesRead, p.DefLevelsOut[valuesRead:], p.RepLevelsOut[valuesRead:])
		valuesRead += read
	}
	return valuesRead
}

func (p *PrimitiveWriterTestSuite) readAndCompare(compression compress.Compression, nrows int64) {
	p.SetupValuesOut(nrows)
	p.readColumnFully(compression)
	p.Equal(p.Values, p.ValuesOut)
}

func (p *PrimitiveWriterTestSuite) writeRequiredWithSettings(encoding parquet.Encoding, compression compress.Compression, dict, stats bool, compressLvl int, nrows int64) {
	columnProperties := parquet.ColumnProperties{
		Encoding:          encoding,
		Codec:             compression,
		DictionaryEnabled: dict,
		StatsEnabled:      stats,
		CompressionLevel:  compressLvl,
	}
	writer := p.buildWriter(nrows, columnProperties, parquet.V1_0)
	p.WriteBatchValues(writer, nil, nil)
	// behavior should be independent of the number of calls to Close
	writer.Close()
	writer.Close()
}

func (p *PrimitiveWriterTestSuite) writeRequiredWithSettingsSpaced(encoding parquet.Encoding, compression compress.Compression, dict, stats bool, nrows int64, compressionLvl int) {
	validBits := make([]byte, int(bitutil.BytesForBits(int64(len(p.DefLevels))))+1)
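	// set every bit: all slots in the spaced batch are marked valid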
	memory.Set(validBits, 255)
	columnProperties := parquet.ColumnProperties{
		Encoding:          encoding,
		Codec:             compression,
		DictionaryEnabled: dict,
		StatsEnabled:      stats,
		CompressionLevel:  compressionLvl,
	}
	writer := p.buildWriter(nrows, columnProperties, parquet.V1_0)
	p.WriteBatchValuesSpaced(writer, nil, nil, validBits, 0)
	// behavior should be independent of the number of Close calls
	writer.Close()
	writer.Close()
}

func (p *PrimitiveWriterTestSuite) testRequiredWithSettings(encoding parquet.Encoding, compression compress.Compression, dict, stats bool, nrows int64, compressLvl int) {
	p.GenerateData(nrows)
	p.writeRequiredWithSettings(encoding, compression, dict, stats, compressLvl, nrows)
	p.NotPanics(func() { p.readAndCompare(compression, nrows) })
	p.writeRequiredWithSettingsSpaced(encoding, compression, dict, stats, nrows, compressLvl)
	p.NotPanics(func() { p.readAndCompare(compression, nrows) })
}

func (p *PrimitiveWriterTestSuite) testRequiredWithEncoding(encoding parquet.Encoding) {
	p.testRequiredWithSettings(encoding, compress.Codecs.Uncompressed, false, false, SmallSize, compress.DefaultCompressionLevel)
}

func (p *PrimitiveWriterTestSuite) metadataNumValues() int64 {
	// metadata accessor created lazily
	metadata, _ := metadata.NewColumnChunkMetaData(p.metadata.Contents(), p.descr, nil, 0, 0, nil)
	return metadata.NumValues()
}

func (p *PrimitiveWriterTestSuite) metadataEncodings() []parquet.Encoding {
	metadata, _ := metadata.NewColumnChunkMetaData(p.metadata.Contents(), p.descr, nil, 0, 0, nil)
	return metadata.Encodings()
}

func (p *PrimitiveWriterTestSuite) metadataEncodingStats() []metadata.PageEncodingStats {
	metadata, _ := metadata.NewColumnChunkMetaData(p.metadata.Contents(), p.descr, nil, 0, 0, nil)
	return metadata.EncodingStats()
}

func (p *PrimitiveWriterTestSuite) metadataStatsHasMinMax() (hasMin, hasMax bool) {
	appVersion := metadata.NewAppVersion(p.props.CreatedBy())
	metadata, _ := metadata.NewColumnChunkMetaData(p.metadata.Contents(), p.descr, appVersion, 0, 0, nil)
	stats, _ := metadata.Statistics()
	encoded, _ := stats.Encode()
	return encoded.HasMin, encoded.HasMax
}

func (p *PrimitiveWriterTestSuite) metadataIsStatsSet() bool {
	appVersion := metadata.NewAppVersion(p.props.CreatedBy())
	metadata, _ := metadata.NewColumnChunkMetaData(p.metadata.Contents(), p.descr, appVersion, 0, 0, nil)
	set, _ := metadata.StatsSet()
	return set
}

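// testDictionaryFallbackEncoding writes enough values (VeryLargeSize) to
// overflow the dictionary page size limit, forcing the writer to fall back
// to plain encoding partway through the chunk, then verifies both the data
// and the encodings recorded in the metadata.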
func (p *PrimitiveWriterTestSuite) testDictionaryFallbackEncoding(version parquet.Version) {
	p.GenerateData(VeryLargeSize)
	props := parquet.DefaultColumnProperties()
	props.DictionaryEnabled = true

	if version == parquet.V1_0 {
		props.Encoding = parquet.Encodings.PlainDict
	} else {
		props.Encoding = parquet.Encodings.RLEDict
	}

	writer := p.buildWriter(VeryLargeSize, props, version)
	p.WriteBatchValues(writer, nil, nil)
	writer.Close()

	// read all the rows to make sure the non-dictionary (fallback) pages are
	// also read back correctly
	p.SetupValuesOut(VeryLargeSize)
	valuesRead := p.readColumnFully(compress.Codecs.Uncompressed)
	p.EqualValues(VeryLargeSize, valuesRead)
	p.Equal(p.Values, p.ValuesOut)

	encodings := p.metadataEncodings()
	if p.Typ.Kind() == reflect.Bool || p.Typ == reflect.TypeOf(parquet.Int96{}) {
		// dictionary encoding is not supported for booleans or Int96, so
		// there are 2 encodings (PLAIN, RLE) in the non-dictionary case
		p.Equal([]parquet.Encoding{parquet.Encodings.Plain, parquet.Encodings.RLE}, encodings)
	} else if version == parquet.V1_0 {
		// There are 4 encodings (PLAIN_DICTIONARY, PLAIN, RLE, PLAIN) in a fallback case
		// for version 1.0
		p.Equal([]parquet.Encoding{parquet.Encodings.PlainDict, parquet.Encodings.Plain, parquet.Encodings.RLE, parquet.Encodings.Plain}, encodings)
	} else {
		// There are 4 encodings (RLE_DICTIONARY, PLAIN, RLE, PLAIN) in a fallback case for
		// version 2.0
		p.Equal([]parquet.Encoding{parquet.Encodings.RLEDict, parquet.Encodings.Plain, parquet.Encodings.RLE, parquet.Encodings.Plain}, encodings)
	}

	encodingStats := p.metadataEncodingStats()
	if p.Typ.Kind() == reflect.Bool || p.Typ == reflect.TypeOf(parquet.Int96{}) {
		p.Equal(parquet.Encodings.Plain, encodingStats[0].Encoding)
		p.Equal(format.PageType_DATA_PAGE, encodingStats[0].PageType)
	} else if version == parquet.V1_0 {
		expected := []metadata.PageEncodingStats{
			{Encoding: parquet.Encodings.PlainDict, PageType: format.PageType_DICTIONARY_PAGE},
			{Encoding: parquet.Encodings.Plain, PageType: format.PageType_DATA_PAGE},
			{Encoding: parquet.Encodings.PlainDict, PageType: format.PageType_DATA_PAGE}}
		p.Equal(expected[0], encodingStats[0])
		p.ElementsMatch(expected[1:], encodingStats[1:])
	} else {
		expected := []metadata.PageEncodingStats{
			{Encoding: parquet.Encodings.Plain, PageType: format.PageType_DICTIONARY_PAGE},
			{Encoding: parquet.Encodings.Plain, PageType: format.PageType_DATA_PAGE},
			{Encoding: parquet.Encodings.RLEDict, PageType: format.PageType_DATA_PAGE}}
		p.Equal(expected[0], encodingStats[0])
		p.ElementsMatch(expected[1:], encodingStats[1:])
	}
}

func (p *PrimitiveWriterTestSuite) TestRequiredPlain() {
	p.testRequiredWithEncoding(parquet.Encodings.Plain)
}

func (p *PrimitiveWriterTestSuite) TestRequiredDictionary() {
	p.testRequiredWithEncoding(parquet.Encodings.PlainDict)
}

func (p *PrimitiveWriterTestSuite) TestRequiredPlainWithStats() {
	p.testRequiredWithSettings(parquet.Encodings.Plain, compress.Codecs.Uncompressed, false, true, LargeSize, compress.DefaultCompressionLevel)
}

func (p *PrimitiveWriterTestSuite) TestRequiredPlainWithSnappy() {
	p.testRequiredWithSettings(parquet.Encodings.Plain, compress.Codecs.Snappy, false, false, LargeSize, compress.DefaultCompressionLevel)
}

func (p *PrimitiveWriterTestSuite) TestRequiredPlainWithStatsAndSnappy() {
	p.testRequiredWithSettings(parquet.Encodings.Plain, compress.Codecs.Snappy, false, true, LargeSize, compress.DefaultCompressionLevel)
}

func (p *PrimitiveWriterTestSuite) TestRequiredPlainWithBrotli() {
	p.testRequiredWithSettings(parquet.Encodings.Plain, compress.Codecs.Brotli, false, false, LargeSize, compress.DefaultCompressionLevel)
}

func (p *PrimitiveWriterTestSuite) TestRequiredPlainWithBrotliAndLevel() {
	p.testRequiredWithSettings(parquet.Encodings.Plain, compress.Codecs.Brotli, false, false, LargeSize, 10)
}

func (p *PrimitiveWriterTestSuite) TestRequiredPlainWithStatsAndBrotli() {
	p.testRequiredWithSettings(parquet.Encodings.Plain, compress.Codecs.Brotli, false, true, LargeSize, compress.DefaultCompressionLevel)
}

func (p *PrimitiveWriterTestSuite) TestRequiredPlainWithGzip() {
	p.testRequiredWithSettings(parquet.Encodings.Plain, compress.Codecs.Gzip, false, false, LargeSize, compress.DefaultCompressionLevel)
}

func (p *PrimitiveWriterTestSuite) TestRequiredPlainWithGzipAndLevel() {
	p.testRequiredWithSettings(parquet.Encodings.Plain, compress.Codecs.Gzip, false, false, LargeSize, 10)
}

func (p *PrimitiveWriterTestSuite) TestRequiredPlainWithStatsAndGzip() {
	p.testRequiredWithSettings(parquet.Encodings.Plain, compress.Codecs.Gzip, false, true, LargeSize, compress.DefaultCompressionLevel)
}

func (p *PrimitiveWriterTestSuite) TestRequiredPlainWithZstd() {
	p.testRequiredWithSettings(parquet.Encodings.Plain, compress.Codecs.Zstd, false, false, LargeSize, compress.DefaultCompressionLevel)
}

func (p *PrimitiveWriterTestSuite) TestRequiredPlainWithZstdAndLevel() {
	p.testRequiredWithSettings(parquet.Encodings.Plain, compress.Codecs.Zstd, false, false, LargeSize, 6)
}

func (p *PrimitiveWriterTestSuite) TestRequiredPlainWithStatsAndZstd() {
	p.testRequiredWithSettings(parquet.Encodings.Plain, compress.Codecs.Zstd, false, true, LargeSize, compress.DefaultCompressionLevel)
}

func (p *PrimitiveWriterTestSuite) TestOptionalNonRepeated() {
	p.SetupSchema(parquet.Repetitions.Optional, 1)
	p.descr = p.Schema.Column(0)

	p.GenerateData(SmallSize)
	p.DefLevels[1] = 0

	writer := p.buildWriter(SmallSize, parquet.DefaultColumnProperties(), parquet.V1_0)
	p.WriteBatchValues(writer, p.DefLevels, nil)
	writer.Close()

	p.Equal(int64(100), p.metadataNumValues())

	values := p.readColumn(compress.Codecs.Uncompressed)
	p.EqualValues(99, values)
	p.Equal(reflect.ValueOf(p.Values).Slice(0, 99).Interface(), reflect.ValueOf(p.ValuesOut).Slice(0, 99).Interface())
}

func (p *PrimitiveWriterTestSuite) TestOptionalSpaced() {
	p.SetupSchema(parquet.Repetitions.Optional, 1)
	p.descr = p.Schema.Column(0)

	p.GenerateData(SmallSize)
	validBits := make([]byte, int(bitutil.BytesForBits(SmallSize)))
	memory.Set(validBits, 255)
	p.DefLevels[SmallSize-1] = 0
	bitutil.ClearBit(validBits, SmallSize-1)
	p.DefLevels[1] = 0
	bitutil.ClearBit(validBits, 1)

	writer := p.buildWriter(SmallSize, parquet.DefaultColumnProperties(), parquet.V1_0)
	p.WriteBatchValuesSpaced(writer, p.DefLevels, nil, validBits, 0)
	writer.Close()

	p.Equal(int64(100), p.metadataNumValues())

	values := p.readColumn(compress.Codecs.Uncompressed)
	p.EqualValues(98, values)

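	// drop the two nulled slots (indices 1 and SmallSize-1) from the
	// expected values before comparing with what was read back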
	orig := reflect.ValueOf(p.Values)
	orig = orig.Slice(0, 99)
	reflect.Copy(orig.Slice(1, orig.Len()), orig.Slice(2, orig.Len()))
	orig = orig.Slice(0, 98)
	out := reflect.ValueOf(p.ValuesOut)
	out = out.Slice(0, 98)

	p.Equal(orig.Interface(), out.Interface())
}

func (p *PrimitiveWriterTestSuite) TestWriteRepeated() {
	// optional and repeated, so we need both definition and repetition levels
	p.SetupSchema(parquet.Repetitions.Repeated, 1)
	p.descr = p.Schema.Column(0)
	p.GenerateData(SmallSize)
	p.DefLevels[1] = 0
	p.RepLevels = make([]int16, SmallSize)
	for idx := range p.RepLevels {
		p.RepLevels[idx] = 0
	}

	writer := p.buildWriter(SmallSize, parquet.DefaultColumnProperties(), parquet.V1_0)
	p.WriteBatchValues(writer, p.DefLevels, p.RepLevels)
	writer.Close()

	values := p.readColumn(compress.Codecs.Uncompressed)
	p.EqualValues(SmallSize-1, values)
	out := reflect.ValueOf(p.ValuesOut).Slice(0, SmallSize-1).Interface()
	vals := reflect.ValueOf(p.Values).Slice(0, SmallSize-1).Interface()
	p.Equal(vals, out)
}

func (p *PrimitiveWriterTestSuite) TestRequiredLargeChunk() {
	p.GenerateData(LargeSize)

	// required and non-repeated, so no def or rep levels
	writer := p.buildWriter(LargeSize, parquet.DefaultColumnProperties(), parquet.V1_0)
	p.WriteBatchValues(writer, nil, nil)
	writer.Close()

	// just read the first SmallSize rows to ensure we can read them back in
	values := p.readColumn(compress.Codecs.Uncompressed)
	p.EqualValues(SmallSize, values)
	p.Equal(reflect.ValueOf(p.Values).Slice(0, SmallSize).Interface(), p.ValuesOut)
}

func (p *PrimitiveWriterTestSuite) TestDictionaryFallbackEncodingV1() {
	p.testDictionaryFallbackEncoding(parquet.V1_0)
}

func (p *PrimitiveWriterTestSuite) TestDictionaryFallbackEncodingV2() {
	p.testDictionaryFallbackEncoding(parquet.V2_LATEST)
}

func (p *PrimitiveWriterTestSuite) TestOptionalNullValueChunk() {
	// test case for NULL values
	p.SetupSchema(parquet.Repetitions.Optional, 1)
	p.descr = p.Schema.Column(0)
	p.GenerateData(LargeSize)
	p.RepLevels = make([]int16, LargeSize)
	for idx := range p.DefLevels {
		p.DefLevels[idx] = 0
		p.RepLevels[idx] = 0
	}

	writer := p.buildWriter(LargeSize, parquet.DefaultColumnProperties(), parquet.V1_0)
	p.WriteBatchValues(writer, p.DefLevels, p.RepLevels)
	writer.Close()

	valuesRead := p.readColumn(compress.Codecs.Uncompressed)
	p.Zero(valuesRead)
}

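// createWriterTestSuite returns the specialized suite for types with extra
// tests (booleans and byte arrays) and the generic suite for everything else.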
func createWriterTestSuite(typ reflect.Type) suite.TestingSuite {
	switch typ {
	case reflect.TypeOf(true):
		return &BooleanValueWriterSuite{PrimitiveWriterTestSuite{PrimitiveTypedTest: testutils.NewPrimitiveTypedTest(typ)}}
	case reflect.TypeOf(parquet.ByteArray{}):
		return &ByteArrayWriterSuite{PrimitiveWriterTestSuite{PrimitiveTypedTest: testutils.NewPrimitiveTypedTest(typ)}}
	}
	return &PrimitiveWriterTestSuite{PrimitiveTypedTest: testutils.NewPrimitiveTypedTest(typ)}
}

func TestColumnWriter(t *testing.T) {
	t.Parallel()
	types := []struct {
		typ reflect.Type
	}{
		{reflect.TypeOf(true)},
		{reflect.TypeOf(int32(0))},
		{reflect.TypeOf(int64(0))},
		{reflect.TypeOf(float32(0))},
		{reflect.TypeOf(float64(0))},
		{reflect.TypeOf(parquet.Int96{})},
		{reflect.TypeOf(parquet.ByteArray{})},
		{reflect.TypeOf(parquet.FixedLenByteArray{})},
	}
	for _, tt := range types {
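		// capture the range variable so each parallel subtest sees its own copy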
		tt := tt
		t.Run(tt.typ.String(), func(t *testing.T) {
			t.Parallel()
			suite.Run(t, createWriterTestSuite(tt.typ))
		})
	}
}

type ByteArrayWriterSuite struct {
	PrimitiveWriterTestSuite
}

func (b *ByteArrayWriterSuite) TestOmitStats() {
	// prevent writing large min/max statistics into the column metadata
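	// values of 4-8KiB should exceed the writer's default maximum statistics
	// size, so the encoded min/max ought to be dropped automatically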
	minLen := 1024 * 4
	maxLen := 1024 * 8
	b.SetupSchema(parquet.Repetitions.Required, 1)
	b.Values = make([]parquet.ByteArray, SmallSize)
	writer := b.buildWriter(SmallSize, parquet.DefaultColumnProperties(), parquet.V1_0)
	testutils.RandomByteArray(0, b.Values.([]parquet.ByteArray), b.Buffer, minLen, maxLen)
	writer.(*file.ByteArrayColumnChunkWriter).WriteBatch(b.Values.([]parquet.ByteArray), nil, nil)
	writer.Close()

	hasMin, hasMax := b.metadataStatsHasMinMax()
	b.False(hasMin)
	b.False(hasMax)
}

func (b *ByteArrayWriterSuite) TestOmitDataPageStats() {
	// prevent writing large stats in DataPageHeader
	minLen := math.Pow10(7)
	maxLen := math.Pow10(7)
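	// a single 10MB value would make the page header statistics enormous;
	// with stats disabled the reader should still handle the page fine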
	b.SetupSchema(parquet.Repetitions.Required, 1)
	colprops := parquet.DefaultColumnProperties()
	colprops.StatsEnabled = false

	writer := b.buildWriter(SmallSize, colprops, parquet.V1_0)
	b.Values = make([]parquet.ByteArray, 1)
	testutils.RandomByteArray(0, b.Values.([]parquet.ByteArray), b.Buffer, int(minLen), int(maxLen))
	writer.(*file.ByteArrayColumnChunkWriter).WriteBatch(b.Values.([]parquet.ByteArray), nil, nil)
	writer.Close()

	b.NotPanics(func() { b.readColumn(compress.Codecs.Uncompressed) })
}

func (b *ByteArrayWriterSuite) TestLimitStats() {
	minLen := 1024 * 4
	maxLen := 1024 * 8
	b.SetupSchema(parquet.Repetitions.Required, 1)
	colprops := parquet.DefaultColumnProperties()
	colprops.MaxStatsSize = int64(maxLen)

	writer := b.buildWriter(SmallSize, colprops, parquet.V1_0).(*file.ByteArrayColumnChunkWriter)
	b.Values = make([]parquet.ByteArray, SmallSize)
	testutils.RandomByteArray(0, b.Values.([]parquet.ByteArray), b.Buffer, minLen, maxLen)
	writer.WriteBatch(b.Values.([]parquet.ByteArray), nil, nil)
	writer.Close()

	b.True(b.metadataIsStatsSet())
}

func (b *ByteArrayWriterSuite) TestCheckDefaultStats() {
	b.SetupSchema(parquet.Repetitions.Required, 1)
	writer := b.buildWriter(SmallSize, parquet.DefaultColumnProperties(), parquet.V1_0)
	b.GenerateData(SmallSize)
	b.WriteBatchValues(writer, nil, nil)
	writer.Close()

	b.True(b.metadataIsStatsSet())
}

type BooleanValueWriterSuite struct {
	PrimitiveWriterTestSuite
}

func (b *BooleanValueWriterSuite) TestAlternateBooleanValues() {
	b.SetupSchema(parquet.Repetitions.Required, 1)
	writer := b.buildWriter(SmallSize, parquet.DefaultColumnProperties(), parquet.V1_0).(*file.BooleanColumnChunkWriter)
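	// write one value per batch to exercise the boolean bit-packing logic
	// across many tiny writes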
	for i := 0; i < SmallSize; i++ {
		val := i%2 == 0
		writer.WriteBatch([]bool{val}, nil, nil)
	}
	writer.Close()
	b.readColumn(compress.Codecs.Uncompressed)
	for i := 0; i < SmallSize; i++ {
		b.Equal(i%2 == 0, b.ValuesOut.([]bool)[i])
	}
}