github.com/apache/arrow/go/v16@v16.1.0/parquet/file/column_writer_test.go

// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package file_test

import (
	"bytes"
	"math"
	"reflect"
	"runtime"
	"sync"
	"testing"

	"github.com/apache/arrow/go/v16/arrow"
	"github.com/apache/arrow/go/v16/arrow/array"
	"github.com/apache/arrow/go/v16/arrow/bitutil"
	"github.com/apache/arrow/go/v16/arrow/memory"
	arrutils "github.com/apache/arrow/go/v16/internal/utils"
	"github.com/apache/arrow/go/v16/parquet"
	"github.com/apache/arrow/go/v16/parquet/compress"
	"github.com/apache/arrow/go/v16/parquet/file"
	"github.com/apache/arrow/go/v16/parquet/internal/encoding"
	"github.com/apache/arrow/go/v16/parquet/internal/encryption"
	format "github.com/apache/arrow/go/v16/parquet/internal/gen-go/parquet"
	"github.com/apache/arrow/go/v16/parquet/internal/testutils"
	"github.com/apache/arrow/go/v16/parquet/internal/utils"
	"github.com/apache/arrow/go/v16/parquet/metadata"
	"github.com/apache/arrow/go/v16/parquet/pqarrow"
	"github.com/apache/arrow/go/v16/parquet/schema"
	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/mock"
	"github.com/stretchr/testify/suite"
)

const (
	SmallSize = 100
	// LargeSize is large enough to exercise some corner cases; used only by specific tests
	LargeSize = 100000
	// VeryLargeSize is large enough to trigger dictionary fallback
	VeryLargeSize = 400000
	// dictionary page size limit used when testing fallback
	DictionaryPageSize = 1024 * 1024
)

type mockpagewriter struct {
	mock.Mock
}

func (m *mockpagewriter) Close(hasDict, fallBack bool) error {
	return m.Called(hasDict, fallBack).Error(0)
}
func (m *mockpagewriter) WriteDataPage(page file.DataPage) (int64, error) {
	args := m.Called(page)
	return int64(args.Int(0)), args.Error(1)
}
func (m *mockpagewriter) WriteDictionaryPage(page *file.DictionaryPage) (int64, error) {
	args := m.Called(page)
	return int64(args.Int(0)), args.Error(1)
}
func (m *mockpagewriter) HasCompressor() bool {
	return m.Called().Bool(0)
}
func (m *mockpagewriter) Compress(buf *bytes.Buffer, src []byte) []byte {
	return m.Called(buf, src).Get(0).([]byte)
}
func (m *mockpagewriter) Reset(sink utils.WriterTell, codec compress.Compression, compressionLevel int, metadata *metadata.ColumnChunkMetaDataBuilder, rgOrdinal, columnOrdinal int16, metaEncryptor, dataEncryptor encryption.Encryptor) error {
	return m.Called().Error(0)
}

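// Compile-time assertion that the mock satisfies file.PageWriter, the
// interface expected by file.NewColumnChunkWriter below.
var _ file.PageWriter = (*mockpagewriter)(nil)
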
func TestWriteDataPageV1NumValues(t *testing.T) {
	sc := schema.NewSchema(schema.MustGroup(schema.NewGroupNode("schema", parquet.Repetitions.Required, schema.FieldList{
		schema.Must(schema.ListOf(
			schema.Must(schema.NewPrimitiveNode("column", parquet.Repetitions.Optional, parquet.Types.Int32, -1, -1)),
			parquet.Repetitions.Optional, -1)),
	}, -1)))
	descr := sc.Column(0)
	props := parquet.NewWriterProperties(
		parquet.WithStats(true),
		parquet.WithVersion(parquet.V1_0),
		parquet.WithDataPageVersion(parquet.DataPageV1),
		parquet.WithDictionaryDefault(false))

	metadata := metadata.NewColumnChunkMetaDataBuilder(props, descr)
	pager := new(mockpagewriter)
	defer pager.AssertExpectations(t)
	pager.On("HasCompressor").Return(false)
	wr := file.NewColumnChunkWriter(metadata, pager, props).(*file.Int32ColumnChunkWriter)

	// write a list "[[0, 1], null, [2, null, 3]]"
	// should be 6 values, 2 nulls and 3 rows
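	// With this three-level list schema the max def level is 3 and the max
	// rep level is 1: def 3 = non-null value, def 2 = null element, def 0 =
	// null list; rep 0 starts a new row, rep 1 continues the current list.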
	wr.WriteBatch([]int32{0, 1, 2, 3},
		[]int16{3, 3, 0, 3, 2, 3},
		[]int16{0, 1, 0, 0, 1, 1})

	pager.On("WriteDataPage", mock.MatchedBy(func(page file.DataPage) bool {
		pagev1, ok := page.(*file.DataPageV1)
		if !ok {
			return false
		}

		encodedStats := pagev1.Statistics()
		// only match if the page being written has 2 nulls and 6 values
		return pagev1.NumValues() == 6 &&
			encodedStats.HasNullCount &&
			encodedStats.NullCount == 2
	})).Return(10, nil)

	wr.FlushBufferedDataPages()
	assert.EqualValues(t, 3, wr.RowsWritten())
}

func TestWriteDataPageV2NumRows(t *testing.T) {
	// regression test for PARQUET-2066 (row counts in DataPageV2 headers)
	sc := schema.NewSchema(schema.MustGroup(schema.NewGroupNode("schema", parquet.Repetitions.Required, schema.FieldList{
		schema.Must(schema.ListOf(
			schema.Must(schema.NewPrimitiveNode("column", parquet.Repetitions.Optional, parquet.Types.Int32, -1, -1)),
			parquet.Repetitions.Optional, -1)),
	}, -1)))
	descr := sc.Column(0)
	props := parquet.NewWriterProperties(
		parquet.WithStats(true),
		parquet.WithVersion(parquet.V2_LATEST),
		parquet.WithDataPageVersion(parquet.DataPageV2),
		parquet.WithDictionaryDefault(false))

	metadata := metadata.NewColumnChunkMetaDataBuilder(props, descr)
	pager := new(mockpagewriter)
	defer pager.AssertExpectations(t)
	pager.On("HasCompressor").Return(false)
	wr := file.NewColumnChunkWriter(metadata, pager, props).(*file.Int32ColumnChunkWriter)

	// write a list "[[0, 1], null, [2, null, 3]]"
	// should be 6 values, 2 nulls and 3 rows
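	// (see TestWriteDataPageV1NumValues for the def/rep level semantics)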
	wr.WriteBatch([]int32{0, 1, 2, 3},
		[]int16{3, 3, 0, 3, 2, 3},
		[]int16{0, 1, 0, 0, 1, 1})

	pager.On("WriteDataPage", mock.MatchedBy(func(page file.DataPage) bool {
		pagev2, ok := page.(*file.DataPageV2)
		if !ok {
			return false
		}

		encodedStats := pagev2.Statistics()
		// only match if the page being written has 2 nulls, 6 values and 3 rows
		return !pagev2.IsCompressed() &&
			pagev2.NumNulls() == 2 && encodedStats.NullCount == 2 &&
			pagev2.NumValues() == 6 &&
			pagev2.NumRows() == 3
	})).Return(10, nil)

	wr.FlushBufferedDataPages()
	assert.EqualValues(t, 3, wr.RowsWritten())
}

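// DataPageV2 headers record an exact row count, so the writer must only
// close pages on row (record) boundaries. This test fills pages past the
// 1KB data page size with rows of 3 values each and expects the pages
// flushed during the write to hold only complete rows (126 rows, 378
// values, no partial row).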
func TestDataPageV2RowBoundaries(t *testing.T) {
	sc := schema.NewSchema(schema.MustGroup(schema.NewGroupNode("schema", parquet.Repetitions.Required, schema.FieldList{
		schema.Must(schema.ListOf(
			schema.Must(schema.NewPrimitiveNode("column", parquet.Repetitions.Optional, parquet.Types.Int32, -1, -1)),
			parquet.Repetitions.Optional, -1)),
	}, -1)))
	descr := sc.Column(0)
	props := parquet.NewWriterProperties(
		parquet.WithBatchSize(128),
		parquet.WithDataPageSize(1024),
		parquet.WithVersion(parquet.V2_LATEST),
		parquet.WithDataPageVersion(parquet.DataPageV2),
		parquet.WithDictionaryDefault(false))

	metadata := metadata.NewColumnChunkMetaDataBuilder(props, descr)
	pager := new(mockpagewriter)
	defer pager.AssertExpectations(t)
	pager.On("HasCompressor").Return(false)
	wr := file.NewColumnChunkWriter(metadata, pager, props).(*file.Int32ColumnChunkWriter)

	pager.On("WriteDataPage", mock.MatchedBy(func(page file.DataPage) bool {
		pagev2, ok := page.(*file.DataPageV2)
		if !ok {
			return false
		}

		// only match if the page being written has no nulls, 378 values and
		// 126 rows, i.e. it ends exactly on a row boundary
		return !pagev2.IsCompressed() &&
			pagev2.NumNulls() == 0 &&
			pagev2.NumValues() == 378 &&
			pagev2.NumRows() == 126
	})).Return(10, nil)

	// create rows of lists of 3 values each
	values := make([]int32, 1024)
	defLevels := make([]int16, 1024)
	repLevels := make([]int16, 1024)
	for i := range values {
		values[i] = int32(i)
		defLevels[i] = 3

		switch i % 3 {
		case 0:
			repLevels[i] = 0
		case 1, 2:
			repLevels[i] = 1
		}
	}

	wr.WriteBatch(values, defLevels, repLevels)
}

type PrimitiveWriterTestSuite struct {
	testutils.PrimitiveTypedTest
	suite.Suite

	props *parquet.WriterProperties
	descr *schema.Column

	metadata   *metadata.ColumnChunkMetaDataBuilder
	sink       *encoding.BufferWriter
	readbuffer *memory.Buffer

	bufferPool sync.Pool
}

func (p *PrimitiveWriterTestSuite) SetupTest() {
	p.SetupValuesOut(SmallSize)
	p.props = parquet.NewWriterProperties()
	p.SetupSchema(parquet.Repetitions.Required, 1)
	p.descr = p.Schema.Column(0)

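	// Pool resizable buffers for reuse across column readers; the finalizer
	// releases a buffer's memory if the pool discards it.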
	p.bufferPool = sync.Pool{
		New: func() interface{} {
			buf := memory.NewResizableBuffer(mem)
			runtime.SetFinalizer(buf, func(obj *memory.Buffer) {
				obj.Release()
			})
			return buf
		},
	}
}

func (p *PrimitiveWriterTestSuite) TearDownTest() {
	p.bufferPool = sync.Pool{}
}

func (p *PrimitiveWriterTestSuite) buildReader(nrows int64, compression compress.Compression) file.ColumnChunkReader {
	p.readbuffer = p.sink.Finish()
	pagereader, _ := file.NewPageReader(arrutils.NewBufferedReader(bytes.NewReader(p.readbuffer.Bytes()), p.readbuffer.Len()), nrows, compression, mem, nil)
	return file.NewColumnReader(p.descr, pagereader, mem, &p.bufferPool)
}

func (p *PrimitiveWriterTestSuite) buildWriter(_ int64, columnProps parquet.ColumnProperties, opts ...parquet.WriterProperty) file.ColumnChunkWriter {
	p.sink = encoding.NewBufferWriter(0, mem)
	if columnProps.Encoding == parquet.Encodings.PlainDict || columnProps.Encoding == parquet.Encodings.RLEDict {
		opts = append(opts, parquet.WithDictionaryDefault(true), parquet.WithDictionaryPageSizeLimit(DictionaryPageSize))
	} else {
		opts = append(opts, parquet.WithDictionaryDefault(false), parquet.WithEncoding(columnProps.Encoding))
	}
	opts = append(opts, parquet.WithMaxStatsSize(columnProps.MaxStatsSize), parquet.WithStats(columnProps.StatsEnabled))
	p.props = parquet.NewWriterProperties(opts...)

	p.metadata = metadata.NewColumnChunkMetaDataBuilder(p.props, p.descr)
	pager, _ := file.NewPageWriter(p.sink, columnProps.Codec, compress.DefaultCompressionLevel, p.metadata, -1, -1, memory.DefaultAllocator, false, nil, nil)
	return file.NewColumnChunkWriter(p.metadata, pager, p.props)
}

func (p *PrimitiveWriterTestSuite) readColumn(compression compress.Compression) int64 {
	totalValues := int64(len(p.DefLevelsOut))
	reader := p.buildReader(totalValues, compression)
	return p.ReadBatch(reader, totalValues, 0, p.DefLevelsOut, p.RepLevelsOut)
}

func (p *PrimitiveWriterTestSuite) readColumnFully(compression compress.Compression) int64 {
	totalValues := int64(len(p.DefLevelsOut))
	reader := p.buildReader(totalValues, compression)
	valuesRead := int64(0)
	for valuesRead < totalValues {
		read := p.ReadBatch(reader, totalValues-valuesRead, valuesRead, p.DefLevelsOut[valuesRead:], p.RepLevelsOut[valuesRead:])
		valuesRead += read
	}
	return valuesRead
}

func (p *PrimitiveWriterTestSuite) readAndCompare(compression compress.Compression, nrows int64) {
	p.SetupValuesOut(nrows)
	p.readColumnFully(compression)
	p.Equal(p.Values, p.ValuesOut)
}

func (p *PrimitiveWriterTestSuite) writeRequiredWithSettings(encoding parquet.Encoding, compression compress.Compression, dict, stats bool, compressLvl int, nrows int64) {
	columnProperties := parquet.ColumnProperties{
		Encoding:          encoding,
		Codec:             compression,
		DictionaryEnabled: dict,
		StatsEnabled:      stats,
		CompressionLevel:  compressLvl,
	}
	writer := p.buildWriter(nrows, columnProperties, parquet.WithVersion(parquet.V1_0))
	p.WriteBatchValues(writer, nil, nil)
	// behavior should be independent of the number of calls to Close
	writer.Close()
	writer.Close()
}

func (p *PrimitiveWriterTestSuite) writeRequiredWithSettingsSpaced(encoding parquet.Encoding, compression compress.Compression, dict, stats bool, nrows int64, compressionLvl int) {
	validBits := make([]byte, int(bitutil.BytesForBits(int64(len(p.DefLevels))))+1)
	memory.Set(validBits, 255)
	columnProperties := parquet.ColumnProperties{
		Encoding:          encoding,
		Codec:             compression,
		DictionaryEnabled: dict,
		StatsEnabled:      stats,
		CompressionLevel:  compressionLvl,
	}
	writer := p.buildWriter(nrows, columnProperties, parquet.WithVersion(parquet.V1_0))
	p.WriteBatchValuesSpaced(writer, nil, nil, validBits, 0)
	// behavior should be independent of the number of calls to Close
	writer.Close()
	writer.Close()
}

func (p *PrimitiveWriterTestSuite) testRequiredWithSettings(encoding parquet.Encoding, compression compress.Compression, dict, stats bool, nrows int64, compressLvl int) {
	p.GenerateData(nrows)
	p.writeRequiredWithSettings(encoding, compression, dict, stats, compressLvl, nrows)
	p.NotPanics(func() { p.readAndCompare(compression, nrows) })
	p.writeRequiredWithSettingsSpaced(encoding, compression, dict, stats, nrows, compressLvl)
	p.NotPanics(func() { p.readAndCompare(compression, nrows) })
}

func (p *PrimitiveWriterTestSuite) testRequiredWithEncoding(encoding parquet.Encoding) {
	p.testRequiredWithSettings(encoding, compress.Codecs.Uncompressed, false, false, SmallSize, compress.DefaultCompressionLevel)
}

func (p *PrimitiveWriterTestSuite) metadataNumValues() int64 {
	// metadata accessor created lazily
	metadata, _ := metadata.NewColumnChunkMetaData(p.metadata.Contents(), p.descr, nil, 0, 0, nil)
	return metadata.NumValues()
}

func (p *PrimitiveWriterTestSuite) metadataEncodings() []parquet.Encoding {
	metadata, _ := metadata.NewColumnChunkMetaData(p.metadata.Contents(), p.descr, nil, 0, 0, nil)
	return metadata.Encodings()
}

func (p *PrimitiveWriterTestSuite) metadataEncodingStats() []metadata.PageEncodingStats {
	metadata, _ := metadata.NewColumnChunkMetaData(p.metadata.Contents(), p.descr, nil, 0, 0, nil)
	return metadata.EncodingStats()
}

func (p *PrimitiveWriterTestSuite) metadataStatsHasMinMax() (hasMin, hasMax bool) {
	appVersion := metadata.NewAppVersion(p.props.CreatedBy())
	metadata, _ := metadata.NewColumnChunkMetaData(p.metadata.Contents(), p.descr, appVersion, 0, 0, nil)
	stats, _ := metadata.Statistics()
	encoded, _ := stats.Encode()
	return encoded.HasMin, encoded.HasMax
}

func (p *PrimitiveWriterTestSuite) metadataIsStatsSet() bool {
	appVersion := metadata.NewAppVersion(p.props.CreatedBy())
	metadata, _ := metadata.NewColumnChunkMetaData(p.metadata.Contents(), p.descr, appVersion, 0, 0, nil)
	set, _ := metadata.StatsSet()
	return set
}

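// testDictionaryFallbackEncoding writes enough random values that the
// dictionary outgrows DictionaryPageSize, forcing the writer to fall back
// to plain encoding mid-chunk; the column metadata should then report both
// the dictionary and the plain encodings.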
func (p *PrimitiveWriterTestSuite) testDictionaryFallbackEncoding(version parquet.Version) {
	p.GenerateData(VeryLargeSize)
	props := parquet.DefaultColumnProperties()
	props.DictionaryEnabled = true

	if version == parquet.V1_0 {
		props.Encoding = parquet.Encodings.PlainDict
	} else {
		props.Encoding = parquet.Encodings.RLEDict
	}

	writer := p.buildWriter(VeryLargeSize, props, parquet.WithVersion(version))
	p.WriteBatchValues(writer, nil, nil)
	writer.Close()

	// read back all the rows to make sure the non-dictionary pages are also read correctly
	p.SetupValuesOut(VeryLargeSize)
	valuesRead := p.readColumnFully(compress.Codecs.Uncompressed)
	p.EqualValues(VeryLargeSize, valuesRead)
	p.Equal(p.Values, p.ValuesOut)

	encodings := p.metadataEncodings()
	if p.Typ.Kind() == reflect.Bool || p.Typ == reflect.TypeOf(parquet.Int96{}) {
		// dictionary encoding is not used for booleans or Int96;
		// there are 2 encodings (PLAIN, RLE) in the non-dictionary case
		p.Equal([]parquet.Encoding{parquet.Encodings.Plain, parquet.Encodings.RLE}, encodings)
	} else if version == parquet.V1_0 {
		// there are 4 encodings (PLAIN_DICTIONARY, PLAIN, RLE, PLAIN) in the
		// fallback case for version 1.0
		p.Equal([]parquet.Encoding{parquet.Encodings.PlainDict, parquet.Encodings.Plain, parquet.Encodings.RLE, parquet.Encodings.Plain}, encodings)
	} else {
		// there are 4 encodings (RLE_DICTIONARY, PLAIN, RLE, PLAIN) in the
		// fallback case for version 2.0
		p.Equal([]parquet.Encoding{parquet.Encodings.RLEDict, parquet.Encodings.Plain, parquet.Encodings.RLE, parquet.Encodings.Plain}, encodings)
	}

	encodingStats := p.metadataEncodingStats()
	if p.Typ.Kind() == reflect.Bool || p.Typ == reflect.TypeOf(parquet.Int96{}) {
		p.Equal(parquet.Encodings.Plain, encodingStats[0].Encoding)
		p.Equal(format.PageType_DATA_PAGE, encodingStats[0].PageType)
	} else if version == parquet.V1_0 {
		expected := []metadata.PageEncodingStats{
			{Encoding: parquet.Encodings.PlainDict, PageType: format.PageType_DICTIONARY_PAGE},
			{Encoding: parquet.Encodings.Plain, PageType: format.PageType_DATA_PAGE},
			{Encoding: parquet.Encodings.PlainDict, PageType: format.PageType_DATA_PAGE}}
		p.Equal(expected[0], encodingStats[0])
		p.ElementsMatch(expected[1:], encodingStats[1:])
	} else {
		expected := []metadata.PageEncodingStats{
			{Encoding: parquet.Encodings.Plain, PageType: format.PageType_DICTIONARY_PAGE},
			{Encoding: parquet.Encodings.Plain, PageType: format.PageType_DATA_PAGE},
			{Encoding: parquet.Encodings.RLEDict, PageType: format.PageType_DATA_PAGE}}
		p.Equal(expected[0], encodingStats[0])
		p.ElementsMatch(expected[1:], encodingStats[1:])
	}
}

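// testDictionaryFallbackAndCompressedSize checks that the total compressed
// byte count stays nonzero across an explicit fallback to plain encoding
// and across Close.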
func (p *PrimitiveWriterTestSuite) testDictionaryFallbackAndCompressedSize(version parquet.Version) {
	// skip boolean as dictionary encoding is not used
	if p.Typ.Kind() == reflect.Bool {
		return
	}

	p.GenerateData(SmallSize)
	props := parquet.DefaultColumnProperties()
	props.DictionaryEnabled = true

	if version == parquet.V1_0 {
		props.Encoding = parquet.Encodings.PlainDict
	} else {
		props.Encoding = parquet.Encodings.RLEDict
	}

	writer := p.buildWriter(SmallSize, props, parquet.WithVersion(version), parquet.WithDataPageSize(SmallSize-1))
	p.WriteBatchValues(writer, nil, nil)
	p.NotZero(writer.TotalBytesWritten())
	writer.FallbackToPlain()
	p.NotZero(writer.TotalCompressedBytes())
	writer.Close()
	p.NotZero(writer.TotalCompressedBytes())
	p.NotZero(writer.TotalBytesWritten())
}

func (p *PrimitiveWriterTestSuite) TestRequiredPlain() {
	p.testRequiredWithEncoding(parquet.Encodings.Plain)
}

func (p *PrimitiveWriterTestSuite) TestRequiredDictionary() {
	p.testRequiredWithEncoding(parquet.Encodings.PlainDict)
}

func (p *PrimitiveWriterTestSuite) TestRequiredPlainWithStats() {
	p.testRequiredWithSettings(parquet.Encodings.Plain, compress.Codecs.Uncompressed, false, true, LargeSize, compress.DefaultCompressionLevel)
}

func (p *PrimitiveWriterTestSuite) TestRequiredPlainWithSnappy() {
	p.testRequiredWithSettings(parquet.Encodings.Plain, compress.Codecs.Snappy, false, false, LargeSize, compress.DefaultCompressionLevel)
}

func (p *PrimitiveWriterTestSuite) TestRequiredPlainWithStatsAndSnappy() {
	p.testRequiredWithSettings(parquet.Encodings.Plain, compress.Codecs.Snappy, false, true, LargeSize, compress.DefaultCompressionLevel)
}

func (p *PrimitiveWriterTestSuite) TestRequiredPlainWithBrotli() {
	p.testRequiredWithSettings(parquet.Encodings.Plain, compress.Codecs.Brotli, false, false, LargeSize, compress.DefaultCompressionLevel)
}

func (p *PrimitiveWriterTestSuite) TestRequiredPlainWithBrotliAndLevel() {
	p.testRequiredWithSettings(parquet.Encodings.Plain, compress.Codecs.Brotli, false, false, LargeSize, 10)
}

func (p *PrimitiveWriterTestSuite) TestRequiredPlainWithStatsAndBrotli() {
	p.testRequiredWithSettings(parquet.Encodings.Plain, compress.Codecs.Brotli, false, true, LargeSize, compress.DefaultCompressionLevel)
}

func (p *PrimitiveWriterTestSuite) TestRequiredPlainWithGzip() {
	p.testRequiredWithSettings(parquet.Encodings.Plain, compress.Codecs.Gzip, false, false, LargeSize, compress.DefaultCompressionLevel)
}

func (p *PrimitiveWriterTestSuite) TestRequiredPlainWithGzipAndLevel() {
	p.testRequiredWithSettings(parquet.Encodings.Plain, compress.Codecs.Gzip, false, false, LargeSize, 10)
}

func (p *PrimitiveWriterTestSuite) TestRequiredPlainWithStatsAndGzip() {
	p.testRequiredWithSettings(parquet.Encodings.Plain, compress.Codecs.Gzip, false, true, LargeSize, compress.DefaultCompressionLevel)
}

func (p *PrimitiveWriterTestSuite) TestRequiredPlainWithZstd() {
	p.testRequiredWithSettings(parquet.Encodings.Plain, compress.Codecs.Zstd, false, false, LargeSize, compress.DefaultCompressionLevel)
}

func (p *PrimitiveWriterTestSuite) TestRequiredPlainWithZstdAndLevel() {
	p.testRequiredWithSettings(parquet.Encodings.Plain, compress.Codecs.Zstd, false, false, LargeSize, 6)
}

func (p *PrimitiveWriterTestSuite) TestRequiredPlainWithStatsAndZstd() {
	p.testRequiredWithSettings(parquet.Encodings.Plain, compress.Codecs.Zstd, false, true, LargeSize, compress.DefaultCompressionLevel)
}

func (p *PrimitiveWriterTestSuite) TestOptionalNonRepeated() {
	p.SetupSchema(parquet.Repetitions.Optional, 1)
	p.descr = p.Schema.Column(0)

	p.GenerateData(SmallSize)
	p.DefLevels[1] = 0

	writer := p.buildWriter(SmallSize, parquet.DefaultColumnProperties(), parquet.WithVersion(parquet.V1_0))
	p.WriteBatchValues(writer, p.DefLevels, nil)
	writer.Close()

	p.Equal(int64(100), p.metadataNumValues())

	values := p.readColumn(compress.Codecs.Uncompressed)
	p.EqualValues(99, values)
	p.Equal(reflect.ValueOf(p.Values).Slice(0, 99).Interface(), reflect.ValueOf(p.ValuesOut).Slice(0, 99).Interface())
}

func (p *PrimitiveWriterTestSuite) TestOptionalSpaced() {
	p.SetupSchema(parquet.Repetitions.Optional, 1)
	p.descr = p.Schema.Column(0)

	p.GenerateData(SmallSize)
	validBits := make([]byte, int(bitutil.BytesForBits(SmallSize)))
	memory.Set(validBits, 255)
	p.DefLevels[SmallSize-1] = 0
	bitutil.ClearBit(validBits, SmallSize-1)
	p.DefLevels[1] = 0
	bitutil.ClearBit(validBits, 1)

	writer := p.buildWriter(SmallSize, parquet.DefaultColumnProperties(), parquet.WithVersion(parquet.V1_0))
	p.WriteBatchValuesSpaced(writer, p.DefLevels, nil, validBits, 0)
	writer.Close()

	p.Equal(int64(100), p.metadataNumValues())

	values := p.readColumn(compress.Codecs.Uncompressed)
	p.EqualValues(98, values)

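	// drop the two null slots (index 1 and index SmallSize-1) from the
	// expected values so they line up with the 98 values read back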
	orig := reflect.ValueOf(p.Values)
	orig = orig.Slice(0, 99)
	reflect.Copy(orig.Slice(1, orig.Len()), orig.Slice(2, orig.Len()))
	orig = orig.Slice(0, 98)
	out := reflect.ValueOf(p.ValuesOut)
	out = out.Slice(0, 98)

	p.Equal(orig.Interface(), out.Interface())
}

func (p *PrimitiveWriterTestSuite) TestWriteRepeated() {
	// repeated (and hence also optional), so we need both def and rep levels
	p.SetupSchema(parquet.Repetitions.Repeated, 1)
	p.descr = p.Schema.Column(0)
	p.GenerateData(SmallSize)
	p.DefLevels[1] = 0
	p.RepLevels = make([]int16, SmallSize)
	for idx := range p.RepLevels {
		p.RepLevels[idx] = 0
	}

	writer := p.buildWriter(SmallSize, parquet.DefaultColumnProperties(), parquet.WithVersion(parquet.V1_0))
	p.WriteBatchValues(writer, p.DefLevels, p.RepLevels)
	writer.Close()

	values := p.readColumn(compress.Codecs.Uncompressed)
	p.EqualValues(SmallSize-1, values)
	out := reflect.ValueOf(p.ValuesOut).Slice(0, SmallSize-1).Interface()
	vals := reflect.ValueOf(p.Values).Slice(0, SmallSize-1).Interface()
	p.Equal(vals, out)
}

func (p *PrimitiveWriterTestSuite) TestRequiredLargeChunk() {
	p.GenerateData(LargeSize)

	// required and non-repeated, so no def or rep levels
	writer := p.buildWriter(LargeSize, parquet.DefaultColumnProperties(), parquet.WithVersion(parquet.V1_0))
	p.WriteBatchValues(writer, nil, nil)
	writer.Close()

	// just read the first SmallSize rows to ensure we can read them back in
	values := p.readColumn(compress.Codecs.Uncompressed)
	p.EqualValues(SmallSize, values)
	p.Equal(reflect.ValueOf(p.Values).Slice(0, SmallSize).Interface(), p.ValuesOut)
}

func (p *PrimitiveWriterTestSuite) TestDictionaryFallbackEncodingV1() {
	p.testDictionaryFallbackEncoding(parquet.V1_0)
}

func (p *PrimitiveWriterTestSuite) TestDictionaryFallbackEncodingV2() {
	p.testDictionaryFallbackEncoding(parquet.V2_LATEST)
}

func (p *PrimitiveWriterTestSuite) TestDictionaryFallbackStatsV1() {
	p.testDictionaryFallbackAndCompressedSize(parquet.V1_0)
}

func (p *PrimitiveWriterTestSuite) TestDictionaryFallbackStatsV2() {
	p.testDictionaryFallbackAndCompressedSize(parquet.V2_LATEST)
}

func (p *PrimitiveWriterTestSuite) TestOptionalNullValueChunk() {
	// test case for a chunk consisting entirely of NULL values
	p.SetupSchema(parquet.Repetitions.Optional, 1)
	p.descr = p.Schema.Column(0)
	p.GenerateData(LargeSize)
	p.RepLevels = make([]int16, LargeSize)
	for idx := range p.DefLevels {
		p.DefLevels[idx] = 0
		p.RepLevels[idx] = 0
	}

	writer := p.buildWriter(LargeSize, parquet.DefaultColumnProperties(), parquet.WithVersion(parquet.V1_0))
	p.WriteBatchValues(writer, p.DefLevels, p.RepLevels)
	writer.Close()

	valuesRead := p.readColumn(compress.Codecs.Uncompressed)
	p.Zero(valuesRead)
}

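// createWriterTestSuite returns the typed test suite; booleans and byte
// arrays get subclasses with additional cases (alternating values, stats
// omission and limits).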
func createWriterTestSuite(typ reflect.Type) suite.TestingSuite {
	switch typ {
	case reflect.TypeOf(true):
		return &BooleanValueWriterSuite{PrimitiveWriterTestSuite{PrimitiveTypedTest: testutils.NewPrimitiveTypedTest(typ)}}
	case reflect.TypeOf(parquet.ByteArray{}):
		return &ByteArrayWriterSuite{PrimitiveWriterTestSuite{PrimitiveTypedTest: testutils.NewPrimitiveTypedTest(typ)}}
	}
	return &PrimitiveWriterTestSuite{PrimitiveTypedTest: testutils.NewPrimitiveTypedTest(typ)}
}

func TestColumnWriter(t *testing.T) {
	t.Parallel()
	types := []struct {
		typ reflect.Type
	}{
		{reflect.TypeOf(true)},
		{reflect.TypeOf(int32(0))},
		{reflect.TypeOf(int64(0))},
		{reflect.TypeOf(float32(0))},
		{reflect.TypeOf(float64(0))},
		{reflect.TypeOf(parquet.Int96{})},
		{reflect.TypeOf(parquet.ByteArray{})},
		{reflect.TypeOf(parquet.FixedLenByteArray{})},
	}
	for _, tt := range types {
		tt := tt
		t.Run(tt.typ.String(), func(t *testing.T) {
			t.Parallel()
			suite.Run(t, createWriterTestSuite(tt.typ))
		})
	}
}

type ByteArrayWriterSuite struct {
	PrimitiveWriterTestSuite
}

func (b *ByteArrayWriterSuite) TestOmitStats() {
	// values larger than the default max stats size should prevent
	// writing the MIN/MAX stats to the column metadata
	minLen := 1024 * 4
	maxLen := 1024 * 8
	b.SetupSchema(parquet.Repetitions.Required, 1)
	b.Values = make([]parquet.ByteArray, SmallSize)
	writer := b.buildWriter(SmallSize, parquet.DefaultColumnProperties(), parquet.WithVersion(parquet.V1_0))
	testutils.RandomByteArray(0, b.Values.([]parquet.ByteArray), b.Buffer, minLen, maxLen)
	writer.(*file.ByteArrayColumnChunkWriter).WriteBatch(b.Values.([]parquet.ByteArray), nil, nil)
	writer.Close()

	hasMin, hasMax := b.metadataStatsHasMinMax()
	b.False(hasMin)
	b.False(hasMax)
}

func (b *ByteArrayWriterSuite) TestOmitDataPageStats() {
	// with stats disabled, even very large values must not be written as
	// stats in the DataPageHeader
	minLen := math.Pow10(7)
	maxLen := math.Pow10(7)
	b.SetupSchema(parquet.Repetitions.Required, 1)
	colprops := parquet.DefaultColumnProperties()
	colprops.StatsEnabled = false

	writer := b.buildWriter(SmallSize, colprops, parquet.WithVersion(parquet.V1_0))
	b.Values = make([]parquet.ByteArray, 1)
	testutils.RandomByteArray(0, b.Values.([]parquet.ByteArray), b.Buffer, int(minLen), int(maxLen))
	writer.(*file.ByteArrayColumnChunkWriter).WriteBatch(b.Values.([]parquet.ByteArray), nil, nil)
	writer.Close()

	b.NotPanics(func() { b.readColumn(compress.Codecs.Uncompressed) })
}

func (b *ByteArrayWriterSuite) TestLimitStats() {
	minLen := 1024 * 4
	maxLen := 1024 * 8
	b.SetupSchema(parquet.Repetitions.Required, 1)
	colprops := parquet.DefaultColumnProperties()
	colprops.MaxStatsSize = int64(maxLen)

	writer := b.buildWriter(SmallSize, colprops, parquet.WithVersion(parquet.V1_0)).(*file.ByteArrayColumnChunkWriter)
	b.Values = make([]parquet.ByteArray, SmallSize)
	testutils.RandomByteArray(0, b.Values.([]parquet.ByteArray), b.Buffer, minLen, maxLen)
	writer.WriteBatch(b.Values.([]parquet.ByteArray), nil, nil)
	writer.Close()

	b.True(b.metadataIsStatsSet())
}

func (b *ByteArrayWriterSuite) TestCheckDefaultStats() {
	b.SetupSchema(parquet.Repetitions.Required, 1)
	writer := b.buildWriter(SmallSize, parquet.DefaultColumnProperties(), parquet.WithVersion(parquet.V1_0))
	b.GenerateData(SmallSize)
	b.WriteBatchValues(writer, nil, nil)
	writer.Close()

	b.True(b.metadataIsStatsSet())
}

type BooleanValueWriterSuite struct {
	PrimitiveWriterTestSuite
}

func (b *BooleanValueWriterSuite) TestAlternateBooleanValues() {
	b.SetupSchema(parquet.Repetitions.Required, 1)
	// use an unusually small data page size to try to flush out boolean
	// encoder issues in the usage of the BitMapWriter
	writer := b.buildWriter(SmallSize, parquet.DefaultColumnProperties(), parquet.WithVersion(parquet.V1_0), parquet.WithDataPageSize(7)).(*file.BooleanColumnChunkWriter)
	for i := 0; i < SmallSize; i++ {
		val := i%2 == 0
		writer.WriteBatch([]bool{val}, nil, nil)
	}
	writer.Close()
	b.readColumn(compress.Codecs.Uncompressed)
	for i := 0; i < SmallSize; i++ {
		b.Equal(i%2 == 0, b.ValuesOut.([]bool)[i])
	}
}

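// TestDictionaryReslice writes 2000 dictionary-encoded strings through the
// buffered writer path for every supported dictionary index type,
// exercising the internal reslicing of the dictionary index arrays.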
func TestDictionaryReslice(t *testing.T) {
	pts := []arrow.DataType{
		arrow.PrimitiveTypes.Int8,
		arrow.PrimitiveTypes.Int16,
		arrow.PrimitiveTypes.Int32,
		arrow.PrimitiveTypes.Int64,
		arrow.PrimitiveTypes.Uint8,
		arrow.PrimitiveTypes.Uint16,
		arrow.PrimitiveTypes.Uint32,
		arrow.PrimitiveTypes.Uint64,
	}
	for _, pt := range pts {
		t.Run(pt.String(), func(t *testing.T) {
			mem := memory.NewGoAllocator()
			dt := &arrow.DictionaryType{
				IndexType: pt,
				ValueType: &arrow.StringType{},
			}
			field := arrow.Field{Name: "test_field", Type: dt, Nullable: true}
			schema := arrow.NewSchema([]arrow.Field{field}, nil)
			b := array.NewRecordBuilder(mem, schema)
			for i := 0; i < 2000; i++ {
				b.Field(0).(*array.BinaryDictionaryBuilder).AppendString("test_value")
			}
			rec := b.NewRecord()
			out := &bytes.Buffer{}
			pqw, err := pqarrow.NewFileWriter(rec.Schema(), out, nil, pqarrow.NewArrowWriterProperties())
			assert.NoError(t, err)
			err = pqw.WriteBuffered(rec)
			assert.NoError(t, err)
		})
	}
}