github.com/apache/arrow/go/v16@v16.1.0/parquet/file/column_writer_test.go

// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package file_test

import (
	"bytes"
	"math"
	"reflect"
	"runtime"
	"sync"
	"testing"

	"github.com/apache/arrow/go/v16/arrow"
	"github.com/apache/arrow/go/v16/arrow/array"
	"github.com/apache/arrow/go/v16/arrow/bitutil"
	"github.com/apache/arrow/go/v16/arrow/memory"
	arrutils "github.com/apache/arrow/go/v16/internal/utils"
	"github.com/apache/arrow/go/v16/parquet"
	"github.com/apache/arrow/go/v16/parquet/compress"
	"github.com/apache/arrow/go/v16/parquet/file"
	"github.com/apache/arrow/go/v16/parquet/internal/encoding"
	"github.com/apache/arrow/go/v16/parquet/internal/encryption"
	format "github.com/apache/arrow/go/v16/parquet/internal/gen-go/parquet"
	"github.com/apache/arrow/go/v16/parquet/internal/testutils"
	"github.com/apache/arrow/go/v16/parquet/internal/utils"
	"github.com/apache/arrow/go/v16/parquet/metadata"
	"github.com/apache/arrow/go/v16/parquet/pqarrow"
	"github.com/apache/arrow/go/v16/parquet/schema"
	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/mock"
	"github.com/stretchr/testify/suite"
)

const (
	SmallSize = 100
	// larger size to exercise corner cases; used only by specific tests
	LargeSize = 100000
	// very large size to trigger dictionary fallback
	VeryLargeSize = 400000
	// dictionary page size limit used when testing fallback
	DictionaryPageSize = 1024 * 1024
)

type mockpagewriter struct {
	mock.Mock
}

func (m *mockpagewriter) Close(hasDict, fallBack bool) error {
	return m.Called(hasDict, fallBack).Error(0)
}
func (m *mockpagewriter) WriteDataPage(page file.DataPage) (int64, error) {
	args := m.Called(page)
	return int64(args.Int(0)), args.Error(1)
}
func (m *mockpagewriter) WriteDictionaryPage(page *file.DictionaryPage) (int64, error) {
	args := m.Called(page)
	return int64(args.Int(0)), args.Error(1)
}
func (m *mockpagewriter) HasCompressor() bool {
	return m.Called().Bool(0)
}
func (m *mockpagewriter) Compress(buf *bytes.Buffer, src []byte) []byte {
	return m.Called(buf, src).Get(0).([]byte)
}
func (m *mockpagewriter) Reset(sink utils.WriterTell, codec compress.Compression, compressionLevel int, metadata *metadata.ColumnChunkMetaDataBuilder, rgOrdinal, columnOrdinal int16, metaEncryptor, dataEncryptor encryption.Encryptor) error {
	return m.Called().Error(0)
}
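
// The two tests below drive an Int32 column writer through a three-level
// LIST schema (optional list of optional int32), where the max definition
// level is 3 and the max repetition level is 1. A worked decoding of the
// batch they both write, "[[0, 1], null, [2, null, 3]]":
//
//	def=3 -> present, non-null leaf value; def=2 -> present list slot
//	holding a null element; def=0 -> null list (whole row is null);
//	rep=0 starts a new row, rep=1 continues the current list.
//
//	entry:     1  2  3  4  5  6
//	defLevels: 3  3  0  3  2  3
//	repLevels: 0  1  0  0  1  1
//	value:     0  1  -  2  -  3   (entries with def < 3 carry no value)
//
// Hence a flushed page reports 6 values (level entries), 2 nulls, 3 rows.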

func TestWriteDataPageV1NumValues(t *testing.T) {
	sc := schema.NewSchema(schema.MustGroup(schema.NewGroupNode("schema", parquet.Repetitions.Required, schema.FieldList{
		schema.Must(schema.ListOf(
			schema.Must(schema.NewPrimitiveNode("column", parquet.Repetitions.Optional, parquet.Types.Int32, -1, -1)),
			parquet.Repetitions.Optional, -1)),
	}, -1)))
	descr := sc.Column(0)
	props := parquet.NewWriterProperties(
		parquet.WithStats(true),
		parquet.WithVersion(parquet.V1_0),
		parquet.WithDataPageVersion(parquet.DataPageV1),
		parquet.WithDictionaryDefault(false))

	metadata := metadata.NewColumnChunkMetaDataBuilder(props, descr)
	pager := new(mockpagewriter)
	defer pager.AssertExpectations(t)
	pager.On("HasCompressor").Return(false)
	wr := file.NewColumnChunkWriter(metadata, pager, props).(*file.Int32ColumnChunkWriter)

	// write a list "[[0, 1], null, [2, null, 3]]"
	// should be 6 values, 2 nulls and 3 rows
	wr.WriteBatch([]int32{0, 1, 2, 3},
		[]int16{3, 3, 0, 3, 2, 3},
		[]int16{0, 1, 0, 0, 1, 1})

	pager.On("WriteDataPage", mock.MatchedBy(func(page file.DataPage) bool {
		pagev1, ok := page.(*file.DataPageV1)
		if !ok {
			return false
		}

		encodedStats := pagev1.Statistics()
		// only match if the page being written has 6 values and 2 nulls
		return pagev1.NumValues() == 6 &&
			encodedStats.HasNullCount &&
			encodedStats.NullCount == 2
	})).Return(10, nil)

	wr.FlushBufferedDataPages()
	assert.EqualValues(t, 3, wr.RowsWritten())
}

func TestWriteDataPageV2NumRows(t *testing.T) {
	// test issue from PARQUET-2066
	sc := schema.NewSchema(schema.MustGroup(schema.NewGroupNode("schema", parquet.Repetitions.Required, schema.FieldList{
		schema.Must(schema.ListOf(
			schema.Must(schema.NewPrimitiveNode("column", parquet.Repetitions.Optional, parquet.Types.Int32, -1, -1)),
			parquet.Repetitions.Optional, -1)),
	}, -1)))
	descr := sc.Column(0)
	props := parquet.NewWriterProperties(
		parquet.WithStats(true),
		parquet.WithVersion(parquet.V2_LATEST),
		parquet.WithDataPageVersion(parquet.DataPageV2),
		parquet.WithDictionaryDefault(false))

	metadata := metadata.NewColumnChunkMetaDataBuilder(props, descr)
	pager := new(mockpagewriter)
	defer pager.AssertExpectations(t)
	pager.On("HasCompressor").Return(false)
	wr := file.NewColumnChunkWriter(metadata, pager, props).(*file.Int32ColumnChunkWriter)

	// write a list "[[0, 1], null, [2, null, 3]]"
	// should be 6 values, 2 nulls and 3 rows
	wr.WriteBatch([]int32{0, 1, 2, 3},
		[]int16{3, 3, 0, 3, 2, 3},
		[]int16{0, 1, 0, 0, 1, 1})

	pager.On("WriteDataPage", mock.MatchedBy(func(page file.DataPage) bool {
		pagev2, ok := page.(*file.DataPageV2)
		if !ok {
			return false
		}

		encodedStats := pagev2.Statistics()
		// only match if the page being written has 2 nulls, 6 values and 3 rows
		return !pagev2.IsCompressed() &&
			pagev2.NumNulls() == 2 && encodedStats.NullCount == 2 &&
			pagev2.NumValues() == 6 &&
			pagev2.NumRows() == 3
	})).Return(10, nil)

	wr.FlushBufferedDataPages()
	assert.EqualValues(t, 3, wr.RowsWritten())
}
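
// DataPageV2 pages must begin and end on row boundaries (a row starts
// wherever the repetition level is 0). The test below writes 1024 values
// laid out as rows of exactly 3 values and sets a small 1024-byte data
// page size to force several page flushes; the matcher accepts only pages
// where NumValues == 3*NumRows (378 values / 126 rows under these
// settings), i.e. pages that never split a row in half.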

func TestDataPageV2RowBoundaries(t *testing.T) {
	sc := schema.NewSchema(schema.MustGroup(schema.NewGroupNode("schema", parquet.Repetitions.Required, schema.FieldList{
		schema.Must(schema.ListOf(
			schema.Must(schema.NewPrimitiveNode("column", parquet.Repetitions.Optional, parquet.Types.Int32, -1, -1)),
			parquet.Repetitions.Optional, -1)),
	}, -1)))
	descr := sc.Column(0)
	props := parquet.NewWriterProperties(
		parquet.WithBatchSize(128),
		parquet.WithDataPageSize(1024),
		parquet.WithVersion(parquet.V2_LATEST),
		parquet.WithDataPageVersion(parquet.DataPageV2),
		parquet.WithDictionaryDefault(false))

	metadata := metadata.NewColumnChunkMetaDataBuilder(props, descr)
	pager := new(mockpagewriter)
	defer pager.AssertExpectations(t)
	pager.On("HasCompressor").Return(false)
	wr := file.NewColumnChunkWriter(metadata, pager, props).(*file.Int32ColumnChunkWriter)

	pager.On("WriteDataPage", mock.MatchedBy(func(page file.DataPage) bool {
		pagev2, ok := page.(*file.DataPageV2)
		if !ok {
			return false
		}

		// only match pages that end on a row boundary: no nulls, and
		// 378 values == 126 complete rows of 3 values each
		return !pagev2.IsCompressed() &&
			pagev2.NumNulls() == 0 &&
			pagev2.NumValues() == 378 &&
			pagev2.NumRows() == 126
	})).Return(10, nil)

	// create rows of lists of 3 values each
	values := make([]int32, 1024)
	defLevels := make([]int16, 1024)
	repLevels := make([]int16, 1024)
	for i := range values {
		values[i] = int32(i)
		defLevels[i] = 3

		switch i % 3 {
		case 0:
			repLevels[i] = 0
		case 1, 2:
			repLevels[i] = 1
		}
	}

	wr.WriteBatch(values, defLevels, repLevels)
}
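
// PrimitiveWriterTestSuite is a write/read round-trip harness that
// TestColumnWriter (further below) instantiates once per physical parquet
// type. buildWriter targets an in-memory sink and buildReader re-opens the
// finished sink buffer, so each test writes a column chunk and immediately
// reads it back. Note that `mem` is the shared test allocator declared
// elsewhere in package file_test.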

type PrimitiveWriterTestSuite struct {
	testutils.PrimitiveTypedTest
	suite.Suite

	props *parquet.WriterProperties
	descr *schema.Column

	metadata   *metadata.ColumnChunkMetaDataBuilder
	sink       *encoding.BufferWriter
	readbuffer *memory.Buffer

	bufferPool sync.Pool
}

func (p *PrimitiveWriterTestSuite) SetupTest() {
	p.SetupValuesOut(SmallSize)
	p.props = parquet.NewWriterProperties()
	p.SetupSchema(parquet.Repetitions.Required, 1)
	p.descr = p.Schema.Column(0)

	p.bufferPool = sync.Pool{
		New: func() interface{} {
			buf := memory.NewResizableBuffer(mem)
			runtime.SetFinalizer(buf, func(obj *memory.Buffer) {
				obj.Release()
			})
			return buf
		},
	}
}

func (p *PrimitiveWriterTestSuite) TearDownTest() {
	p.bufferPool = sync.Pool{}
}

func (p *PrimitiveWriterTestSuite) buildReader(nrows int64, compression compress.Compression) file.ColumnChunkReader {
	p.readbuffer = p.sink.Finish()
	pagereader, _ := file.NewPageReader(arrutils.NewBufferedReader(bytes.NewReader(p.readbuffer.Bytes()), p.readbuffer.Len()), nrows, compression, mem, nil)
	return file.NewColumnReader(p.descr, pagereader, mem, &p.bufferPool)
}

func (p *PrimitiveWriterTestSuite) buildWriter(_ int64, columnProps parquet.ColumnProperties, opts ...parquet.WriterProperty) file.ColumnChunkWriter {
	p.sink = encoding.NewBufferWriter(0, mem)
	if columnProps.Encoding == parquet.Encodings.PlainDict || columnProps.Encoding == parquet.Encodings.RLEDict {
		opts = append(opts, parquet.WithDictionaryDefault(true), parquet.WithDictionaryPageSizeLimit(DictionaryPageSize))
	} else {
		opts = append(opts, parquet.WithDictionaryDefault(false), parquet.WithEncoding(columnProps.Encoding))
	}
	opts = append(opts, parquet.WithMaxStatsSize(columnProps.MaxStatsSize), parquet.WithStats(columnProps.StatsEnabled))
	p.props = parquet.NewWriterProperties(opts...)

	p.metadata = metadata.NewColumnChunkMetaDataBuilder(p.props, p.descr)
	pager, _ := file.NewPageWriter(p.sink, columnProps.Codec, compress.DefaultCompressionLevel, p.metadata, -1, -1, memory.DefaultAllocator, false, nil, nil)
	return file.NewColumnChunkWriter(p.metadata, pager, p.props)
}

func (p *PrimitiveWriterTestSuite) readColumn(compression compress.Compression) int64 {
	totalValues := int64(len(p.DefLevelsOut))
	reader := p.buildReader(totalValues, compression)
	return p.ReadBatch(reader, totalValues, 0, p.DefLevelsOut, p.RepLevelsOut)
}

func (p *PrimitiveWriterTestSuite) readColumnFully(compression compress.Compression) int64 {
	totalValues := int64(len(p.DefLevelsOut))
	reader := p.buildReader(totalValues, compression)
	valuesRead := int64(0)
	for valuesRead < totalValues {
		read := p.ReadBatch(reader, totalValues-valuesRead, valuesRead, p.DefLevelsOut[valuesRead:], p.RepLevelsOut[valuesRead:])
		valuesRead += read
	}
	return valuesRead
}

func (p *PrimitiveWriterTestSuite) readAndCompare(compression compress.Compression, nrows int64) {
	p.SetupValuesOut(nrows)
	p.readColumnFully(compression)
	p.Equal(p.Values, p.ValuesOut)
}

func (p *PrimitiveWriterTestSuite) writeRequiredWithSettings(encoding parquet.Encoding, compression compress.Compression, dict, stats bool, compressLvl int, nrows int64) {
	columnProperties := parquet.ColumnProperties{
		Encoding:          encoding,
		Codec:             compression,
		DictionaryEnabled: dict,
		StatsEnabled:      stats,
		CompressionLevel:  compressLvl,
	}
	writer := p.buildWriter(nrows, columnProperties, parquet.WithVersion(parquet.V1_0))
	p.WriteBatchValues(writer, nil, nil)
	// behavior should be independent of the number of calls to Close
	writer.Close()
	writer.Close()
}

func (p *PrimitiveWriterTestSuite) writeRequiredWithSettingsSpaced(encoding parquet.Encoding, compression compress.Compression, dict, stats bool, nrows int64, compressionLvl int) {
	validBits := make([]byte, int(bitutil.BytesForBits(int64(len(p.DefLevels))))+1)
	memory.Set(validBits, 255)
	columnProperties := parquet.ColumnProperties{
		Encoding:          encoding,
		Codec:             compression,
		DictionaryEnabled: dict,
		StatsEnabled:      stats,
		CompressionLevel:  compressionLvl,
	}
	writer := p.buildWriter(nrows, columnProperties, parquet.WithVersion(parquet.V1_0))
	p.WriteBatchValuesSpaced(writer, nil, nil, validBits, 0)
	// behavior should be independent of the number of calls to Close
	writer.Close()
	writer.Close()
}

func (p *PrimitiveWriterTestSuite) testRequiredWithSettings(encoding parquet.Encoding, compression compress.Compression, dict, stats bool, nrows int64, compressLvl int) {
	p.GenerateData(nrows)
	p.writeRequiredWithSettings(encoding, compression, dict, stats, compressLvl, nrows)
	p.NotPanics(func() { p.readAndCompare(compression, nrows) })
	p.writeRequiredWithSettingsSpaced(encoding, compression, dict, stats, nrows, compressLvl)
	p.NotPanics(func() { p.readAndCompare(compression, nrows) })
}

func (p *PrimitiveWriterTestSuite) testRequiredWithEncoding(encoding parquet.Encoding) {
	p.testRequiredWithSettings(encoding, compress.Codecs.Uncompressed, false, false, SmallSize, compress.DefaultCompressionLevel)
}
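
// The metadata* helpers below decode the thrift payload accumulated by the
// ColumnChunkMetaDataBuilder into a read-side ColumnChunkMetaData, i.e.
// they inspect exactly what would be written to the file footer for the
// chunk. Decode errors are deliberately ignored; the helpers run only
// after a successful Close.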

func (p *PrimitiveWriterTestSuite) metadataNumValues() int64 {
	// metadata accessor created lazily
	metadata, _ := metadata.NewColumnChunkMetaData(p.metadata.Contents(), p.descr, nil, 0, 0, nil)
	return metadata.NumValues()
}

func (p *PrimitiveWriterTestSuite) metadataEncodings() []parquet.Encoding {
	metadata, _ := metadata.NewColumnChunkMetaData(p.metadata.Contents(), p.descr, nil, 0, 0, nil)
	return metadata.Encodings()
}

func (p *PrimitiveWriterTestSuite) metadataEncodingStats() []metadata.PageEncodingStats {
	metadata, _ := metadata.NewColumnChunkMetaData(p.metadata.Contents(), p.descr, nil, 0, 0, nil)
	return metadata.EncodingStats()
}

func (p *PrimitiveWriterTestSuite) metadataStatsHasMinMax() (hasMin, hasMax bool) {
	appVersion := metadata.NewAppVersion(p.props.CreatedBy())
	metadata, _ := metadata.NewColumnChunkMetaData(p.metadata.Contents(), p.descr, appVersion, 0, 0, nil)
	stats, _ := metadata.Statistics()
	encoded, _ := stats.Encode()
	return encoded.HasMin, encoded.HasMax
}

func (p *PrimitiveWriterTestSuite) metadataIsStatsSet() bool {
	appVersion := metadata.NewAppVersion(p.props.CreatedBy())
	metadata, _ := metadata.NewColumnChunkMetaData(p.metadata.Contents(), p.descr, appVersion, 0, 0, nil)
	set, _ := metadata.StatsSet()
	return set
}
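
// Dictionary fallback mechanics: buildWriter caps the dictionary page at
// DictionaryPageSize (1 MiB), and the VeryLargeSize (400000) random values
// generated here overflow that cap, so the writer abandons dictionary
// encoding partway through the chunk and finishes with PLAIN data pages.
// The chunk metadata consequently lists both the dictionary encoding and
// PLAIN, and the per-page encoding stats separate the dictionary page from
// the data pages written before and after the fallback.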

func (p *PrimitiveWriterTestSuite) testDictionaryFallbackEncoding(version parquet.Version) {
	p.GenerateData(VeryLargeSize)
	props := parquet.DefaultColumnProperties()
	props.DictionaryEnabled = true

	if version == parquet.V1_0 {
		props.Encoding = parquet.Encodings.PlainDict
	} else {
		props.Encoding = parquet.Encodings.RLEDict
	}

	writer := p.buildWriter(VeryLargeSize, props, parquet.WithVersion(version))
	p.WriteBatchValues(writer, nil, nil)
	writer.Close()

	// read back all rows to be sure the non-dictionary (fallback) pages
	// are read correctly as well
	p.SetupValuesOut(VeryLargeSize)
	valuesRead := p.readColumnFully(compress.Codecs.Uncompressed)
	p.EqualValues(VeryLargeSize, valuesRead)
	p.Equal(p.Values, p.ValuesOut)

	encodings := p.metadataEncodings()
	if p.Typ.Kind() == reflect.Bool || p.Typ == reflect.TypeOf(parquet.Int96{}) {
		// dictionary encoding is not supported for booleans or Int96;
		// there are 2 encodings (PLAIN, RLE) in the non-dictionary case
		p.Equal([]parquet.Encoding{parquet.Encodings.Plain, parquet.Encodings.RLE}, encodings)
	} else if version == parquet.V1_0 {
		// there are 4 encodings (PLAIN_DICTIONARY, PLAIN, RLE, PLAIN) in a
		// fallback case for version 1.0
		p.Equal([]parquet.Encoding{parquet.Encodings.PlainDict, parquet.Encodings.Plain, parquet.Encodings.RLE, parquet.Encodings.Plain}, encodings)
	} else {
		// there are 4 encodings (RLE_DICTIONARY, PLAIN, RLE, PLAIN) in a
		// fallback case for version 2.0
		p.Equal([]parquet.Encoding{parquet.Encodings.RLEDict, parquet.Encodings.Plain, parquet.Encodings.RLE, parquet.Encodings.Plain}, encodings)
	}

	encodingStats := p.metadataEncodingStats()
	if p.Typ.Kind() == reflect.Bool || p.Typ == reflect.TypeOf(parquet.Int96{}) {
		p.Equal(parquet.Encodings.Plain, encodingStats[0].Encoding)
		p.Equal(format.PageType_DATA_PAGE, encodingStats[0].PageType)
	} else if version == parquet.V1_0 {
		expected := []metadata.PageEncodingStats{
			{Encoding: parquet.Encodings.PlainDict, PageType: format.PageType_DICTIONARY_PAGE},
			{Encoding: parquet.Encodings.Plain, PageType: format.PageType_DATA_PAGE},
			{Encoding: parquet.Encodings.PlainDict, PageType: format.PageType_DATA_PAGE}}
		p.Equal(expected[0], encodingStats[0])
		p.ElementsMatch(expected[1:], encodingStats[1:])
	} else {
		expected := []metadata.PageEncodingStats{
			{Encoding: parquet.Encodings.Plain, PageType: format.PageType_DICTIONARY_PAGE},
			{Encoding: parquet.Encodings.Plain, PageType: format.PageType_DATA_PAGE},
			{Encoding: parquet.Encodings.RLEDict, PageType: format.PageType_DATA_PAGE}}
		p.Equal(expected[0], encodingStats[0])
		p.ElementsMatch(expected[1:], encodingStats[1:])
	}
}

func (p *PrimitiveWriterTestSuite) testDictionaryFallbackAndCompressedSize(version parquet.Version) {
	// skip boolean as dictionary encoding is not used
	if p.Typ.Kind() == reflect.Bool {
		return
	}

	p.GenerateData(SmallSize)
	props := parquet.DefaultColumnProperties()
	props.DictionaryEnabled = true

	if version == parquet.V1_0 {
		props.Encoding = parquet.Encodings.PlainDict
	} else {
		props.Encoding = parquet.Encodings.RLEDict
	}

	writer := p.buildWriter(SmallSize, props, parquet.WithVersion(version), parquet.WithDataPageSize(SmallSize-1))
	p.WriteBatchValues(writer, nil, nil)
	p.NotZero(writer.TotalBytesWritten())
	writer.FallbackToPlain()
	p.NotZero(writer.TotalCompressedBytes())
	writer.Close()
	p.NotZero(writer.TotalCompressedBytes())
	p.NotZero(writer.TotalBytesWritten())
}

func (p *PrimitiveWriterTestSuite) TestRequiredPlain() {
	p.testRequiredWithEncoding(parquet.Encodings.Plain)
}

func (p *PrimitiveWriterTestSuite) TestRequiredDictionary() {
	p.testRequiredWithEncoding(parquet.Encodings.PlainDict)
}

func (p *PrimitiveWriterTestSuite) TestRequiredPlainWithStats() {
	p.testRequiredWithSettings(parquet.Encodings.Plain, compress.Codecs.Uncompressed, false, true, LargeSize, compress.DefaultCompressionLevel)
}

func (p *PrimitiveWriterTestSuite) TestRequiredPlainWithSnappy() {
	p.testRequiredWithSettings(parquet.Encodings.Plain, compress.Codecs.Snappy, false, false, LargeSize, compress.DefaultCompressionLevel)
}

func (p *PrimitiveWriterTestSuite) TestRequiredPlainWithStatsAndSnappy() {
	p.testRequiredWithSettings(parquet.Encodings.Plain, compress.Codecs.Snappy, false, true, LargeSize, compress.DefaultCompressionLevel)
}

func (p *PrimitiveWriterTestSuite) TestRequiredPlainWithBrotli() {
	p.testRequiredWithSettings(parquet.Encodings.Plain, compress.Codecs.Brotli, false, false, LargeSize, compress.DefaultCompressionLevel)
}

func (p *PrimitiveWriterTestSuite) TestRequiredPlainWithBrotliAndLevel() {
	p.testRequiredWithSettings(parquet.Encodings.Plain, compress.Codecs.Brotli, false, false, LargeSize, 10)
}

func (p *PrimitiveWriterTestSuite) TestRequiredPlainWithStatsAndBrotli() {
	p.testRequiredWithSettings(parquet.Encodings.Plain, compress.Codecs.Brotli, false, true, LargeSize, compress.DefaultCompressionLevel)
}

func (p *PrimitiveWriterTestSuite) TestRequiredPlainWithGzip() {
	p.testRequiredWithSettings(parquet.Encodings.Plain, compress.Codecs.Gzip, false, false, LargeSize, compress.DefaultCompressionLevel)
}

func (p *PrimitiveWriterTestSuite) TestRequiredPlainWithGzipAndLevel() {
	p.testRequiredWithSettings(parquet.Encodings.Plain, compress.Codecs.Gzip, false, false, LargeSize, 10)
}

func (p *PrimitiveWriterTestSuite) TestRequiredPlainWithStatsAndGzip() {
	p.testRequiredWithSettings(parquet.Encodings.Plain, compress.Codecs.Gzip, false, true, LargeSize, compress.DefaultCompressionLevel)
}

func (p *PrimitiveWriterTestSuite) TestRequiredPlainWithZstd() {
	p.testRequiredWithSettings(parquet.Encodings.Plain, compress.Codecs.Zstd, false, false, LargeSize, compress.DefaultCompressionLevel)
}

func (p *PrimitiveWriterTestSuite) TestRequiredPlainWithZstdAndLevel() {
	p.testRequiredWithSettings(parquet.Encodings.Plain, compress.Codecs.Zstd, false, false, LargeSize, 6)
}

func (p *PrimitiveWriterTestSuite) TestRequiredPlainWithStatsAndZstd() {
	p.testRequiredWithSettings(parquet.Encodings.Plain, compress.Codecs.Zstd, false, true, LargeSize, compress.DefaultCompressionLevel)
}

func (p *PrimitiveWriterTestSuite) TestOptionalNonRepeated() {
	p.SetupSchema(parquet.Repetitions.Optional, 1)
	p.descr = p.Schema.Column(0)

	p.GenerateData(SmallSize)
	p.DefLevels[1] = 0

	writer := p.buildWriter(SmallSize, parquet.DefaultColumnProperties(), parquet.WithVersion(parquet.V1_0))
	p.WriteBatchValues(writer, p.DefLevels, nil)
	writer.Close()

	p.Equal(int64(100), p.metadataNumValues())

	values := p.readColumn(compress.Codecs.Uncompressed)
	p.EqualValues(99, values)
	p.Equal(reflect.ValueOf(p.Values).Slice(0, 99).Interface(), reflect.ValueOf(p.ValuesOut).Slice(0, 99).Interface())
}
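
// TestOptionalSpaced writes "spaced" values, i.e. a values buffer with
// holes described by a validity bitmap. Entries 1 and SmallSize-1 are
// nulled, so only 98 of the 100 slots survive the round trip; the reflect
// juggling at the end of the test rebuilds the expected slice accordingly.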

func (p *PrimitiveWriterTestSuite) TestOptionalSpaced() {
	p.SetupSchema(parquet.Repetitions.Optional, 1)
	p.descr = p.Schema.Column(0)

	p.GenerateData(SmallSize)
	validBits := make([]byte, int(bitutil.BytesForBits(SmallSize)))
	memory.Set(validBits, 255)
	p.DefLevels[SmallSize-1] = 0
	bitutil.ClearBit(validBits, SmallSize-1)
	p.DefLevels[1] = 0
	bitutil.ClearBit(validBits, 1)

	writer := p.buildWriter(SmallSize, parquet.DefaultColumnProperties(), parquet.WithVersion(parquet.V1_0))
	p.WriteBatchValuesSpaced(writer, p.DefLevels, nil, validBits, 0)
	writer.Close()

	p.Equal(int64(100), p.metadataNumValues())

	values := p.readColumn(compress.Codecs.Uncompressed)
	p.EqualValues(98, values)

	// rebuild the expected values: truncating drops the trailing null,
	// then reflect.Copy shifts everything after index 1 left by one to
	// squeeze out the interior null
	orig := reflect.ValueOf(p.Values)
	orig = orig.Slice(0, 99)
	reflect.Copy(orig.Slice(1, orig.Len()), orig.Slice(2, orig.Len()))
	orig = orig.Slice(0, 98)
	out := reflect.ValueOf(p.ValuesOut)
	out = out.Slice(0, 98)

	p.Equal(orig.Interface(), out.Interface())
}

func (p *PrimitiveWriterTestSuite) TestWriteRepeated() {
	// optional and repeated, so both definition and repetition levels are written
	p.SetupSchema(parquet.Repetitions.Repeated, 1)
	p.descr = p.Schema.Column(0)
	p.GenerateData(SmallSize)
	p.DefLevels[1] = 0
	p.RepLevels = make([]int16, SmallSize)
	for idx := range p.RepLevels {
		p.RepLevels[idx] = 0
	}

	writer := p.buildWriter(SmallSize, parquet.DefaultColumnProperties(), parquet.WithVersion(parquet.V1_0))
	p.WriteBatchValues(writer, p.DefLevels, p.RepLevels)
	writer.Close()

	values := p.readColumn(compress.Codecs.Uncompressed)
	p.EqualValues(SmallSize-1, values)
	out := reflect.ValueOf(p.ValuesOut).Slice(0, SmallSize-1).Interface()
	vals := reflect.ValueOf(p.Values).Slice(0, SmallSize-1).Interface()
	p.Equal(vals, out)
}

func (p *PrimitiveWriterTestSuite) TestRequiredLargeChunk() {
	p.GenerateData(LargeSize)

	// required and non-repeated, so no def or rep levels
	writer := p.buildWriter(LargeSize, parquet.DefaultColumnProperties(), parquet.WithVersion(parquet.V1_0))
	p.WriteBatchValues(writer, nil, nil)
	writer.Close()

	// just read the first SmallSize rows to ensure we can read it back in
	values := p.readColumn(compress.Codecs.Uncompressed)
	p.EqualValues(SmallSize, values)
	p.Equal(reflect.ValueOf(p.Values).Slice(0, SmallSize).Interface(), p.ValuesOut)
}

func (p *PrimitiveWriterTestSuite) TestDictionaryFallbackEncodingV1() {
	p.testDictionaryFallbackEncoding(parquet.V1_0)
}

func (p *PrimitiveWriterTestSuite) TestDictionaryFallbackEncodingV2() {
	p.testDictionaryFallbackEncoding(parquet.V2_LATEST)
}

func (p *PrimitiveWriterTestSuite) TestDictionaryFallbackStatsV1() {
	p.testDictionaryFallbackAndCompressedSize(parquet.V1_0)
}

func (p *PrimitiveWriterTestSuite) TestDictionaryFallbackStatsV2() {
	p.testDictionaryFallbackAndCompressedSize(parquet.V2_LATEST)
}

func (p *PrimitiveWriterTestSuite) TestOptionalNullValueChunk() {
	// test case for NULL values
	p.SetupSchema(parquet.Repetitions.Optional, 1)
	p.descr = p.Schema.Column(0)
	p.GenerateData(LargeSize)
	p.RepLevels = make([]int16, LargeSize)
	for idx := range p.DefLevels {
		p.DefLevels[idx] = 0
		p.RepLevels[idx] = 0
	}

	writer := p.buildWriter(LargeSize, parquet.DefaultColumnProperties(), parquet.WithVersion(parquet.V1_0))
	p.WriteBatchValues(writer, p.DefLevels, p.RepLevels)
	writer.Close()

	valuesRead := p.readColumn(compress.Codecs.Uncompressed)
	p.Zero(valuesRead)
}

func createWriterTestSuite(typ reflect.Type) suite.TestingSuite {
	switch typ {
	case reflect.TypeOf(true):
		return &BooleanValueWriterSuite{PrimitiveWriterTestSuite{PrimitiveTypedTest: testutils.NewPrimitiveTypedTest(typ)}}
	case reflect.TypeOf(parquet.ByteArray{}):
		return &ByteArrayWriterSuite{PrimitiveWriterTestSuite{PrimitiveTypedTest: testutils.NewPrimitiveTypedTest(typ)}}
	}
	return &PrimitiveWriterTestSuite{PrimitiveTypedTest: testutils.NewPrimitiveTypedTest(typ)}
}

func TestColumnWriter(t *testing.T) {
	t.Parallel()
	types := []struct {
		typ reflect.Type
	}{
		{reflect.TypeOf(true)},
		{reflect.TypeOf(int32(0))},
		{reflect.TypeOf(int64(0))},
		{reflect.TypeOf(float32(0))},
		{reflect.TypeOf(float64(0))},
		{reflect.TypeOf(parquet.Int96{})},
		{reflect.TypeOf(parquet.ByteArray{})},
		{reflect.TypeOf(parquet.FixedLenByteArray{})},
	}
	for _, tt := range types {
		tt := tt
		t.Run(tt.typ.String(), func(t *testing.T) {
			t.Parallel()
			suite.Run(t, createWriterTestSuite(tt.typ))
		})
	}
}

type ByteArrayWriterSuite struct {
	PrimitiveWriterTestSuite
}
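
// TestOmitStats relies on the writer's maximum statistics size: byte-array
// values of 4-8 KiB exceed it, so the encoded chunk statistics omit min
// and max rather than bloating the footer. (The default limit in this
// module is believed to be 4096 bytes, parquet.DefaultMaxStatsSize;
// verify against the version in use.)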

func (b *ByteArrayWriterSuite) TestOmitStats() {
	// prevent writing large MIN, MAX stats
	minLen := 1024 * 4
	maxLen := 1024 * 8
	b.SetupSchema(parquet.Repetitions.Required, 1)
	b.Values = make([]parquet.ByteArray, SmallSize)
	writer := b.buildWriter(SmallSize, parquet.DefaultColumnProperties(), parquet.WithVersion(parquet.V1_0))
	testutils.RandomByteArray(0, b.Values.([]parquet.ByteArray), b.Buffer, minLen, maxLen)
	writer.(*file.ByteArrayColumnChunkWriter).WriteBatch(b.Values.([]parquet.ByteArray), nil, nil)
	writer.Close()

	hasMin, hasMax := b.metadataStatsHasMinMax()
	b.False(hasMin)
	b.False(hasMax)
}

func (b *ByteArrayWriterSuite) TestOmitDataPageStats() {
	// prevent writing large stats in the DataPageHeader
	minLen := math.Pow10(7)
	maxLen := math.Pow10(7)
	b.SetupSchema(parquet.Repetitions.Required, 1)
	colprops := parquet.DefaultColumnProperties()
	colprops.StatsEnabled = false

	writer := b.buildWriter(SmallSize, colprops, parquet.WithVersion(parquet.V1_0))
	b.Values = make([]parquet.ByteArray, 1)
	testutils.RandomByteArray(0, b.Values.([]parquet.ByteArray), b.Buffer, int(minLen), int(maxLen))
	writer.(*file.ByteArrayColumnChunkWriter).WriteBatch(b.Values.([]parquet.ByteArray), nil, nil)
	writer.Close()

	b.NotPanics(func() { b.readColumn(compress.Codecs.Uncompressed) })
}

func (b *ByteArrayWriterSuite) TestLimitStats() {
	minLen := 1024 * 4
	maxLen := 1024 * 8
	b.SetupSchema(parquet.Repetitions.Required, 1)
	colprops := parquet.DefaultColumnProperties()
	colprops.MaxStatsSize = int64(maxLen)

	writer := b.buildWriter(SmallSize, colprops, parquet.WithVersion(parquet.V1_0)).(*file.ByteArrayColumnChunkWriter)
	b.Values = make([]parquet.ByteArray, SmallSize)
	testutils.RandomByteArray(0, b.Values.([]parquet.ByteArray), b.Buffer, minLen, maxLen)
	writer.WriteBatch(b.Values.([]parquet.ByteArray), nil, nil)
	writer.Close()

	b.True(b.metadataIsStatsSet())
}

func (b *ByteArrayWriterSuite) TestCheckDefaultStats() {
	b.SetupSchema(parquet.Repetitions.Required, 1)
	writer := b.buildWriter(SmallSize, parquet.DefaultColumnProperties(), parquet.WithVersion(parquet.V1_0))
	b.GenerateData(SmallSize)
	b.WriteBatchValues(writer, nil, nil)
	writer.Close()

	b.True(b.metadataIsStatsSet())
}

type BooleanValueWriterSuite struct {
	PrimitiveWriterTestSuite
}

func (b *BooleanValueWriterSuite) TestAlternateBooleanValues() {
	b.SetupSchema(parquet.Repetitions.Required, 1)
	// use an unusual data-page size to try to flush out Boolean encoder
	// issues in the usage of the BitmapWriter
	writer := b.buildWriter(SmallSize, parquet.DefaultColumnProperties(), parquet.WithVersion(parquet.V1_0), parquet.WithDataPageSize(7)).(*file.BooleanColumnChunkWriter)
	for i := 0; i < SmallSize; i++ {
		val := i%2 == 0
		writer.WriteBatch([]bool{val}, nil, nil)
	}
	writer.Close()
	b.readColumn(compress.Codecs.Uncompressed)
	for i := 0; i < SmallSize; i++ {
		b.Equal(i%2 == 0, b.ValuesOut.([]bool)[i])
	}
}
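
// TestDictionaryReslice appears to be a regression-style check of buffered
// pqarrow writes for dictionary-encoded columns: 2000 identical values are
// written through WriteBuffered for every integer index width, exercising
// the writer's reslicing of dictionary index buffers across internal batch
// boundaries. The originating issue is not cited here; the assertion is
// simply that WriteBuffered succeeds for each index type.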
func TestDictionaryReslice(t *testing.T) {
	pts := []arrow.DataType{
		arrow.PrimitiveTypes.Int8,
		arrow.PrimitiveTypes.Int16,
		arrow.PrimitiveTypes.Int32,
		arrow.PrimitiveTypes.Int64,
		arrow.PrimitiveTypes.Uint8,
		arrow.PrimitiveTypes.Uint16,
		arrow.PrimitiveTypes.Uint32,
		arrow.PrimitiveTypes.Uint64,
	}
	for _, pt := range pts {
		t.Run(pt.String(), func(t *testing.T) {
			mem := memory.NewGoAllocator()
			dt := &arrow.DictionaryType{
				IndexType: pt,
				ValueType: &arrow.StringType{},
			}
			field := arrow.Field{Name: "test_field", Type: dt, Nullable: true}
			schema := arrow.NewSchema([]arrow.Field{field}, nil)
			b := array.NewRecordBuilder(mem, schema)
			for i := 0; i < 2000; i++ {
				b.Field(0).(*array.BinaryDictionaryBuilder).AppendString("test_value")
			}
			rec := b.NewRecord()
			out := &bytes.Buffer{}
			pqw, err := pqarrow.NewFileWriter(rec.Schema(), out, nil, pqarrow.NewArrowWriterProperties())
			assert.NoError(t, err)
			err = pqw.WriteBuffered(rec)
			assert.NoError(t, err)
		})
	}
}