// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package file_test

import (
	"bytes"
	"math"
	"reflect"
	"testing"

	"github.com/apache/arrow/go/v7/arrow/bitutil"
	"github.com/apache/arrow/go/v7/arrow/memory"
	"github.com/apache/arrow/go/v7/parquet"
	"github.com/apache/arrow/go/v7/parquet/compress"
	"github.com/apache/arrow/go/v7/parquet/file"
	"github.com/apache/arrow/go/v7/parquet/internal/encoding"
	"github.com/apache/arrow/go/v7/parquet/internal/encryption"
	format "github.com/apache/arrow/go/v7/parquet/internal/gen-go/parquet"
	"github.com/apache/arrow/go/v7/parquet/internal/testutils"
	"github.com/apache/arrow/go/v7/parquet/internal/utils"
	"github.com/apache/arrow/go/v7/parquet/metadata"
	"github.com/apache/arrow/go/v7/parquet/schema"
	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/mock"
	"github.com/stretchr/testify/suite"
)

// Row-count constants used to size the generated test data.
const (
	SmallSize = 100
	// larger to test some corner cases, only in some specific cases
	LargeSize = 100000
	// very large to test dictionary fallback
	VeryLargeSize = 400000
	// dictionary page size for testing fallback
	DictionaryPageSize = 1024 * 1024
)

// mockpagewriter is a testify mock implementing the page-writer interface
// consumed by file.NewColumnChunkWriter, so tests can intercept and inspect
// the pages a column writer emits without serializing anything.
type mockpagewriter struct {
	mock.Mock
}

func (m *mockpagewriter) Close(hasDict, fallBack bool) error {
	return m.Called(hasDict, fallBack).Error(0)
}
func (m *mockpagewriter) WriteDataPage(page file.DataPage) (int64, error) {
	args := m.Called(page)
	return int64(args.Int(0)), args.Error(1)
}
func (m *mockpagewriter) WriteDictionaryPage(page *file.DictionaryPage) (int64, error) {
	args := m.Called(page)
	return int64(args.Int(0)), args.Error(1)
}
func (m *mockpagewriter) HasCompressor() bool {
	return m.Called().Bool(0)
}
func (m *mockpagewriter) Compress(buf *bytes.Buffer, src []byte) []byte {
	return m.Called(buf, src).Get(0).([]byte)
}
func (m *mockpagewriter) Reset(sink utils.WriterTell, codec compress.Compression, compressionLevel int, metadata *metadata.ColumnChunkMetaDataBuilder, rgOrdinal, columnOrdinal int16, metaEncryptor, dataEncryptor encryption.Encryptor) error {
	return m.Called().Error(0)
}

// TestWriteDataPageV1NumValues verifies that a flushed V1 data page reports
// the total number of values (including nulls) and the null count in its
// encoded statistics for a nested (list) column.
func TestWriteDataPageV1NumValues(t *testing.T) {
	sc := schema.NewSchema(schema.MustGroup(schema.NewGroupNode("schema", parquet.Repetitions.Required, schema.FieldList{
		schema.Must(schema.ListOf(
			schema.Must(schema.NewPrimitiveNode("column", parquet.Repetitions.Optional, parquet.Types.Int32, -1, -1)),
			parquet.Repetitions.Optional, -1)),
	}, -1)))
	descr := sc.Column(0)
	props := parquet.NewWriterProperties(
		parquet.WithStats(true),
		parquet.WithVersion(parquet.V1_0),
		parquet.WithDataPageVersion(parquet.DataPageV1),
		parquet.WithDictionaryDefault(false))

	metadata := metadata.NewColumnChunkMetaDataBuilder(props, descr)
	pager := new(mockpagewriter)
	defer pager.AssertExpectations(t)
	pager.On("HasCompressor").Return(false)
	wr := file.NewColumnChunkWriter(metadata, pager, props).(*file.Int32ColumnChunkWriter)

	// write a list "[[0, 1], null, [2, null, 3]]"
	// should be 6 values, 2 nulls and 3 rows
	wr.WriteBatch([]int32{0, 1, 2, 3},
		[]int16{3, 3, 0, 3, 2, 3},
		[]int16{0, 1, 0, 0, 1, 1})

	pager.On("WriteDataPage", mock.MatchedBy(func(page file.DataPage) bool {
		pagev1, ok := page.(*file.DataPageV1)
		if !ok {
			return false
		}

		encodedStats := pagev1.Statistics()
		// only match if the page being written has 2 nulls, 6 values and 3 rows
		return pagev1.NumValues() == 6 &&
			encodedStats.HasNullCount &&
			encodedStats.NullCount == 2
	})).Return(10, nil)

	wr.FlushBufferedDataPages()
	assert.EqualValues(t, 3, wr.RowsWritten())
}

// TestWriteDataPageV2NumRows verifies that a flushed V2 data page carries the
// correct null count, value count and row count for a nested (list) column.
func TestWriteDataPageV2NumRows(t *testing.T) {
	// test issue from PARQUET-2066
	sc := schema.NewSchema(schema.MustGroup(schema.NewGroupNode("schema", parquet.Repetitions.Required, schema.FieldList{
		schema.Must(schema.ListOf(
			schema.Must(schema.NewPrimitiveNode("column", parquet.Repetitions.Optional, parquet.Types.Int32, -1, -1)),
			parquet.Repetitions.Optional, -1)),
	}, -1)))
	descr := sc.Column(0)
	props := parquet.NewWriterProperties(
		parquet.WithStats(true),
		parquet.WithVersion(parquet.V2_LATEST),
		parquet.WithDataPageVersion(parquet.DataPageV2),
		parquet.WithDictionaryDefault(false))

	metadata := metadata.NewColumnChunkMetaDataBuilder(props, descr)
	pager := new(mockpagewriter)
	defer pager.AssertExpectations(t)
	pager.On("HasCompressor").Return(false)
	wr := file.NewColumnChunkWriter(metadata, pager, props).(*file.Int32ColumnChunkWriter)

	// write a list "[[0, 1], null, [2, null, 3]]"
	// should be 6 values, 2 nulls and 3 rows
	wr.WriteBatch([]int32{0, 1, 2, 3},
		[]int16{3, 3, 0, 3, 2, 3},
		[]int16{0, 1, 0, 0, 1, 1})

	pager.On("WriteDataPage", mock.MatchedBy(func(page file.DataPage) bool {
		pagev2, ok := page.(*file.DataPageV2)
		if !ok {
			return false
		}

		encodedStats := pagev2.Statistics()
		// only match if the page being written has 2 nulls, 6 values and 3 rows
		return !pagev2.IsCompressed() &&
			pagev2.NumNulls() == 2 && encodedStats.NullCount == 2 &&
			pagev2.NumValues() == 6 &&
			pagev2.NumRows() == 3
	})).Return(10, nil)

	wr.FlushBufferedDataPages()
	assert.EqualValues(t, 3, wr.RowsWritten())
}

// TestDataPageV2RowBoundaries checks that when a small data-page size forces
// the writer to split pages, V2 pages are only split on row boundaries
// (a list never straddles two pages).
func TestDataPageV2RowBoundaries(t *testing.T) {
	sc := schema.NewSchema(schema.MustGroup(schema.NewGroupNode("schema", parquet.Repetitions.Required, schema.FieldList{
		schema.Must(schema.ListOf(
			schema.Must(schema.NewPrimitiveNode("column", parquet.Repetitions.Optional, parquet.Types.Int32, -1, -1)),
			parquet.Repetitions.Optional, -1)),
	}, -1)))
	descr := sc.Column(0)
	props := parquet.NewWriterProperties(
		parquet.WithBatchSize(128),
		parquet.WithDataPageSize(1024),
		parquet.WithVersion(parquet.V2_LATEST),
		parquet.WithDataPageVersion(parquet.DataPageV2),
		parquet.WithDictionaryDefault(false))

	metadata := metadata.NewColumnChunkMetaDataBuilder(props, descr)
	pager := new(mockpagewriter)
	defer pager.AssertExpectations(t)
	pager.On("HasCompressor").Return(false)
	wr := file.NewColumnChunkWriter(metadata, pager, props).(*file.Int32ColumnChunkWriter)

	pager.On("WriteDataPage", mock.MatchedBy(func(page file.DataPage) bool {
		pagev2, ok := page.(*file.DataPageV2)
		if !ok {
			return false
		}

		// only match pages that end exactly on a row boundary:
		// 126 rows of 3 values each = 378 values, no nulls
		return !pagev2.IsCompressed() &&
			pagev2.NumNulls() == 0 &&
			pagev2.NumValues() == 378 &&
			pagev2.NumRows() == 126
	})).Return(10, nil)

	// create rows of lists of 3 values each
	values := make([]int32, 1024)
	defLevels := make([]int16, 1024)
	repLevels := make([]int16, 1024)
	for i := range values {
		values[i] = int32(i)
		defLevels[i] = 3

		switch i % 3 {
		case 0:
			repLevels[i] = 0
		case 1, 2:
			repLevels[i] = 1
		}
	}

	wr.WriteBatch(values, defLevels, repLevels)
}
// PrimitiveWriterTestSuite is a typed test suite that round-trips values
// through a column chunk writer and reader for one primitive parquet type
// (the concrete type comes from the embedded PrimitiveTypedTest).
type PrimitiveWriterTestSuite struct {
	testutils.PrimitiveTypedTest
	suite.Suite

	props *parquet.WriterProperties
	descr *schema.Column

	metadata   *metadata.ColumnChunkMetaDataBuilder
	sink       *encoding.BufferWriter
	readbuffer *memory.Buffer
	reader     file.ColumnChunkReader
}

// SetupTest resets the suite to a single required column and a small output
// buffer before each test.
func (p *PrimitiveWriterTestSuite) SetupTest() {
	p.SetupValuesOut(SmallSize)
	p.props = parquet.NewWriterProperties()
	p.SetupSchema(parquet.Repetitions.Required, 1)
	p.descr = p.Schema.Column(0)
}

// buildReader finalizes the in-memory sink written by buildWriter and returns
// a column chunk reader over its contents.
func (p *PrimitiveWriterTestSuite) buildReader(nrows int64, compression compress.Compression) file.ColumnChunkReader {
	p.readbuffer = p.sink.Finish()
	pagereader, _ := file.NewPageReader(bytes.NewReader(p.readbuffer.Bytes()), nrows, compression, mem, nil)
	return file.NewColumnReader(p.descr, pagereader, mem)
}

// buildWriter creates a column chunk writer over a fresh in-memory sink,
// translating columnProps into writer properties. Dictionary encodings enable
// the dictionary with DictionaryPageSize as its limit; any other encoding is
// set directly with the dictionary disabled.
func (p *PrimitiveWriterTestSuite) buildWriter(_ int64, columnProps parquet.ColumnProperties, version parquet.Version) file.ColumnChunkWriter {
	p.sink = encoding.NewBufferWriter(0, mem)
	opts := make([]parquet.WriterProperty, 0)
	opts = append(opts, parquet.WithVersion(version))
	if columnProps.Encoding == parquet.Encodings.PlainDict || columnProps.Encoding == parquet.Encodings.RLEDict {
		opts = append(opts, parquet.WithDictionaryDefault(true), parquet.WithDictionaryPageSizeLimit(DictionaryPageSize))
	} else {
		opts = append(opts, parquet.WithDictionaryDefault(false), parquet.WithEncoding(columnProps.Encoding))
	}
	opts = append(opts, parquet.WithMaxStatsSize(columnProps.MaxStatsSize), parquet.WithStats(columnProps.StatsEnabled))
	p.props = parquet.NewWriterProperties(opts...)

	p.metadata = metadata.NewColumnChunkMetaDataBuilder(p.props, p.descr)
	pager, _ := file.NewPageWriter(p.sink, columnProps.Codec, compress.DefaultCompressionLevel, p.metadata, -1, -1, memory.DefaultAllocator, false, nil, nil)
	return file.NewColumnChunkWriter(p.metadata, pager, p.props)
}

// readColumn reads back one batch of up to len(DefLevelsOut) values and
// returns the number of values read.
func (p *PrimitiveWriterTestSuite) readColumn(compression compress.Compression) int64 {
	totalValues := int64(len(p.DefLevelsOut))
	reader := p.buildReader(totalValues, compression)
	return p.ReadBatch(reader, totalValues, 0, p.DefLevelsOut, p.RepLevelsOut)
}

// readColumnFully loops ReadBatch until all expected values have been
// consumed, so multi-page chunks are fully exercised.
func (p *PrimitiveWriterTestSuite) readColumnFully(compression compress.Compression) int64 {
	totalValues := int64(len(p.DefLevelsOut))
	reader := p.buildReader(totalValues, compression)
	valuesRead := int64(0)
	for valuesRead < totalValues {
		read := p.ReadBatch(reader, totalValues-valuesRead, valuesRead, p.DefLevelsOut[valuesRead:], p.RepLevelsOut[valuesRead:])
		valuesRead += read
	}
	return valuesRead
}

// readAndCompare reads the chunk back in full and asserts the decoded values
// equal what was written.
func (p *PrimitiveWriterTestSuite) readAndCompare(compression compress.Compression, nrows int64) {
	p.SetupValuesOut(nrows)
	p.readColumnFully(compression)
	p.Equal(p.Values, p.ValuesOut)
}

// writeRequiredWithSettings writes the generated values with the given
// encoding/codec/stats configuration (non-spaced path).
func (p *PrimitiveWriterTestSuite) writeRequiredWithSettings(encoding parquet.Encoding, compression compress.Compression, dict, stats bool, compressLvl int, nrows int64) {
	columnProperties := parquet.ColumnProperties{
		Encoding:          encoding,
		Codec:             compression,
		DictionaryEnabled: dict,
		StatsEnabled:      stats,
		CompressionLevel:  compressLvl,
	}
	writer := p.buildWriter(nrows, columnProperties, parquet.V1_0)
	p.WriteBatchValues(writer, nil, nil)
	// behavior should be independent of the number of calls to Close
	writer.Close()
	writer.Close()
}

// writeRequiredWithSettingsSpaced is the spaced-write counterpart of
// writeRequiredWithSettings; all validity bits are set so no value is null.
func (p *PrimitiveWriterTestSuite) writeRequiredWithSettingsSpaced(encoding parquet.Encoding, compression compress.Compression, dict, stats bool, nrows int64, compressionLvl int) {
	validBits := make([]byte, int(bitutil.BytesForBits(int64(len(p.DefLevels))))+1)
	memory.Set(validBits, 255)
	columnProperties := parquet.ColumnProperties{
		Encoding:          encoding,
		Codec:             compression,
		DictionaryEnabled: dict,
		StatsEnabled:      stats,
		CompressionLevel:  compressionLvl,
	}
	writer := p.buildWriter(nrows, columnProperties, parquet.V1_0)
	p.WriteBatchValuesSpaced(writer, nil, nil, validBits, 0)
	// behavior should be independent from the number of close calls
	writer.Close()
	writer.Close()
}

// testRequiredWithSettings round-trips data through both the plain and the
// spaced write paths with the same settings.
func (p *PrimitiveWriterTestSuite) testRequiredWithSettings(encoding parquet.Encoding, compression compress.Compression, dict, stats bool, nrows int64, compressLvl int) {
	p.GenerateData(nrows)
	p.writeRequiredWithSettings(encoding, compression, dict, stats, compressLvl, nrows)
	p.NotPanics(func() { p.readAndCompare(compression, nrows) })
	p.writeRequiredWithSettingsSpaced(encoding, compression, dict, stats, nrows, compressLvl)
	p.NotPanics(func() { p.readAndCompare(compression, nrows) })
}

// testRequiredWithEncoding exercises an encoding with no compression, no
// dictionary and no stats on a small dataset.
func (p *PrimitiveWriterTestSuite) testRequiredWithEncoding(encoding parquet.Encoding) {
	p.testRequiredWithSettings(encoding, compress.Codecs.Uncompressed, false, false, SmallSize, compress.DefaultCompressionLevel)
}

// metadataNumValues returns the value count recorded in the written chunk's
// metadata.
func (p *PrimitiveWriterTestSuite) metadataNumValues() int64 {
	// metadata accessor created lazily
	metadata, _ := metadata.NewColumnChunkMetaData(p.metadata.Contents(), p.descr, nil, 0, 0, nil)
	return metadata.NumValues()
}

// metadataEncodings returns the list of encodings recorded in the chunk
// metadata.
func (p *PrimitiveWriterTestSuite) metadataEncodings() []parquet.Encoding {
	metadata, _ := metadata.NewColumnChunkMetaData(p.metadata.Contents(), p.descr, nil, 0, 0, nil)
	return metadata.Encodings()
}

// metadataEncodingStats returns the per-page encoding statistics recorded in
// the chunk metadata.
func (p *PrimitiveWriterTestSuite) metadataEncodingStats() []metadata.PageEncodingStats {
	metadata, _ := metadata.NewColumnChunkMetaData(p.metadata.Contents(), p.descr, nil, 0, 0, nil)
	return metadata.EncodingStats()
}

// metadataStatsHasMinMax reports whether the encoded chunk statistics carry
// min and max values.
func (p *PrimitiveWriterTestSuite) metadataStatsHasMinMax() (hasMin, hasMax bool) {
	appVersion := metadata.NewAppVersion(p.props.CreatedBy())
	metadata, _ := metadata.NewColumnChunkMetaData(p.metadata.Contents(), p.descr, appVersion, 0, 0, nil)
	stats, _ := metadata.Statistics()
	encoded, _ := stats.Encode()
	return encoded.HasMin, encoded.HasMax
}

// metadataIsStatsSet reports whether the chunk metadata has statistics set at
// all.
func (p *PrimitiveWriterTestSuite) metadataIsStatsSet() bool {
	appVersion := metadata.NewAppVersion(p.props.CreatedBy())
	metadata, _ := metadata.NewColumnChunkMetaData(p.metadata.Contents(), p.descr, appVersion, 0, 0, nil)
	set, _ := metadata.StatsSet()
	return set
}

// testDictionaryFallbackEncoding writes enough distinct values to overflow
// the dictionary page limit and asserts the writer falls back to plain
// encoding, checking both the data round-trip and the encodings/encoding
// stats recorded in the metadata for the given format version.
func (p *PrimitiveWriterTestSuite) testDictionaryFallbackEncoding(version parquet.Version) {
	p.GenerateData(VeryLargeSize)
	props := parquet.DefaultColumnProperties()
	props.DictionaryEnabled = true

	if version == parquet.V1_0 {
		props.Encoding = parquet.Encodings.PlainDict
	} else {
		props.Encoding = parquet.Encodings.RLEDict
	}

	writer := p.buildWriter(VeryLargeSize, props, version)
	p.WriteBatchValues(writer, nil, nil)
	writer.Close()

	// Read all the rows so that we are sure that also the non-dictionary pages are read correctly
	p.SetupValuesOut(VeryLargeSize)
	valuesRead := p.readColumnFully(compress.Codecs.Uncompressed)
	p.EqualValues(VeryLargeSize, valuesRead)
	p.Equal(p.Values, p.ValuesOut)

	encodings := p.metadataEncodings()
	if p.Typ.Kind() == reflect.Bool || p.Typ == reflect.TypeOf(parquet.Int96{}) {
		// dictionary encoding is not allowed for booleans
		// there are 2 encodings (PLAIN, RLE) in a non dictionary encoding case
		p.Equal([]parquet.Encoding{parquet.Encodings.Plain, parquet.Encodings.RLE}, encodings)
	} else if version == parquet.V1_0 {
		// There are 4 encodings (PLAIN_DICTIONARY, PLAIN, RLE, PLAIN) in a fallback case
		// for version 1.0
		p.Equal([]parquet.Encoding{parquet.Encodings.PlainDict, parquet.Encodings.Plain, parquet.Encodings.RLE, parquet.Encodings.Plain}, encodings)
	} else {
		// There are 4 encodings (RLE_DICTIONARY, PLAIN, RLE, PLAIN) in a fallback case for
		// version 2.0
		p.Equal([]parquet.Encoding{parquet.Encodings.RLEDict, parquet.Encodings.Plain, parquet.Encodings.RLE, parquet.Encodings.Plain}, encodings)
	}

	encodingStats := p.metadataEncodingStats()
	if p.Typ.Kind() == reflect.Bool || p.Typ == reflect.TypeOf(parquet.Int96{}) {
		p.Equal(parquet.Encodings.Plain, encodingStats[0].Encoding)
		p.Equal(format.PageType_DATA_PAGE, encodingStats[0].PageType)
	} else if version == parquet.V1_0 {
		expected := []metadata.PageEncodingStats{
			{Encoding: parquet.Encodings.PlainDict, PageType: format.PageType_DICTIONARY_PAGE},
			{Encoding: parquet.Encodings.Plain, PageType: format.PageType_DATA_PAGE},
			{Encoding: parquet.Encodings.PlainDict, PageType: format.PageType_DATA_PAGE}}
		p.Equal(expected[0], encodingStats[0])
		p.ElementsMatch(expected[1:], encodingStats[1:])
	} else {
		expected := []metadata.PageEncodingStats{
			{Encoding: parquet.Encodings.Plain, PageType: format.PageType_DICTIONARY_PAGE},
			{Encoding: parquet.Encodings.Plain, PageType: format.PageType_DATA_PAGE},
			{Encoding: parquet.Encodings.RLEDict, PageType: format.PageType_DATA_PAGE}}
		p.Equal(expected[0], encodingStats[0])
		p.ElementsMatch(expected[1:], encodingStats[1:])
	}
}

func (p *PrimitiveWriterTestSuite) TestRequiredPlain() {
	p.testRequiredWithEncoding(parquet.Encodings.Plain)
}

func (p *PrimitiveWriterTestSuite) TestRequiredDictionary() {
	p.testRequiredWithEncoding(parquet.Encodings.PlainDict)
}

func (p *PrimitiveWriterTestSuite) TestRequiredPlainWithStats() {
	p.testRequiredWithSettings(parquet.Encodings.Plain, compress.Codecs.Uncompressed, false, true, LargeSize, compress.DefaultCompressionLevel)
}
// The TestRequiredPlainWith* methods below sweep the supported codecs
// (Snappy, Brotli, Gzip, Zstd), with and without statistics, and with an
// explicit compression level where one makes sense.

func (p *PrimitiveWriterTestSuite) TestRequiredPlainWithSnappy() {
	p.testRequiredWithSettings(parquet.Encodings.Plain, compress.Codecs.Snappy, false, false, LargeSize, compress.DefaultCompressionLevel)
}

func (p *PrimitiveWriterTestSuite) TestRequiredPlainWithStatsAndSnappy() {
	p.testRequiredWithSettings(parquet.Encodings.Plain, compress.Codecs.Snappy, false, true, LargeSize, compress.DefaultCompressionLevel)
}

func (p *PrimitiveWriterTestSuite) TestRequiredPlainWithBrotli() {
	p.testRequiredWithSettings(parquet.Encodings.Plain, compress.Codecs.Brotli, false, false, LargeSize, compress.DefaultCompressionLevel)
}

func (p *PrimitiveWriterTestSuite) TestRequiredPlainWithBrotliAndLevel() {
	p.testRequiredWithSettings(parquet.Encodings.Plain, compress.Codecs.Brotli, false, false, LargeSize, 10)
}

func (p *PrimitiveWriterTestSuite) TestRequiredPlainWithStatsAndBrotli() {
	p.testRequiredWithSettings(parquet.Encodings.Plain, compress.Codecs.Brotli, false, true, LargeSize, compress.DefaultCompressionLevel)
}

func (p *PrimitiveWriterTestSuite) TestRequiredPlainWithGzip() {
	p.testRequiredWithSettings(parquet.Encodings.Plain, compress.Codecs.Gzip, false, false, LargeSize, compress.DefaultCompressionLevel)
}

func (p *PrimitiveWriterTestSuite) TestRequiredPlainWithGzipAndLevel() {
	p.testRequiredWithSettings(parquet.Encodings.Plain, compress.Codecs.Gzip, false, false, LargeSize, 10)
}

func (p *PrimitiveWriterTestSuite) TestRequiredPlainWithStatsAndGzip() {
	p.testRequiredWithSettings(parquet.Encodings.Plain, compress.Codecs.Gzip, false, true, LargeSize, compress.DefaultCompressionLevel)
}

func (p *PrimitiveWriterTestSuite) TestRequiredPlainWithZstd() {
	p.testRequiredWithSettings(parquet.Encodings.Plain, compress.Codecs.Zstd, false, false, LargeSize, compress.DefaultCompressionLevel)
}

func (p *PrimitiveWriterTestSuite) TestRequiredPlainWithZstdAndLevel() {
	p.testRequiredWithSettings(parquet.Encodings.Plain, compress.Codecs.Zstd, false, false, LargeSize, 6)
}

func (p *PrimitiveWriterTestSuite) TestRequiredPlainWithStatsAndZstd() {
	p.testRequiredWithSettings(parquet.Encodings.Plain, compress.Codecs.Zstd, false, true, LargeSize, compress.DefaultCompressionLevel)
}

// TestOptionalNonRepeated writes an optional column with one null (def level
// 0 at index 1) and checks the metadata value count and the 99 decoded
// values.
func (p *PrimitiveWriterTestSuite) TestOptionalNonRepeated() {
	p.SetupSchema(parquet.Repetitions.Optional, 1)
	p.descr = p.Schema.Column(0)

	p.GenerateData(SmallSize)
	p.DefLevels[1] = 0

	writer := p.buildWriter(SmallSize, parquet.DefaultColumnProperties(), parquet.V1_0)
	p.WriteBatchValues(writer, p.DefLevels, nil)
	writer.Close()

	p.Equal(int64(100), p.metadataNumValues())

	values := p.readColumn(compress.Codecs.Uncompressed)
	p.EqualValues(99, values)
	p.Equal(reflect.ValueOf(p.Values).Slice(0, 99).Interface(), reflect.ValueOf(p.ValuesOut).Slice(0, 99).Interface())
}

// TestOptionalSpaced exercises the spaced write path with two nulls (indexes
// 1 and SmallSize-1) marked both in the def levels and the validity bitmap,
// then compares the 98 surviving values against the source with the nulls
// squeezed out via reflect.Copy.
func (p *PrimitiveWriterTestSuite) TestOptionalSpaced() {
	p.SetupSchema(parquet.Repetitions.Optional, 1)
	p.descr = p.Schema.Column(0)

	p.GenerateData(SmallSize)
	validBits := make([]byte, int(bitutil.BytesForBits(SmallSize)))
	memory.Set(validBits, 255)
	p.DefLevels[SmallSize-1] = 0
	bitutil.ClearBit(validBits, SmallSize-1)
	p.DefLevels[1] = 0
	bitutil.ClearBit(validBits, 1)

	writer := p.buildWriter(SmallSize, parquet.DefaultColumnProperties(), parquet.V1_0)
	p.WriteBatchValuesSpaced(writer, p.DefLevels, nil, validBits, 0)
	writer.Close()

	p.Equal(int64(100), p.metadataNumValues())

	values := p.readColumn(compress.Codecs.Uncompressed)
	p.EqualValues(98, values)

	// drop the two null slots from the expected values before comparing
	orig := reflect.ValueOf(p.Values)
	orig = orig.Slice(0, 99)
	reflect.Copy(orig.Slice(1, orig.Len()), orig.Slice(2, orig.Len()))
	orig = orig.Slice(0, 98)
	out := reflect.ValueOf(p.ValuesOut)
	out = out.Slice(0, 98)

	p.Equal(orig.Interface(), out.Interface())
}

// TestWriteRepeated writes a repeated column (def and rep levels supplied)
// with one null and verifies the SmallSize-1 decoded values.
func (p *PrimitiveWriterTestSuite) TestWriteRepeated() {
	// optional and repeated so def and repetition levels
	p.SetupSchema(parquet.Repetitions.Repeated, 1)
	p.descr = p.Schema.Column(0)
	p.GenerateData(SmallSize)
	p.DefLevels[1] = 0
	p.RepLevels = make([]int16, SmallSize)
	for idx := range p.RepLevels {
		p.RepLevels[idx] = 0
	}

	writer := p.buildWriter(SmallSize, parquet.DefaultColumnProperties(), parquet.V1_0)
	p.WriteBatchValues(writer, p.DefLevels, p.RepLevels)
	writer.Close()

	values := p.readColumn(compress.Codecs.Uncompressed)
	p.EqualValues(SmallSize-1, values)
	out := reflect.ValueOf(p.ValuesOut).Slice(0, SmallSize-1).Interface()
	vals := reflect.ValueOf(p.Values).Slice(0, SmallSize-1).Interface()
	p.Equal(vals, out)
}

// TestRequiredLargeChunk writes LargeSize values and reads back only the
// first SmallSize to confirm partial reads of a large chunk work.
func (p *PrimitiveWriterTestSuite) TestRequiredLargeChunk() {
	p.GenerateData(LargeSize)

	// Test 1: required and non-repeated, so no def or rep levels
	writer := p.buildWriter(LargeSize, parquet.DefaultColumnProperties(), parquet.V1_0)
	p.WriteBatchValues(writer, nil, nil)
	writer.Close()

	// just read the first SmallSize rows to ensure we could read it back in
	values := p.readColumn(compress.Codecs.Uncompressed)
	p.EqualValues(SmallSize, values)
	p.Equal(reflect.ValueOf(p.Values).Slice(0, SmallSize).Interface(), p.ValuesOut)
}

func (p *PrimitiveWriterTestSuite) TestDictionaryFallbackEncodingV1() {
	p.testDictionaryFallbackEncoding(parquet.V1_0)
}

func (p *PrimitiveWriterTestSuite) TestDictionaryFallbackEncodingV2() {
	p.testDictionaryFallbackEncoding(parquet.V2_LATEST)
}

// TestOptionalNullValueChunk writes a chunk consisting entirely of nulls and
// verifies that zero values are read back.
func (p *PrimitiveWriterTestSuite) TestOptionalNullValueChunk() {
	// test case for NULL values
	p.SetupSchema(parquet.Repetitions.Optional, 1)
	p.descr = p.Schema.Column(0)
	p.GenerateData(LargeSize)
	p.RepLevels = make([]int16, LargeSize)
	for idx := range p.DefLevels {
		p.DefLevels[idx] = 0
		p.RepLevels[idx] = 0
	}

	writer := p.buildWriter(LargeSize, parquet.DefaultColumnProperties(), parquet.V1_0)
	p.WriteBatchValues(writer, p.DefLevels, p.RepLevels)
	writer.Close()

	valuesRead := p.readColumn(compress.Codecs.Uncompressed)
	p.Zero(valuesRead)
}

// createWriterTestSuite picks the suite specialization for a physical type:
// booleans and byte arrays get extra tests, everything else uses the base
// suite.
func createWriterTestSuite(typ reflect.Type) suite.TestingSuite {
	switch typ {
	case reflect.TypeOf(true):
		return &BooleanValueWriterSuite{PrimitiveWriterTestSuite{PrimitiveTypedTest: testutils.NewPrimitiveTypedTest(typ)}}
	case reflect.TypeOf(parquet.ByteArray{}):
		return &ByteArrayWriterSuite{PrimitiveWriterTestSuite{PrimitiveTypedTest: testutils.NewPrimitiveTypedTest(typ)}}
	}
	return &PrimitiveWriterTestSuite{PrimitiveTypedTest: testutils.NewPrimitiveTypedTest(typ)}
}

// TestColumnWriter runs the writer suite for every parquet physical type, one
// parallel subtest per type.
func TestColumnWriter(t *testing.T) {
	t.Parallel()
	types := []struct {
		typ reflect.Type
	}{
		{reflect.TypeOf(true)},
		{reflect.TypeOf(int32(0))},
		{reflect.TypeOf(int64(0))},
		{reflect.TypeOf(float32(0))},
		{reflect.TypeOf(float64(0))},
		{reflect.TypeOf(parquet.Int96{})},
		{reflect.TypeOf(parquet.ByteArray{})},
		{reflect.TypeOf(parquet.FixedLenByteArray{})},
	}
	for _, tt := range types {
		tt := tt // capture range variable for the parallel closure
		t.Run(tt.typ.String(), func(t *testing.T) {
			t.Parallel()
			suite.Run(t, createWriterTestSuite(tt.typ))
		})
	}
}

// ByteArrayWriterSuite adds statistics-size tests specific to BYTE_ARRAY
// columns.
type ByteArrayWriterSuite struct {
	PrimitiveWriterTestSuite
}

// TestOmitStats checks that oversized min/max values are omitted from the
// chunk statistics.
func (b *ByteArrayWriterSuite) TestOmitStats() {
	// prevent writing large MIN,MAX stats
	minLen := 1024 * 4
	maxLen := 1024 * 8
	b.SetupSchema(parquet.Repetitions.Required, 1)
	b.Values = make([]parquet.ByteArray, SmallSize)
	writer := b.buildWriter(SmallSize, parquet.DefaultColumnProperties(), parquet.V1_0)
	testutils.RandomByteArray(0, b.Values.([]parquet.ByteArray), b.Buffer, minLen, maxLen)
	writer.(*file.ByteArrayColumnChunkWriter).WriteBatch(b.Values.([]parquet.ByteArray), nil, nil)
	writer.Close()

	hasMin, hasMax := b.metadataStatsHasMinMax()
	b.False(hasMin)
	b.False(hasMax)
}

// TestOmitDataPageStats verifies a huge single value with stats disabled can
// still be written and read back without panicking.
func (b *ByteArrayWriterSuite) TestOmitDataPageStats() {
	// prevent writing large stats in DataPageHeader
	minLen := math.Pow10(7)
	maxLen := math.Pow10(7)
	b.SetupSchema(parquet.Repetitions.Required, 1)
	colprops := parquet.DefaultColumnProperties()
	colprops.StatsEnabled = false

	writer := b.buildWriter(SmallSize, colprops, parquet.V1_0)
	b.Values = make([]parquet.ByteArray, 1)
	testutils.RandomByteArray(0, b.Values.([]parquet.ByteArray), b.Buffer, int(minLen), int(maxLen))
	writer.(*file.ByteArrayColumnChunkWriter).WriteBatch(b.Values.([]parquet.ByteArray), nil, nil)
	writer.Close()

	b.NotPanics(func() { b.readColumn(compress.Codecs.Uncompressed) })
}

// TestLimitStats checks that stats are still recorded when MaxStatsSize is
// large enough to accommodate the values.
func (b *ByteArrayWriterSuite) TestLimitStats() {
	minLen := 1024 * 4
	maxLen := 1024 * 8
	b.SetupSchema(parquet.Repetitions.Required, 1)
	colprops := parquet.DefaultColumnProperties()
	colprops.MaxStatsSize = int64(maxLen)

	writer := b.buildWriter(SmallSize, colprops, parquet.V1_0).(*file.ByteArrayColumnChunkWriter)
	b.Values = make([]parquet.ByteArray, SmallSize)
	testutils.RandomByteArray(0, b.Values.([]parquet.ByteArray), b.Buffer, minLen, maxLen)
	writer.WriteBatch(b.Values.([]parquet.ByteArray), nil, nil)
	writer.Close()

	b.True(b.metadataIsStatsSet())
}

// TestCheckDefaultStats checks that stats are set by default for ordinary
// values.
func (b *ByteArrayWriterSuite) TestCheckDefaultStats() {
	b.SetupSchema(parquet.Repetitions.Required, 1)
	writer := b.buildWriter(SmallSize, parquet.DefaultColumnProperties(), parquet.V1_0)
	b.GenerateData(SmallSize)
	b.WriteBatchValues(writer, nil, nil)
	writer.Close()

	b.True(b.metadataIsStatsSet())
}

// BooleanValueWriterSuite adds tests specific to BOOLEAN columns.
type BooleanValueWriterSuite struct {
	PrimitiveWriterTestSuite
}

// TestAlternateBooleanValues writes alternating true/false one value at a
// time (stressing the bit-packed encoder) and verifies the round trip.
func (b *BooleanValueWriterSuite) TestAlternateBooleanValues() {
	b.SetupSchema(parquet.Repetitions.Required, 1)
	writer := b.buildWriter(SmallSize, parquet.DefaultColumnProperties(), parquet.V1_0).(*file.BooleanColumnChunkWriter)
	for i := 0; i < SmallSize; i++ {
		val := i%2 == 0
		writer.WriteBatch([]bool{val}, nil, nil)
	}
	writer.Close()
	b.readColumn(compress.Codecs.Uncompressed)
	for i := 0; i < SmallSize; i++ {
		b.Equal(i%2 == 0, b.ValuesOut.([]bool)[i])
	}
}