github.com/apache/arrow/go/v10@v10.0.1/parquet/file/file_writer_test.go

// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package file_test

import (
	"bytes"
	"reflect"
	"testing"

	"github.com/apache/arrow/go/v10/arrow/memory"
	"github.com/apache/arrow/go/v10/parquet"
	"github.com/apache/arrow/go/v10/parquet/compress"
	"github.com/apache/arrow/go/v10/parquet/file"
	"github.com/apache/arrow/go/v10/parquet/internal/encoding"
	"github.com/apache/arrow/go/v10/parquet/internal/testutils"
	"github.com/apache/arrow/go/v10/parquet/schema"
	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/suite"
)

type SerializeTestSuite struct {
	testutils.PrimitiveTypedTest
	suite.Suite

	numCols      int
	numRowGroups int
	rowsPerRG    int
	rowsPerBatch int
}

func (t *SerializeTestSuite) SetupTest() {
	t.numCols = 4
	t.numRowGroups = 4
	t.rowsPerRG = 50
	t.rowsPerBatch = 10
	t.SetupSchema(parquet.Repetitions.Optional, t.numCols)
}

func (t *SerializeTestSuite) fileSerializeTest(codec compress.Compression, expected compress.Compression) {
	sink := encoding.NewBufferWriter(0, memory.DefaultAllocator)

	opts := make([]parquet.WriterProperty, 0)
	for i := 0; i < t.numCols; i++ {
		opts = append(opts, parquet.WithCompressionFor(t.Schema.Column(i).Name(), codec))
	}

	props := parquet.NewWriterProperties(opts...)
	writer := file.NewParquetWriter(sink, t.Schema.Root(), file.WithWriterProps(props))
	t.GenerateData(int64(t.rowsPerRG))

	// Write the first half of the row groups through the serial API.
	for rg := 0; rg < t.numRowGroups/2; rg++ {
		rgw := writer.AppendRowGroup()
		for col := 0; col < t.numCols; col++ {
			cw, _ := rgw.NextColumn()
			t.WriteBatchValues(cw, t.DefLevels, nil)
			cw.Close()
			// Ensure the Column API, which is specific to buffered row group
			// writers, cannot be called on a serial row group writer.
			t.Panics(func() { rgw.(file.BufferedRowGroupWriter).Column(col) })
		}
		rgw.Close()
	}

	// Write the second half of the row groups through the buffered API.
	for rg := 0; rg < t.numRowGroups/2; rg++ {
		rgw := writer.AppendBufferedRowGroup()
		for batch := 0; batch < (t.rowsPerRG / t.rowsPerBatch); batch++ {
			for col := 0; col < t.numCols; col++ {
				cw, _ := rgw.Column(col)
				offset := batch * t.rowsPerBatch
				t.WriteBatchSubset(t.rowsPerBatch, offset, cw, t.DefLevels[offset:t.rowsPerBatch+offset], nil)
				// Ensure the NextColumn API, which is specific to serial row
				// group writers, cannot be called on a buffered row group writer.
				t.Panics(func() { rgw.(file.SerialRowGroupWriter).NextColumn() })
			}
		}
		for col := 0; col < t.numCols; col++ {
			cw, _ := rgw.Column(col)
			cw.Close()
		}
		rgw.Close()
	}
	writer.Close()

	nrows := t.numRowGroups * t.rowsPerRG
	reader, err := file.NewParquetReader(bytes.NewReader(sink.Bytes()))
	t.NoError(err)
	t.Equal(t.numCols, reader.MetaData().Schema.NumColumns())
	t.Equal(t.numRowGroups, reader.NumRowGroups())
	t.EqualValues(nrows, reader.NumRows())

	for rg := 0; rg < t.numRowGroups; rg++ {
		rgr := reader.RowGroup(rg)
		t.Equal(t.numCols, rgr.NumColumns())
		t.EqualValues(t.rowsPerRG, rgr.NumRows())
		chunk, _ := rgr.MetaData().ColumnChunk(0)
		t.Equal(expected, chunk.Compression())

		valuesRead := int64(0)

		for i := 0; i < t.numCols; i++ {
			chunk, _ := rgr.MetaData().ColumnChunk(i)
			t.False(chunk.HasIndexPage())
			t.DefLevelsOut = make([]int16, t.rowsPerRG)
			t.RepLevelsOut = make([]int16, t.rowsPerRG)
			colReader, err := rgr.Column(i)
			t.NoError(err)
			t.SetupValuesOut(int64(t.rowsPerRG))
			valuesRead = t.ReadBatch(colReader, int64(t.rowsPerRG), 0, t.DefLevelsOut, t.RepLevelsOut)
			t.EqualValues(t.rowsPerRG, valuesRead)
			t.Equal(t.Values, t.ValuesOut)
			t.Equal(t.DefLevels, t.DefLevelsOut)
		}
	}
}

func (t *SerializeTestSuite) unequalNumRows(maxRows int64, rowsPerCol []int64) {
	sink := encoding.NewBufferWriter(0, memory.DefaultAllocator)
	props := parquet.NewWriterProperties()
	writer := file.NewParquetWriter(sink, t.Schema.Root(), file.WithWriterProps(props))
	defer writer.Close()

	rgw := writer.AppendRowGroup()
	t.GenerateData(maxRows)
	for col := 0; col < t.numCols; col++ {
		cw, _ := rgw.NextColumn()
		t.WriteBatchSubset(int(rowsPerCol[col]), 0, cw, t.DefLevels[:rowsPerCol[col]], nil)
		cw.Close()
	}
	// Closing the row group must fail because the columns have unequal row counts.
	t.Error(rgw.Close())
}

func (t *SerializeTestSuite) unequalNumRowsBuffered(maxRows int64, rowsPerCol []int64) {
	sink := encoding.NewBufferWriter(0, memory.DefaultAllocator)
	writer := file.NewParquetWriter(sink, t.Schema.Root())
	defer writer.Close()

	rgw := writer.AppendBufferedRowGroup()
	t.GenerateData(maxRows)
	for col := 0; col < t.numCols; col++ {
		cw, _ := rgw.Column(col)
		t.WriteBatchSubset(int(rowsPerCol[col]), 0, cw, t.DefLevels[:rowsPerCol[col]], nil)
		cw.Close()
	}
	t.Error(rgw.Close())
}

func (t *SerializeTestSuite) TestZeroRows() {
	t.NotPanics(func() {
		sink := encoding.NewBufferWriter(0, memory.DefaultAllocator)
		writer := file.NewParquetWriter(sink, t.Schema.Root())
		defer writer.Close()

		srgw := writer.AppendRowGroup()
		for col := 0; col < t.numCols; col++ {
			cw, _ := srgw.NextColumn()
			cw.Close()
		}
		srgw.Close()

		brgw := writer.AppendBufferedRowGroup()
		for col := 0; col < t.numCols; col++ {
			cw, _ := brgw.Column(col)
			cw.Close()
		}
		brgw.Close()
	})
}

func (t *SerializeTestSuite) TestTooManyColumns() {
	t.SetupSchema(parquet.Repetitions.Optional, 1)
	sink := encoding.NewBufferWriter(0, memory.DefaultAllocator)
	writer := file.NewParquetWriter(sink, t.Schema.Root())
	rgw := writer.AppendRowGroup()

	rgw.NextColumn()                      // first column
	t.Panics(func() { rgw.NextColumn() }) // only one column!
}

func (t *SerializeTestSuite) TestRepeatedTooFewRows() {
	// A repeated field carries both definition and repetition levels.
	t.SetupSchema(parquet.Repetitions.Repeated, 1)
	const nrows = 100
	t.GenerateData(nrows)

	sink := encoding.NewBufferWriter(0, memory.DefaultAllocator)
	writer := file.NewParquetWriter(sink, t.Schema.Root())

	rgw := writer.AppendRowGroup()
	t.RepLevels = make([]int16, nrows)
	for idx := range t.RepLevels {
		t.RepLevels[idx] = 0
	}

	cw, _ := rgw.NextColumn()
	t.WriteBatchValues(cw, t.DefLevels, t.RepLevels)
	cw.Close()

	// A repetition level of 1 makes values 2 and 3 part of a single row,
	// leaving this column one row short of the previous one.
	t.RepLevels[3] = 1

	t.Panics(func() {
		cw, _ = rgw.NextColumn()
		t.WriteBatchValues(cw, t.DefLevels, t.RepLevels)
		cw.Close()
	})
}

func (t *SerializeTestSuite) TestTooFewRows() {
	rowsPerCol := []int64{100, 100, 100, 99}
	t.NotPanics(func() { t.unequalNumRows(100, rowsPerCol) })
	t.NotPanics(func() { t.unequalNumRowsBuffered(100, rowsPerCol) })
}

func (t *SerializeTestSuite) TestTooManyRows() {
	rowsPerCol := []int64{100, 100, 100, 101}
	t.NotPanics(func() { t.unequalNumRows(101, rowsPerCol) })
	t.NotPanics(func() { t.unequalNumRowsBuffered(101, rowsPerCol) })
}

func (t *SerializeTestSuite) TestSmallFile() {
	codecs := []compress.Compression{
		compress.Codecs.Uncompressed,
		compress.Codecs.Snappy,
		compress.Codecs.Brotli,
		compress.Codecs.Gzip,
		compress.Codecs.Zstd,
		// compress.Codecs.Lz4,
		// compress.Codecs.Lzo,
	}
	for _, c := range codecs {
		t.Run(c.String(), func() {
			t.NotPanics(func() { t.fileSerializeTest(c, c) })
		})
	}
}

func TestBufferedDisabledDictionary(t *testing.T) {
	sink := encoding.NewBufferWriter(0, memory.DefaultAllocator)
	fields := schema.FieldList{schema.NewInt32Node("col", parquet.Repetitions.Required, 1)}
	sc, _ := schema.NewGroupNode("schema", parquet.Repetitions.Required, fields, 0)
	props := parquet.NewWriterProperties(parquet.WithDictionaryDefault(false))

	writer := file.NewParquetWriter(sink, sc, file.WithWriterProps(props))
	rgw := writer.AppendBufferedRowGroup()
	cwr, _ := rgw.Column(0)
	cw := cwr.(*file.Int32ColumnChunkWriter)
	cw.WriteBatch([]int32{1}, nil, nil)
	rgw.Close()
	writer.Close()

	buffer := sink.Finish()
	defer buffer.Release()
	reader, err := file.NewParquetReader(bytes.NewReader(buffer.Bytes()))
	assert.NoError(t, err)
	assert.EqualValues(t, 1, reader.NumRowGroups())
	rgReader := reader.RowGroup(0)
	assert.EqualValues(t, 1, rgReader.NumRows())
	chunk, _ := rgReader.MetaData().ColumnChunk(0)
	assert.False(t, chunk.HasDictionaryPage())
}

func TestBufferedMultiPageDisabledDictionary(t *testing.T) {
	const (
		valueCount = 10000
		pageSize   = 16384
	)
	var (
		sink  = encoding.NewBufferWriter(0, memory.DefaultAllocator)
		props = parquet.NewWriterProperties(parquet.WithDictionaryDefault(false), parquet.WithDataPageSize(pageSize))
		sc, _ = schema.NewGroupNode("schema", parquet.Repetitions.Required, schema.FieldList{
			schema.NewInt32Node("col", parquet.Repetitions.Required, -1),
		}, -1)
	)

	writer := file.NewParquetWriter(sink, sc, file.WithWriterProps(props))
	rgWriter := writer.AppendBufferedRowGroup()
	cwr, _ := rgWriter.Column(0)
	cw := cwr.(*file.Int32ColumnChunkWriter)
	valuesIn := make([]int32, 0, valueCount)
	for i := int32(0); i < valueCount; i++ {
		valuesIn = append(valuesIn, (i%100)+1)
	}
	cw.WriteBatch(valuesIn, nil, nil)
	rgWriter.Close()
	writer.Close()
	buffer := sink.Finish()
	defer buffer.Release()

	reader, err := file.NewParquetReader(bytes.NewReader(buffer.Bytes()))
	assert.NoError(t, err)

	assert.EqualValues(t, 1, reader.NumRowGroups())
	valuesOut := make([]int32, valueCount)

	for r := 0; r < reader.NumRowGroups(); r++ {
		rgr := reader.RowGroup(r)
		assert.EqualValues(t, 1, rgr.NumColumns())
		assert.EqualValues(t, valueCount, rgr.NumRows())

		var totalRead int64
		col, err := rgr.Column(0)
		assert.NoError(t, err)
		colReader := col.(*file.Int32ColumnChunkReader)
		for colReader.HasNext() {
			total, _, _ := colReader.ReadBatch(valueCount-totalRead, valuesOut[totalRead:], nil, nil)
			totalRead += total
		}
		assert.EqualValues(t, valueCount, totalRead)
		assert.Equal(t, valuesIn, valuesOut)
	}
}

func TestAllNulls(t *testing.T) {
	sc, _ := schema.NewGroupNode("root", parquet.Repetitions.Required, schema.FieldList{
		schema.NewInt32Node("nulls", parquet.Repetitions.Optional, -1),
	}, -1)
	sink := encoding.NewBufferWriter(0, memory.DefaultAllocator)

	writer := file.NewParquetWriter(sink, sc)
	rgw := writer.AppendRowGroup()
	cwr, _ := rgw.NextColumn()
	cw := cwr.(*file.Int32ColumnChunkWriter)

	var (
		values    [3]int32
		defLevels = [...]int16{0, 0, 0}
	)

	cw.WriteBatch(values[:], defLevels[:], nil)
	cw.Close()
	rgw.Close()
	writer.Close()

	buffer := sink.Finish()
	defer buffer.Release()
	props := parquet.NewReaderProperties(memory.DefaultAllocator)
	props.BufferedStreamEnabled = true

	reader, err := file.NewParquetReader(bytes.NewReader(buffer.Bytes()), file.WithReadProps(props))
	assert.NoError(t, err)

	rgr := reader.RowGroup(0)
	col, err := rgr.Column(0)
	assert.NoError(t, err)
	cr := col.(*file.Int32ColumnChunkReader)

	defLevels[0] = -1
	defLevels[1] = -1
	defLevels[2] = -1
	valRead, read, _ := cr.ReadBatch(3, values[:], defLevels[:], nil)
	assert.EqualValues(t, 3, valRead)
	assert.EqualValues(t, 0, read)
	assert.Equal(t, []int16{0, 0, 0}, defLevels[:])
}

func createSerializeTestSuite(typ reflect.Type) suite.TestingSuite {
	return &SerializeTestSuite{PrimitiveTypedTest: testutils.NewPrimitiveTypedTest(typ)}
}

func TestSerialize(t *testing.T) {
	t.Parallel()
	types := []struct {
		typ reflect.Type
	}{
		{reflect.TypeOf(true)},
		{reflect.TypeOf(int32(0))},
		{reflect.TypeOf(int64(0))},
		{reflect.TypeOf(float32(0))},
		{reflect.TypeOf(float64(0))},
		{reflect.TypeOf(parquet.Int96{})},
		{reflect.TypeOf(parquet.ByteArray{})},
	}
	for _, tt := range types {
		tt := tt
		t.Run(tt.typ.String(), func(t *testing.T) {
			t.Parallel()
			suite.Run(t, createSerializeTestSuite(tt.typ))
		})
	}
}
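
// TestExampleRoundTrip is an illustrative sketch, not part of the upstream
// suite: it distills the minimal write-then-read round trip that the tests
// above build on, using only APIs already exercised in this file. It writes
// a single required int32 column through the serial row group writer and
// reads the values back. The test name and sample values are assumptions
// made for the example.
func TestExampleRoundTrip(t *testing.T) {
	sink := encoding.NewBufferWriter(0, memory.DefaultAllocator)
	sc, _ := schema.NewGroupNode("schema", parquet.Repetitions.Required, schema.FieldList{
		schema.NewInt32Node("col", parquet.Repetitions.Required, -1),
	}, -1)

	// Write one row group containing three values through the serial API.
	writer := file.NewParquetWriter(sink, sc)
	rgw := writer.AppendRowGroup()
	cwr, _ := rgw.NextColumn()
	cw := cwr.(*file.Int32ColumnChunkWriter)
	cw.WriteBatch([]int32{1, 2, 3}, nil, nil)
	cw.Close()
	rgw.Close()
	writer.Close()

	buffer := sink.Finish()
	defer buffer.Release()

	// Read the file back and verify the values survived the round trip.
	reader, err := file.NewParquetReader(bytes.NewReader(buffer.Bytes()))
	assert.NoError(t, err)
	rgr := reader.RowGroup(0)
	col, err := rgr.Column(0)
	assert.NoError(t, err)
	cr := col.(*file.Int32ColumnChunkReader)

	valuesOut := make([]int32, 3)
	total, valuesRead, _ := cr.ReadBatch(3, valuesOut, nil, nil)
	assert.EqualValues(t, 3, total)
	assert.EqualValues(t, 3, valuesRead)
	assert.Equal(t, []int32{1, 2, 3}, valuesOut)
}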