github.com/apache/arrow/go/v7@v7.0.1/parquet/file/file_writer_test.go (about) 1 // Licensed to the Apache Software Foundation (ASF) under one 2 // or more contributor license agreements. See the NOTICE file 3 // distributed with this work for additional information 4 // regarding copyright ownership. The ASF licenses this file 5 // to you under the Apache License, Version 2.0 (the 6 // "License"); you may not use this file except in compliance 7 // with the License. You may obtain a copy of the License at 8 // 9 // http://www.apache.org/licenses/LICENSE-2.0 10 // 11 // Unless required by applicable law or agreed to in writing, software 12 // distributed under the License is distributed on an "AS IS" BASIS, 13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 // See the License for the specific language governing permissions and 15 // limitations under the License. 16 17 package file_test 18 19 import ( 20 "bytes" 21 "reflect" 22 "testing" 23 24 "github.com/apache/arrow/go/v7/arrow/memory" 25 "github.com/apache/arrow/go/v7/parquet" 26 "github.com/apache/arrow/go/v7/parquet/compress" 27 "github.com/apache/arrow/go/v7/parquet/file" 28 "github.com/apache/arrow/go/v7/parquet/internal/encoding" 29 "github.com/apache/arrow/go/v7/parquet/internal/testutils" 30 "github.com/apache/arrow/go/v7/parquet/schema" 31 "github.com/stretchr/testify/assert" 32 "github.com/stretchr/testify/suite" 33 ) 34 35 type SerializeTestSuite struct { 36 testutils.PrimitiveTypedTest 37 suite.Suite 38 39 numCols int 40 numRowGroups int 41 rowsPerRG int 42 rowsPerBatch int 43 } 44 45 func (t *SerializeTestSuite) SetupTest() { 46 t.numCols = 4 47 t.numRowGroups = 4 48 t.rowsPerRG = 50 49 t.rowsPerBatch = 10 50 t.SetupSchema(parquet.Repetitions.Optional, t.numCols) 51 } 52 53 func (t *SerializeTestSuite) fileSerializeTest(codec compress.Compression, expected compress.Compression) { 54 sink := encoding.NewBufferWriter(0, memory.DefaultAllocator) 55 56 opts := make([]parquet.WriterProperty, 0) 57 for i := 0; i < t.numCols; i++ { 58 opts = append(opts, parquet.WithCompressionFor(t.Schema.Column(i).Name(), codec)) 59 } 60 61 props := parquet.NewWriterProperties(opts...) 62 63 writer := file.NewParquetWriter(sink, t.Schema.Root(), file.WithWriterProps(props)) 64 t.GenerateData(int64(t.rowsPerRG)) 65 for rg := 0; rg < t.numRowGroups/2; rg++ { 66 rgw := writer.AppendRowGroup() 67 for col := 0; col < t.numCols; col++ { 68 cw, _ := rgw.NextColumn() 69 t.WriteBatchValues(cw, t.DefLevels, nil) 70 cw.Close() 71 // ensure column() api which is specific to bufferedrowgroups cannot be called 72 t.Panics(func() { rgw.(file.BufferedRowGroupWriter).Column(col) }) 73 } 74 rgw.Close() 75 } 76 77 // write half buffered row groups 78 for rg := 0; rg < t.numRowGroups/2; rg++ { 79 rgw := writer.AppendBufferedRowGroup() 80 for batch := 0; batch < (t.rowsPerRG / t.rowsPerBatch); batch++ { 81 for col := 0; col < t.numCols; col++ { 82 cw, _ := rgw.Column(col) 83 offset := batch * t.rowsPerBatch 84 t.WriteBatchSubset(t.rowsPerBatch, offset, cw, t.DefLevels[offset:t.rowsPerBatch+offset], nil) 85 // Ensure NextColumn api which is specific to RowGroup cannot be called 86 t.Panics(func() { rgw.(file.SerialRowGroupWriter).NextColumn() }) 87 } 88 } 89 for col := 0; col < t.numCols; col++ { 90 cw, _ := rgw.Column(col) 91 cw.Close() 92 } 93 rgw.Close() 94 } 95 writer.Close() 96 97 nrows := t.numRowGroups * t.rowsPerRG 98 reader, err := file.NewParquetReader(bytes.NewReader(sink.Bytes())) 99 t.NoError(err) 100 t.Equal(t.numCols, reader.MetaData().Schema.NumColumns()) 101 t.Equal(t.numRowGroups, reader.NumRowGroups()) 102 t.EqualValues(nrows, reader.NumRows()) 103 104 for rg := 0; rg < t.numRowGroups; rg++ { 105 rgr := reader.RowGroup(rg) 106 t.Equal(t.numCols, rgr.NumColumns()) 107 t.EqualValues(t.rowsPerRG, rgr.NumRows()) 108 chunk, _ := rgr.MetaData().ColumnChunk(0) 109 t.Equal(expected, chunk.Compression()) 110 111 valuesRead := int64(0) 112 113 for i := 0; i < t.numCols; i++ { 114 chunk, _ := rgr.MetaData().ColumnChunk(i) 115 t.False(chunk.HasIndexPage()) 116 t.DefLevelsOut = make([]int16, t.rowsPerRG) 117 t.RepLevelsOut = make([]int16, t.rowsPerRG) 118 colReader := rgr.Column(i) 119 t.SetupValuesOut(int64(t.rowsPerRG)) 120 valuesRead = t.ReadBatch(colReader, int64(t.rowsPerRG), 0, t.DefLevelsOut, t.RepLevelsOut) 121 t.EqualValues(t.rowsPerRG, valuesRead) 122 t.Equal(t.Values, t.ValuesOut) 123 t.Equal(t.DefLevels, t.DefLevelsOut) 124 } 125 } 126 } 127 128 func (t *SerializeTestSuite) unequalNumRows(maxRows int64, rowsPerCol []int64) { 129 sink := encoding.NewBufferWriter(0, memory.DefaultAllocator) 130 props := parquet.NewWriterProperties() 131 writer := file.NewParquetWriter(sink, t.Schema.Root(), file.WithWriterProps(props)) 132 defer writer.Close() 133 134 rgw := writer.AppendRowGroup() 135 t.GenerateData(maxRows) 136 for col := 0; col < t.numCols; col++ { 137 cw, _ := rgw.NextColumn() 138 t.WriteBatchSubset(int(rowsPerCol[col]), 0, cw, t.DefLevels[:rowsPerCol[col]], nil) 139 cw.Close() 140 } 141 t.Error(rgw.Close()) 142 } 143 144 func (t *SerializeTestSuite) unequalNumRowsBuffered(maxRows int64, rowsPerCol []int64) { 145 sink := encoding.NewBufferWriter(0, memory.DefaultAllocator) 146 writer := file.NewParquetWriter(sink, t.Schema.Root()) 147 defer writer.Close() 148 149 rgw := writer.AppendBufferedRowGroup() 150 t.GenerateData(maxRows) 151 for col := 0; col < t.numCols; col++ { 152 cw, _ := rgw.Column(col) 153 t.WriteBatchSubset(int(rowsPerCol[col]), 0, cw, t.DefLevels[:rowsPerCol[col]], nil) 154 cw.Close() 155 } 156 t.Error(rgw.Close()) 157 } 158 159 func (t *SerializeTestSuite) TestZeroRows() { 160 t.NotPanics(func() { 161 sink := encoding.NewBufferWriter(0, memory.DefaultAllocator) 162 writer := file.NewParquetWriter(sink, t.Schema.Root()) 163 defer writer.Close() 164 165 srgw := writer.AppendRowGroup() 166 for col := 0; col < t.numCols; col++ { 167 cw, _ := srgw.NextColumn() 168 cw.Close() 169 } 170 srgw.Close() 171 172 brgw := writer.AppendBufferedRowGroup() 173 for col := 0; col < t.numCols; col++ { 174 cw, _ := brgw.Column(col) 175 cw.Close() 176 } 177 brgw.Close() 178 }) 179 } 180 181 func (t *SerializeTestSuite) TestTooManyColumns() { 182 t.SetupSchema(parquet.Repetitions.Optional, 1) 183 sink := encoding.NewBufferWriter(0, memory.DefaultAllocator) 184 writer := file.NewParquetWriter(sink, t.Schema.Root()) 185 rgw := writer.AppendRowGroup() 186 187 rgw.NextColumn() // first column 188 t.Panics(func() { rgw.NextColumn() }) // only one column! 189 } 190 191 func (t *SerializeTestSuite) TestRepeatedTooFewRows() { 192 // optional and repeated, so definition and repetition levels 193 t.SetupSchema(parquet.Repetitions.Repeated, 1) 194 const nrows = 100 195 t.GenerateData(nrows) 196 197 sink := encoding.NewBufferWriter(0, memory.DefaultAllocator) 198 writer := file.NewParquetWriter(sink, t.Schema.Root()) 199 200 rgw := writer.AppendRowGroup() 201 t.RepLevels = make([]int16, nrows) 202 for idx := range t.RepLevels { 203 t.RepLevels[idx] = 0 204 } 205 206 cw, _ := rgw.NextColumn() 207 t.WriteBatchValues(cw, t.DefLevels, t.RepLevels) 208 cw.Close() 209 210 t.RepLevels[3] = 1 // this makes it so that values 2 and 3 are a single row 211 // as a result there's one too few rows in the result 212 213 t.Panics(func() { 214 cw, _ = rgw.NextColumn() 215 t.WriteBatchValues(cw, t.DefLevels, t.RepLevels) 216 cw.Close() 217 }) 218 } 219 220 func (t *SerializeTestSuite) TestTooFewRows() { 221 rowsPerCol := []int64{100, 100, 100, 99} 222 t.NotPanics(func() { t.unequalNumRows(100, rowsPerCol) }) 223 t.NotPanics(func() { t.unequalNumRowsBuffered(100, rowsPerCol) }) 224 } 225 226 func (t *SerializeTestSuite) TestTooManyRows() { 227 rowsPerCol := []int64{100, 100, 100, 101} 228 t.NotPanics(func() { t.unequalNumRows(101, rowsPerCol) }) 229 t.NotPanics(func() { t.unequalNumRowsBuffered(101, rowsPerCol) }) 230 } 231 232 func (t *SerializeTestSuite) TestSmallFile() { 233 codecs := []compress.Compression{ 234 compress.Codecs.Uncompressed, 235 compress.Codecs.Snappy, 236 compress.Codecs.Brotli, 237 compress.Codecs.Gzip, 238 compress.Codecs.Zstd, 239 // compress.Codecs.Lz4, 240 // compress.Codecs.Lzo, 241 } 242 for _, c := range codecs { 243 t.Run(c.String(), func() { 244 t.NotPanics(func() { t.fileSerializeTest(c, c) }) 245 }) 246 } 247 } 248 249 func TestBufferedDisabledDictionary(t *testing.T) { 250 sink := encoding.NewBufferWriter(0, memory.DefaultAllocator) 251 fields := schema.FieldList{schema.NewInt32Node("col", parquet.Repetitions.Required, 1)} 252 sc, _ := schema.NewGroupNode("schema", parquet.Repetitions.Required, fields, 0) 253 props := parquet.NewWriterProperties(parquet.WithDictionaryDefault(false)) 254 255 writer := file.NewParquetWriter(sink, sc, file.WithWriterProps(props)) 256 rgw := writer.AppendBufferedRowGroup() 257 cwr, _ := rgw.Column(0) 258 cw := cwr.(*file.Int32ColumnChunkWriter) 259 cw.WriteBatch([]int32{1}, nil, nil) 260 rgw.Close() 261 writer.Close() 262 263 buffer := sink.Finish() 264 defer buffer.Release() 265 reader, err := file.NewParquetReader(bytes.NewReader(buffer.Bytes())) 266 assert.NoError(t, err) 267 assert.EqualValues(t, 1, reader.NumRowGroups()) 268 rgReader := reader.RowGroup(0) 269 assert.EqualValues(t, 1, rgReader.NumRows()) 270 chunk, _ := rgReader.MetaData().ColumnChunk(0) 271 assert.False(t, chunk.HasDictionaryPage()) 272 } 273 274 func TestBufferedMultiPageDisabledDictionary(t *testing.T) { 275 const ( 276 valueCount = 10000 277 pageSize = 16384 278 ) 279 var ( 280 sink = encoding.NewBufferWriter(0, memory.DefaultAllocator) 281 props = parquet.NewWriterProperties(parquet.WithDictionaryDefault(false), parquet.WithDataPageSize(pageSize)) 282 sc, _ = schema.NewGroupNode("schema", parquet.Repetitions.Required, schema.FieldList{ 283 schema.NewInt32Node("col", parquet.Repetitions.Required, -1), 284 }, -1) 285 ) 286 287 writer := file.NewParquetWriter(sink, sc, file.WithWriterProps(props)) 288 rgWriter := writer.AppendBufferedRowGroup() 289 cwr, _ := rgWriter.Column(0) 290 cw := cwr.(*file.Int32ColumnChunkWriter) 291 valuesIn := make([]int32, 0, valueCount) 292 for i := int32(0); i < valueCount; i++ { 293 valuesIn = append(valuesIn, (i%100)+1) 294 } 295 cw.WriteBatch(valuesIn, nil, nil) 296 rgWriter.Close() 297 writer.Close() 298 buffer := sink.Finish() 299 defer buffer.Release() 300 301 reader, err := file.NewParquetReader(bytes.NewReader(buffer.Bytes())) 302 assert.NoError(t, err) 303 304 assert.EqualValues(t, 1, reader.NumRowGroups()) 305 valuesOut := make([]int32, valueCount) 306 307 for r := 0; r < reader.NumRowGroups(); r++ { 308 rgr := reader.RowGroup(r) 309 assert.EqualValues(t, 1, rgr.NumColumns()) 310 assert.EqualValues(t, valueCount, rgr.NumRows()) 311 312 var totalRead int64 313 colReader := rgr.Column(0).(*file.Int32ColumnChunkReader) 314 for colReader.HasNext() { 315 total, _, _ := colReader.ReadBatch(valueCount-totalRead, valuesOut[totalRead:], nil, nil) 316 totalRead += total 317 } 318 assert.EqualValues(t, valueCount, totalRead) 319 assert.Equal(t, valuesIn, valuesOut) 320 } 321 } 322 323 func TestAllNulls(t *testing.T) { 324 sc, _ := schema.NewGroupNode("root", parquet.Repetitions.Required, schema.FieldList{ 325 schema.NewInt32Node("nulls", parquet.Repetitions.Optional, -1), 326 }, -1) 327 sink := encoding.NewBufferWriter(0, memory.DefaultAllocator) 328 329 writer := file.NewParquetWriter(sink, sc) 330 rgw := writer.AppendRowGroup() 331 cwr, _ := rgw.NextColumn() 332 cw := cwr.(*file.Int32ColumnChunkWriter) 333 334 var ( 335 values [3]int32 336 defLevels = [...]int16{0, 0, 0} 337 ) 338 339 cw.WriteBatch(values[:], defLevels[:], nil) 340 cw.Close() 341 rgw.Close() 342 writer.Close() 343 344 buffer := sink.Finish() 345 defer buffer.Release() 346 props := parquet.NewReaderProperties(memory.DefaultAllocator) 347 props.BufferedStreamEnabled = true 348 349 reader, err := file.NewParquetReader(bytes.NewReader(buffer.Bytes()), file.WithReadProps(props)) 350 assert.NoError(t, err) 351 352 rgr := reader.RowGroup(0) 353 cr := rgr.Column(0).(*file.Int32ColumnChunkReader) 354 355 defLevels[0] = -1 356 defLevels[1] = -1 357 defLevels[2] = -1 358 valRead, read, _ := cr.ReadBatch(3, values[:], defLevels[:], nil) 359 assert.EqualValues(t, 3, valRead) 360 assert.EqualValues(t, 0, read) 361 assert.Equal(t, []int16{0, 0, 0}, defLevels[:]) 362 } 363 364 func createSerializeTestSuite(typ reflect.Type) suite.TestingSuite { 365 return &SerializeTestSuite{PrimitiveTypedTest: testutils.NewPrimitiveTypedTest(typ)} 366 } 367 368 func TestSerialize(t *testing.T) { 369 t.Parallel() 370 types := []struct { 371 typ reflect.Type 372 }{ 373 {reflect.TypeOf(true)}, 374 {reflect.TypeOf(int32(0))}, 375 {reflect.TypeOf(int64(0))}, 376 {reflect.TypeOf(float32(0))}, 377 {reflect.TypeOf(float64(0))}, 378 {reflect.TypeOf(parquet.Int96{})}, 379 {reflect.TypeOf(parquet.ByteArray{})}, 380 } 381 for _, tt := range types { 382 tt := tt 383 t.Run(tt.typ.String(), func(t *testing.T) { 384 t.Parallel() 385 suite.Run(t, createSerializeTestSuite(tt.typ)) 386 }) 387 } 388 }