github.com/apache/arrow/go/v10@v10.0.1/parquet/file/file_writer_test.go

// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package file_test

import (
	"bytes"
	"reflect"
	"testing"

	"github.com/apache/arrow/go/v10/arrow/memory"
	"github.com/apache/arrow/go/v10/parquet"
	"github.com/apache/arrow/go/v10/parquet/compress"
	"github.com/apache/arrow/go/v10/parquet/file"
	"github.com/apache/arrow/go/v10/parquet/internal/encoding"
	"github.com/apache/arrow/go/v10/parquet/internal/testutils"
	"github.com/apache/arrow/go/v10/parquet/schema"
	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/suite"
)

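// SerializeTestSuite round-trips generated data through both the serial and
// buffered row group writers for a single primitive physical type, then
// verifies the result by reading the file back.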
type SerializeTestSuite struct {
	testutils.PrimitiveTypedTest
	suite.Suite

	numCols      int
	numRowGroups int
	rowsPerRG    int
	rowsPerBatch int
}

func (t *SerializeTestSuite) SetupTest() {
	t.numCols = 4
	t.numRowGroups = 4
	t.rowsPerRG = 50
	t.rowsPerBatch = 10
	t.SetupSchema(parquet.Repetitions.Optional, t.numCols)
}

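// fileSerializeTest writes numRowGroups row groups (half via the serial
// writer, half via the buffered writer) with the given compression codec,
// then reads the file back and checks the row counts, compression,
// values, and definition levels.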
func (t *SerializeTestSuite) fileSerializeTest(codec compress.Compression, expected compress.Compression) {
	sink := encoding.NewBufferWriter(0, memory.DefaultAllocator)

	opts := make([]parquet.WriterProperty, 0)
	for i := 0; i < t.numCols; i++ {
		opts = append(opts, parquet.WithCompressionFor(t.Schema.Column(i).Name(), codec))
	}

	props := parquet.NewWriterProperties(opts...)

	writer := file.NewParquetWriter(sink, t.Schema.Root(), file.WithWriterProps(props))
	t.GenerateData(int64(t.rowsPerRG))
	for rg := 0; rg < t.numRowGroups/2; rg++ {
		rgw := writer.AppendRowGroup()
		for col := 0; col < t.numCols; col++ {
			cw, _ := rgw.NextColumn()
			t.WriteBatchValues(cw, t.DefLevels, nil)
			cw.Close()
			// ensure the Column() API, which is specific to buffered row group writers, cannot be called
			t.Panics(func() { rgw.(file.BufferedRowGroupWriter).Column(col) })
		}
		rgw.Close()
	}

	// write the second half of the row groups with the buffered writer
	for rg := 0; rg < t.numRowGroups/2; rg++ {
		rgw := writer.AppendBufferedRowGroup()
		for batch := 0; batch < (t.rowsPerRG / t.rowsPerBatch); batch++ {
			for col := 0; col < t.numCols; col++ {
				cw, _ := rgw.Column(col)
				offset := batch * t.rowsPerBatch
				t.WriteBatchSubset(t.rowsPerBatch, offset, cw, t.DefLevels[offset:t.rowsPerBatch+offset], nil)
				// ensure the NextColumn() API, which is specific to serial row group writers, cannot be called
				t.Panics(func() { rgw.(file.SerialRowGroupWriter).NextColumn() })
			}
		}
		for col := 0; col < t.numCols; col++ {
			cw, _ := rgw.Column(col)
			cw.Close()
		}
		rgw.Close()
	}
	writer.Close()

	nrows := t.numRowGroups * t.rowsPerRG
	reader, err := file.NewParquetReader(bytes.NewReader(sink.Bytes()))
	t.NoError(err)
	t.Equal(t.numCols, reader.MetaData().Schema.NumColumns())
	t.Equal(t.numRowGroups, reader.NumRowGroups())
	t.EqualValues(nrows, reader.NumRows())

	for rg := 0; rg < t.numRowGroups; rg++ {
		rgr := reader.RowGroup(rg)
		t.Equal(t.numCols, rgr.NumColumns())
		t.EqualValues(t.rowsPerRG, rgr.NumRows())
		chunk, _ := rgr.MetaData().ColumnChunk(0)
		t.Equal(expected, chunk.Compression())

		valuesRead := int64(0)

		for i := 0; i < t.numCols; i++ {
			chunk, _ := rgr.MetaData().ColumnChunk(i)
			t.False(chunk.HasIndexPage())
			t.DefLevelsOut = make([]int16, t.rowsPerRG)
			t.RepLevelsOut = make([]int16, t.rowsPerRG)
			colReader, err := rgr.Column(i)
			t.NoError(err)
			t.SetupValuesOut(int64(t.rowsPerRG))
			valuesRead = t.ReadBatch(colReader, int64(t.rowsPerRG), 0, t.DefLevelsOut, t.RepLevelsOut)
			t.EqualValues(t.rowsPerRG, valuesRead)
			t.Equal(t.Values, t.ValuesOut)
			t.Equal(t.DefLevels, t.DefLevelsOut)
		}
	}
}

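// unequalNumRows writes a different number of rows to each column of a
// serial row group and expects Close to report the mismatch as an error.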
func (t *SerializeTestSuite) unequalNumRows(maxRows int64, rowsPerCol []int64) {
	sink := encoding.NewBufferWriter(0, memory.DefaultAllocator)
	props := parquet.NewWriterProperties()
	writer := file.NewParquetWriter(sink, t.Schema.Root(), file.WithWriterProps(props))
	defer writer.Close()

	rgw := writer.AppendRowGroup()
	t.GenerateData(maxRows)
	for col := 0; col < t.numCols; col++ {
		cw, _ := rgw.NextColumn()
		t.WriteBatchSubset(int(rowsPerCol[col]), 0, cw, t.DefLevels[:rowsPerCol[col]], nil)
		cw.Close()
	}
	t.Error(rgw.Close())
}

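// unequalNumRowsBuffered is the buffered-writer counterpart of
// unequalNumRows: mismatched per-column row counts must error on Close.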
func (t *SerializeTestSuite) unequalNumRowsBuffered(maxRows int64, rowsPerCol []int64) {
	sink := encoding.NewBufferWriter(0, memory.DefaultAllocator)
	writer := file.NewParquetWriter(sink, t.Schema.Root())
	defer writer.Close()

	rgw := writer.AppendBufferedRowGroup()
	t.GenerateData(maxRows)
	for col := 0; col < t.numCols; col++ {
		cw, _ := rgw.Column(col)
		t.WriteBatchSubset(int(rowsPerCol[col]), 0, cw, t.DefLevels[:rowsPerCol[col]], nil)
		cw.Close()
	}
	t.Error(rgw.Close())
}

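// TestZeroRows verifies that writing row groups with no data does not panic
// for either the serial or the buffered writer.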
func (t *SerializeTestSuite) TestZeroRows() {
	t.NotPanics(func() {
		sink := encoding.NewBufferWriter(0, memory.DefaultAllocator)
		writer := file.NewParquetWriter(sink, t.Schema.Root())
		defer writer.Close()

		srgw := writer.AppendRowGroup()
		for col := 0; col < t.numCols; col++ {
			cw, _ := srgw.NextColumn()
			cw.Close()
		}
		srgw.Close()

		brgw := writer.AppendBufferedRowGroup()
		for col := 0; col < t.numCols; col++ {
			cw, _ := brgw.Column(col)
			cw.Close()
		}
		brgw.Close()
	})
}

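// TestTooManyColumns checks that requesting more columns than the schema
// defines panics.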
func (t *SerializeTestSuite) TestTooManyColumns() {
	t.SetupSchema(parquet.Repetitions.Optional, 1)
	sink := encoding.NewBufferWriter(0, memory.DefaultAllocator)
	writer := file.NewParquetWriter(sink, t.Schema.Root())
	rgw := writer.AppendRowGroup()

	rgw.NextColumn()                      // first column
	t.Panics(func() { rgw.NextColumn() }) // the schema has only one column
}

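// TestRepeatedTooFewRows verifies that repetition levels which collapse two
// values into one row cause a panic when a column ends up one row short of
// its siblings.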
func (t *SerializeTestSuite) TestRepeatedTooFewRows() {
	// a repeated field carries both definition and repetition levels
	t.SetupSchema(parquet.Repetitions.Repeated, 1)
	const nrows = 100
	t.GenerateData(nrows)

	sink := encoding.NewBufferWriter(0, memory.DefaultAllocator)
	writer := file.NewParquetWriter(sink, t.Schema.Root())

	rgw := writer.AppendRowGroup()
	t.RepLevels = make([]int16, nrows)
	for idx := range t.RepLevels {
		t.RepLevels[idx] = 0
	}

	cw, _ := rgw.NextColumn()
	t.WriteBatchValues(cw, t.DefLevels, t.RepLevels)
	cw.Close()

	t.RepLevels[3] = 1 // merges values 2 and 3 into a single row,
	// leaving this column one row short of the first one

	t.Panics(func() {
		cw, _ = rgw.NextColumn()
		t.WriteBatchValues(cw, t.DefLevels, t.RepLevels)
		cw.Close()
	})
}

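// TestTooFewRows writes one column with fewer rows than its siblings and
// expects an error (not a panic) when the row group is closed.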
func (t *SerializeTestSuite) TestTooFewRows() {
	rowsPerCol := []int64{100, 100, 100, 99}
	t.NotPanics(func() { t.unequalNumRows(100, rowsPerCol) })
	t.NotPanics(func() { t.unequalNumRowsBuffered(100, rowsPerCol) })
}

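// TestTooManyRows is the mirror case: one column carries an extra row, which
// must also surface as an error on Close rather than a panic.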
func (t *SerializeTestSuite) TestTooManyRows() {
	rowsPerCol := []int64{100, 100, 100, 101}
	t.NotPanics(func() { t.unequalNumRows(101, rowsPerCol) })
	t.NotPanics(func() { t.unequalNumRowsBuffered(101, rowsPerCol) })
}

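// TestSmallFile runs the full serialize round trip once for each codec in
// the list below.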
func (t *SerializeTestSuite) TestSmallFile() {
	codecs := []compress.Compression{
		compress.Codecs.Uncompressed,
		compress.Codecs.Snappy,
		compress.Codecs.Brotli,
		compress.Codecs.Gzip,
		compress.Codecs.Zstd,
		// compress.Codecs.Lz4,
		// compress.Codecs.Lzo,
	}
	for _, c := range codecs {
		t.Run(c.String(), func() {
			t.NotPanics(func() { t.fileSerializeTest(c, c) })
		})
	}
}

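// TestBufferedDisabledDictionary writes a single value with dictionary
// encoding disabled and confirms the resulting column chunk has no
// dictionary page.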
func TestBufferedDisabledDictionary(t *testing.T) {
	sink := encoding.NewBufferWriter(0, memory.DefaultAllocator)
	fields := schema.FieldList{schema.NewInt32Node("col", parquet.Repetitions.Required, 1)}
	sc, _ := schema.NewGroupNode("schema", parquet.Repetitions.Required, fields, 0)
	props := parquet.NewWriterProperties(parquet.WithDictionaryDefault(false))

	writer := file.NewParquetWriter(sink, sc, file.WithWriterProps(props))
	rgw := writer.AppendBufferedRowGroup()
	cwr, _ := rgw.Column(0)
	cw := cwr.(*file.Int32ColumnChunkWriter)
	cw.WriteBatch([]int32{1}, nil, nil)
	rgw.Close()
	writer.Close()

	buffer := sink.Finish()
	defer buffer.Release()
	reader, err := file.NewParquetReader(bytes.NewReader(buffer.Bytes()))
	assert.NoError(t, err)
	assert.EqualValues(t, 1, reader.NumRowGroups())
	rgReader := reader.RowGroup(0)
	assert.EqualValues(t, 1, rgReader.NumRows())
	chunk, _ := rgReader.MetaData().ColumnChunk(0)
	assert.False(t, chunk.HasDictionaryPage())
}

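// TestBufferedMultiPageDisabledDictionary writes enough plain-encoded values
// to span multiple data pages within one buffered row group, then reads them
// all back and compares against the input.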
func TestBufferedMultiPageDisabledDictionary(t *testing.T) {
	const (
		valueCount = 10000
		pageSize   = 16384
	)
	var (
		sink  = encoding.NewBufferWriter(0, memory.DefaultAllocator)
		props = parquet.NewWriterProperties(parquet.WithDictionaryDefault(false), parquet.WithDataPageSize(pageSize))
		sc, _ = schema.NewGroupNode("schema", parquet.Repetitions.Required, schema.FieldList{
			schema.NewInt32Node("col", parquet.Repetitions.Required, -1),
		}, -1)
	)

	writer := file.NewParquetWriter(sink, sc, file.WithWriterProps(props))
	rgWriter := writer.AppendBufferedRowGroup()
	cwr, _ := rgWriter.Column(0)
	cw := cwr.(*file.Int32ColumnChunkWriter)
	valuesIn := make([]int32, 0, valueCount)
	for i := int32(0); i < valueCount; i++ {
		valuesIn = append(valuesIn, (i%100)+1)
	}
	cw.WriteBatch(valuesIn, nil, nil)
	rgWriter.Close()
	writer.Close()
	buffer := sink.Finish()
	defer buffer.Release()

	reader, err := file.NewParquetReader(bytes.NewReader(buffer.Bytes()))
	assert.NoError(t, err)

	assert.EqualValues(t, 1, reader.NumRowGroups())
	valuesOut := make([]int32, valueCount)

	for r := 0; r < reader.NumRowGroups(); r++ {
		rgr := reader.RowGroup(r)
		assert.EqualValues(t, 1, rgr.NumColumns())
		assert.EqualValues(t, valueCount, rgr.NumRows())

		var totalRead int64
		col, err := rgr.Column(0)
		assert.NoError(t, err)
		colReader := col.(*file.Int32ColumnChunkReader)
		for colReader.HasNext() {
			total, _, _ := colReader.ReadBatch(valueCount-totalRead, valuesOut[totalRead:], nil, nil)
			totalRead += total
		}
		assert.EqualValues(t, valueCount, totalRead)
		assert.Equal(t, valuesIn, valuesOut)
	}
}

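// TestAllNulls writes a batch where every value is null (definition level 0)
// and checks that reading with buffered streams enabled reports zero values
// read and returns the expected definition levels.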
func TestAllNulls(t *testing.T) {
	sc, _ := schema.NewGroupNode("root", parquet.Repetitions.Required, schema.FieldList{
		schema.NewInt32Node("nulls", parquet.Repetitions.Optional, -1),
	}, -1)
	sink := encoding.NewBufferWriter(0, memory.DefaultAllocator)

	writer := file.NewParquetWriter(sink, sc)
	rgw := writer.AppendRowGroup()
	cwr, _ := rgw.NextColumn()
	cw := cwr.(*file.Int32ColumnChunkWriter)

	var (
		values    [3]int32
		defLevels = [...]int16{0, 0, 0}
	)

	cw.WriteBatch(values[:], defLevels[:], nil)
	cw.Close()
	rgw.Close()
	writer.Close()

	buffer := sink.Finish()
	defer buffer.Release()
	props := parquet.NewReaderProperties(memory.DefaultAllocator)
	props.BufferedStreamEnabled = true

	reader, err := file.NewParquetReader(bytes.NewReader(buffer.Bytes()), file.WithReadProps(props))
	assert.NoError(t, err)

	rgr := reader.RowGroup(0)
	col, err := rgr.Column(0)
	assert.NoError(t, err)
	cr := col.(*file.Int32ColumnChunkReader)

	// poison the level buffer so we can tell ReadBatch overwrote it
	defLevels[0] = -1
	defLevels[1] = -1
	defLevels[2] = -1
	valRead, read, _ := cr.ReadBatch(3, values[:], defLevels[:], nil)
	assert.EqualValues(t, 3, valRead)
	assert.EqualValues(t, 0, read)
	assert.Equal(t, []int16{0, 0, 0}, defLevels[:])
}

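// createSerializeTestSuite builds a SerializeTestSuite for the given
// primitive Go type, e.g. int32 or parquet.ByteArray.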
func createSerializeTestSuite(typ reflect.Type) suite.TestingSuite {
	return &SerializeTestSuite{PrimitiveTypedTest: testutils.NewPrimitiveTypedTest(typ)}
}

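// TestSerialize runs the suite in parallel once per supported primitive
// physical type.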
func TestSerialize(t *testing.T) {
	t.Parallel()
	types := []struct {
		typ reflect.Type
	}{
		{reflect.TypeOf(true)},
		{reflect.TypeOf(int32(0))},
		{reflect.TypeOf(int64(0))},
		{reflect.TypeOf(float32(0))},
		{reflect.TypeOf(float64(0))},
		{reflect.TypeOf(parquet.Int96{})},
		{reflect.TypeOf(parquet.ByteArray{})},
	}
	for _, tt := range types {
		tt := tt
		t.Run(tt.typ.String(), func(t *testing.T) {
			t.Parallel()
			suite.Run(t, createSerializeTestSuite(tt.typ))
		})
	}
}