github.com/apache/arrow/go/v7@v7.0.1/parquet/file/file_writer_test.go

// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package file_test

import (
	"bytes"
	"reflect"
	"testing"

	"github.com/apache/arrow/go/v7/arrow/memory"
	"github.com/apache/arrow/go/v7/parquet"
	"github.com/apache/arrow/go/v7/parquet/compress"
	"github.com/apache/arrow/go/v7/parquet/file"
	"github.com/apache/arrow/go/v7/parquet/internal/encoding"
	"github.com/apache/arrow/go/v7/parquet/internal/testutils"
	"github.com/apache/arrow/go/v7/parquet/schema"
	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/suite"
)

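// SerializeTestSuite round-trips generated data through the Parquet file
// writer and reader for a single primitive type, covering both the serial
// and buffered row group writers.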
type SerializeTestSuite struct {
	testutils.PrimitiveTypedTest
	suite.Suite

	numCols      int
	numRowGroups int
	rowsPerRG    int
	rowsPerBatch int
}

func (t *SerializeTestSuite) SetupTest() {
	t.numCols = 4
	t.numRowGroups = 4
	t.rowsPerRG = 50
	t.rowsPerBatch = 10
	t.SetupSchema(parquet.Repetitions.Optional, t.numCols)
}

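// fileSerializeTest writes half of the row groups serially and half buffered,
// using the given codec for every column, then reads the file back and
// verifies row counts, values, definition levels, and the compression
// recorded in the column chunk metadata.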
func (t *SerializeTestSuite) fileSerializeTest(codec compress.Compression, expected compress.Compression) {
	sink := encoding.NewBufferWriter(0, memory.DefaultAllocator)

	opts := make([]parquet.WriterProperty, 0)
	for i := 0; i < t.numCols; i++ {
		opts = append(opts, parquet.WithCompressionFor(t.Schema.Column(i).Name(), codec))
	}

	props := parquet.NewWriterProperties(opts...)

	writer := file.NewParquetWriter(sink, t.Schema.Root(), file.WithWriterProps(props))
	t.GenerateData(int64(t.rowsPerRG))
	for rg := 0; rg < t.numRowGroups/2; rg++ {
		rgw := writer.AppendRowGroup()
		for col := 0; col < t.numCols; col++ {
			cw, _ := rgw.NextColumn()
			t.WriteBatchValues(cw, t.DefLevels, nil)
			cw.Close()
			// Ensure the Column API, which is specific to buffered row group
			// writers, cannot be called on a serial row group writer.
			t.Panics(func() { rgw.(file.BufferedRowGroupWriter).Column(col) })
		}
		rgw.Close()
	}

	// Write the second half of the row groups through the buffered writer.
	for rg := 0; rg < t.numRowGroups/2; rg++ {
		rgw := writer.AppendBufferedRowGroup()
		for batch := 0; batch < (t.rowsPerRG / t.rowsPerBatch); batch++ {
			for col := 0; col < t.numCols; col++ {
				cw, _ := rgw.Column(col)
				offset := batch * t.rowsPerBatch
				t.WriteBatchSubset(t.rowsPerBatch, offset, cw, t.DefLevels[offset:t.rowsPerBatch+offset], nil)
				// Ensure the NextColumn API, which is specific to serial row
				// group writers, cannot be called on a buffered row group writer.
				t.Panics(func() { rgw.(file.SerialRowGroupWriter).NextColumn() })
			}
		}
		for col := 0; col < t.numCols; col++ {
			cw, _ := rgw.Column(col)
			cw.Close()
		}
		rgw.Close()
	}
	writer.Close()

	nrows := t.numRowGroups * t.rowsPerRG
	reader, err := file.NewParquetReader(bytes.NewReader(sink.Bytes()))
	t.NoError(err)
	t.Equal(t.numCols, reader.MetaData().Schema.NumColumns())
	t.Equal(t.numRowGroups, reader.NumRowGroups())
	t.EqualValues(nrows, reader.NumRows())

	for rg := 0; rg < t.numRowGroups; rg++ {
		rgr := reader.RowGroup(rg)
		t.Equal(t.numCols, rgr.NumColumns())
		t.EqualValues(t.rowsPerRG, rgr.NumRows())
		chunk, _ := rgr.MetaData().ColumnChunk(0)
		t.Equal(expected, chunk.Compression())

		valuesRead := int64(0)

		for i := 0; i < t.numCols; i++ {
			chunk, _ := rgr.MetaData().ColumnChunk(i)
			t.False(chunk.HasIndexPage())
			t.DefLevelsOut = make([]int16, t.rowsPerRG)
			t.RepLevelsOut = make([]int16, t.rowsPerRG)
			colReader := rgr.Column(i)
			t.SetupValuesOut(int64(t.rowsPerRG))
			valuesRead = t.ReadBatch(colReader, int64(t.rowsPerRG), 0, t.DefLevelsOut, t.RepLevelsOut)
			t.EqualValues(t.rowsPerRG, valuesRead)
			t.Equal(t.Values, t.ValuesOut)
			t.Equal(t.DefLevels, t.DefLevelsOut)
		}
	}
}

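// unequalNumRows (and its buffered counterpart below) writes a different
// number of rows to each column of a row group and expects closing the row
// group to report an error rather than panic.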
func (t *SerializeTestSuite) unequalNumRows(maxRows int64, rowsPerCol []int64) {
	sink := encoding.NewBufferWriter(0, memory.DefaultAllocator)
	props := parquet.NewWriterProperties()
	writer := file.NewParquetWriter(sink, t.Schema.Root(), file.WithWriterProps(props))
	defer writer.Close()

	rgw := writer.AppendRowGroup()
	t.GenerateData(maxRows)
	for col := 0; col < t.numCols; col++ {
		cw, _ := rgw.NextColumn()
		t.WriteBatchSubset(int(rowsPerCol[col]), 0, cw, t.DefLevels[:rowsPerCol[col]], nil)
		cw.Close()
	}
	t.Error(rgw.Close())
}

func (t *SerializeTestSuite) unequalNumRowsBuffered(maxRows int64, rowsPerCol []int64) {
	sink := encoding.NewBufferWriter(0, memory.DefaultAllocator)
	writer := file.NewParquetWriter(sink, t.Schema.Root())
	defer writer.Close()

	rgw := writer.AppendBufferedRowGroup()
	t.GenerateData(maxRows)
	for col := 0; col < t.numCols; col++ {
		cw, _ := rgw.Column(col)
		t.WriteBatchSubset(int(rowsPerCol[col]), 0, cw, t.DefLevels[:rowsPerCol[col]], nil)
		cw.Close()
	}
	t.Error(rgw.Close())
}

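// TestZeroRows verifies that writing zero-row columns through both the serial
// and buffered writers closes cleanly without panicking.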
func (t *SerializeTestSuite) TestZeroRows() {
	t.NotPanics(func() {
		sink := encoding.NewBufferWriter(0, memory.DefaultAllocator)
		writer := file.NewParquetWriter(sink, t.Schema.Root())
		defer writer.Close()

		srgw := writer.AppendRowGroup()
		for col := 0; col < t.numCols; col++ {
			cw, _ := srgw.NextColumn()
			cw.Close()
		}
		srgw.Close()

		brgw := writer.AppendBufferedRowGroup()
		for col := 0; col < t.numCols; col++ {
			cw, _ := brgw.Column(col)
			cw.Close()
		}
		brgw.Close()
	})
}

func (t *SerializeTestSuite) TestTooManyColumns() {
	t.SetupSchema(parquet.Repetitions.Optional, 1)
	sink := encoding.NewBufferWriter(0, memory.DefaultAllocator)
	writer := file.NewParquetWriter(sink, t.Schema.Root())
	rgw := writer.AppendRowGroup()

	rgw.NextColumn()                      // first column
	t.Panics(func() { rgw.NextColumn() }) // the schema has only one column

}

func (t *SerializeTestSuite) TestRepeatedTooFewRows() {
	// A repeated column carries both definition and repetition levels.
	t.SetupSchema(parquet.Repetitions.Repeated, 1)
	const nrows = 100
	t.GenerateData(nrows)

	sink := encoding.NewBufferWriter(0, memory.DefaultAllocator)
	writer := file.NewParquetWriter(sink, t.Schema.Root())

	rgw := writer.AppendRowGroup()
	t.RepLevels = make([]int16, nrows)
	for idx := range t.RepLevels {
		t.RepLevels[idx] = 0 // every value starts a new row
	}

	cw, _ := rgw.NextColumn()
	t.WriteBatchValues(cw, t.DefLevels, t.RepLevels)
	cw.Close()

	// A repetition level of 1 joins the values at indices 2 and 3 into a
	// single row, leaving this column one row short of the previous one.
	t.RepLevels[3] = 1

	t.Panics(func() {
		cw, _ = rgw.NextColumn()
		t.WriteBatchValues(cw, t.DefLevels, t.RepLevels)
		cw.Close()
	})
}

func (t *SerializeTestSuite) TestTooFewRows() {
	rowsPerCol := []int64{100, 100, 100, 99}
	t.NotPanics(func() { t.unequalNumRows(100, rowsPerCol) })
	t.NotPanics(func() { t.unequalNumRowsBuffered(100, rowsPerCol) })
}

func (t *SerializeTestSuite) TestTooManyRows() {
	rowsPerCol := []int64{100, 100, 100, 101}
	t.NotPanics(func() { t.unequalNumRows(101, rowsPerCol) })
	t.NotPanics(func() { t.unequalNumRowsBuffered(101, rowsPerCol) })
}

func (t *SerializeTestSuite) TestSmallFile() {
	codecs := []compress.Compression{
		compress.Codecs.Uncompressed,
		compress.Codecs.Snappy,
		compress.Codecs.Brotli,
		compress.Codecs.Gzip,
		compress.Codecs.Zstd,
		// compress.Codecs.Lz4,
		// compress.Codecs.Lzo,
	}
	for _, c := range codecs {
		t.Run(c.String(), func() {
			t.NotPanics(func() { t.fileSerializeTest(c, c) })
		})
	}
}

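// TestBufferedDisabledDictionary writes a single value with dictionary
// encoding disabled and verifies that the resulting column chunk has no
// dictionary page.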
func TestBufferedDisabledDictionary(t *testing.T) {
	sink := encoding.NewBufferWriter(0, memory.DefaultAllocator)
	fields := schema.FieldList{schema.NewInt32Node("col", parquet.Repetitions.Required, 1)}
	sc, _ := schema.NewGroupNode("schema", parquet.Repetitions.Required, fields, 0)
	props := parquet.NewWriterProperties(parquet.WithDictionaryDefault(false))

	writer := file.NewParquetWriter(sink, sc, file.WithWriterProps(props))
	rgw := writer.AppendBufferedRowGroup()
	cwr, _ := rgw.Column(0)
	cw := cwr.(*file.Int32ColumnChunkWriter)
	cw.WriteBatch([]int32{1}, nil, nil)
	rgw.Close()
	writer.Close()

	buffer := sink.Finish()
	defer buffer.Release()
	reader, err := file.NewParquetReader(bytes.NewReader(buffer.Bytes()))
	assert.NoError(t, err)
	assert.EqualValues(t, 1, reader.NumRowGroups())
	rgReader := reader.RowGroup(0)
	assert.EqualValues(t, 1, rgReader.NumRows())
	chunk, _ := rgReader.MetaData().ColumnChunk(0)
	assert.False(t, chunk.HasDictionaryPage())
}

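// TestBufferedMultiPageDisabledDictionary writes enough plain-encoded values
// to span multiple data pages, then reads them all back and checks the
// round-tripped values.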
func TestBufferedMultiPageDisabledDictionary(t *testing.T) {
	const (
		valueCount = 10000
		pageSize   = 16384
	)
	var (
		sink  = encoding.NewBufferWriter(0, memory.DefaultAllocator)
		props = parquet.NewWriterProperties(parquet.WithDictionaryDefault(false), parquet.WithDataPageSize(pageSize))
		sc, _ = schema.NewGroupNode("schema", parquet.Repetitions.Required, schema.FieldList{
			schema.NewInt32Node("col", parquet.Repetitions.Required, -1),
		}, -1)
	)

	writer := file.NewParquetWriter(sink, sc, file.WithWriterProps(props))
	rgWriter := writer.AppendBufferedRowGroup()
	cwr, _ := rgWriter.Column(0)
	cw := cwr.(*file.Int32ColumnChunkWriter)
	valuesIn := make([]int32, 0, valueCount)
	for i := int32(0); i < valueCount; i++ {
		valuesIn = append(valuesIn, (i%100)+1)
	}
	cw.WriteBatch(valuesIn, nil, nil)
	rgWriter.Close()
	writer.Close()
	buffer := sink.Finish()
	defer buffer.Release()

	reader, err := file.NewParquetReader(bytes.NewReader(buffer.Bytes()))
	assert.NoError(t, err)

	assert.EqualValues(t, 1, reader.NumRowGroups())
	valuesOut := make([]int32, valueCount)

	for r := 0; r < reader.NumRowGroups(); r++ {
		rgr := reader.RowGroup(r)
		assert.EqualValues(t, 1, rgr.NumColumns())
		assert.EqualValues(t, valueCount, rgr.NumRows())

		var totalRead int64
		colReader := rgr.Column(0).(*file.Int32ColumnChunkReader)
		for colReader.HasNext() {
			total, _, _ := colReader.ReadBatch(valueCount-totalRead, valuesOut[totalRead:], nil, nil)
			totalRead += total
		}
		assert.EqualValues(t, valueCount, totalRead)
		assert.Equal(t, valuesIn, valuesOut)
	}
}

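// TestAllNulls writes a batch in which every value is null (definition level
// 0) and verifies that reading reports three levels but zero materialized
// values.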
func TestAllNulls(t *testing.T) {
	sc, _ := schema.NewGroupNode("root", parquet.Repetitions.Required, schema.FieldList{
		schema.NewInt32Node("nulls", parquet.Repetitions.Optional, -1),
	}, -1)
	sink := encoding.NewBufferWriter(0, memory.DefaultAllocator)

	writer := file.NewParquetWriter(sink, sc)
	rgw := writer.AppendRowGroup()
	cwr, _ := rgw.NextColumn()
	cw := cwr.(*file.Int32ColumnChunkWriter)

	var (
		values    [3]int32
		defLevels = [...]int16{0, 0, 0}
	)

	cw.WriteBatch(values[:], defLevels[:], nil)
	cw.Close()
	rgw.Close()
	writer.Close()

	buffer := sink.Finish()
	defer buffer.Release()
	props := parquet.NewReaderProperties(memory.DefaultAllocator)
	props.BufferedStreamEnabled = true

	reader, err := file.NewParquetReader(bytes.NewReader(buffer.Bytes()), file.WithReadProps(props))
	assert.NoError(t, err)

	rgr := reader.RowGroup(0)
	cr := rgr.Column(0).(*file.Int32ColumnChunkReader)

	// Poison the def level buffer so we can tell that ReadBatch overwrote it.
	defLevels[0] = -1
	defLevels[1] = -1
	defLevels[2] = -1
	valRead, read, _ := cr.ReadBatch(3, values[:], defLevels[:], nil)
	assert.EqualValues(t, 3, valRead) // three levels read...
	assert.EqualValues(t, 0, read)    // ...but no non-null values
	assert.Equal(t, []int16{0, 0, 0}, defLevels[:])
}

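// createSerializeTestSuite instantiates the suite for a single physical type.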
func createSerializeTestSuite(typ reflect.Type) suite.TestingSuite {
	return &SerializeTestSuite{PrimitiveTypedTest: testutils.NewPrimitiveTypedTest(typ)}
}

// TestSerialize runs SerializeTestSuite once per supported physical type,
// in parallel.
func TestSerialize(t *testing.T) {
	t.Parallel()
	types := []struct {
		typ reflect.Type
	}{
		{reflect.TypeOf(true)},
		{reflect.TypeOf(int32(0))},
		{reflect.TypeOf(int64(0))},
		{reflect.TypeOf(float32(0))},
		{reflect.TypeOf(float64(0))},
		{reflect.TypeOf(parquet.Int96{})},
		{reflect.TypeOf(parquet.ByteArray{})},
	}
	for _, tt := range types {
		tt := tt // capture the range variable for the parallel subtest
		t.Run(tt.typ.String(), func(t *testing.T) {
			t.Parallel()
			suite.Run(t, createSerializeTestSuite(tt.typ))
		})
	}
}
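
// TestMinimalRoundTripSketch is a minimal, self-contained sketch of the
// serial write/read roundtrip exercised throughout this file: a single
// required int32 column is written through a serial row group writer and
// read back with ReadBatch. The test name and data are illustrative; the
// calls mirror those used in the tests above.
func TestMinimalRoundTripSketch(t *testing.T) {
	sink := encoding.NewBufferWriter(0, memory.DefaultAllocator)
	sc, _ := schema.NewGroupNode("schema", parquet.Repetitions.Required, schema.FieldList{
		schema.NewInt32Node("col", parquet.Repetitions.Required, -1),
	}, -1)

	writer := file.NewParquetWriter(sink, sc)
	rgw := writer.AppendRowGroup()
	cwr, _ := rgw.NextColumn()
	cw := cwr.(*file.Int32ColumnChunkWriter)
	cw.WriteBatch([]int32{1, 2, 3}, nil, nil) // required column: no def/rep levels
	cw.Close()
	rgw.Close()
	writer.Close()

	reader, err := file.NewParquetReader(bytes.NewReader(sink.Bytes()))
	assert.NoError(t, err)
	assert.EqualValues(t, 3, reader.NumRows())

	out := make([]int32, 3)
	cr := reader.RowGroup(0).Column(0).(*file.Int32ColumnChunkReader)
	total, _, _ := cr.ReadBatch(3, out, nil, nil)
	assert.EqualValues(t, 3, total)
	assert.Equal(t, []int32{1, 2, 3}, out)
}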