github.com/apache/arrow/go/v16@v16.1.0/parquet/pqarrow/encode_arrow_test.go

     1  // Licensed to the Apache Software Foundation (ASF) under one
     2  // or more contributor license agreements.  See the NOTICE file
     3  // distributed with this work for additional information
     4  // regarding copyright ownership.  The ASF licenses this file
     5  // to you under the Apache License, Version 2.0 (the
     6  // "License"); you may not use this file except in compliance
     7  // with the License.  You may obtain a copy of the License at
     8  //
     9  // http://www.apache.org/licenses/LICENSE-2.0
    10  //
    11  // Unless required by applicable law or agreed to in writing, software
    12  // distributed under the License is distributed on an "AS IS" BASIS,
    13  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  // See the License for the specific language governing permissions and
    15  // limitations under the License.
    16  
    17  package pqarrow_test
    18  
    19  import (
    20  	"bytes"
    21  	"context"
    22  	"fmt"
    23  	"math"
    24  	"strconv"
    25  	"strings"
    26  	"testing"
    27  
    28  	"github.com/apache/arrow/go/v16/arrow"
    29  	"github.com/apache/arrow/go/v16/arrow/array"
    30  	"github.com/apache/arrow/go/v16/arrow/bitutil"
    31  	"github.com/apache/arrow/go/v16/arrow/decimal128"
    32  	"github.com/apache/arrow/go/v16/arrow/decimal256"
    33  	"github.com/apache/arrow/go/v16/arrow/ipc"
    34  	"github.com/apache/arrow/go/v16/arrow/memory"
    35  	"github.com/apache/arrow/go/v16/internal/types"
    36  	"github.com/apache/arrow/go/v16/internal/utils"
    37  	"github.com/apache/arrow/go/v16/parquet"
    38  	"github.com/apache/arrow/go/v16/parquet/compress"
    39  	"github.com/apache/arrow/go/v16/parquet/file"
    40  	"github.com/apache/arrow/go/v16/parquet/internal/encoding"
    41  	"github.com/apache/arrow/go/v16/parquet/internal/testutils"
    42  	"github.com/apache/arrow/go/v16/parquet/pqarrow"
    43  	"github.com/apache/arrow/go/v16/parquet/schema"
    44  	"github.com/google/uuid"
    45  	"github.com/stretchr/testify/assert"
    46  	"github.com/stretchr/testify/require"
    47  	"github.com/stretchr/testify/suite"
    48  )
    49  
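        // makeSimpleTable wraps the given chunked values in a single-column table named "col",
        // tagging the field with PARQUET:field_id metadata of -1.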
    50  func makeSimpleTable(values *arrow.Chunked, nullable bool) arrow.Table {
    51  	sc := arrow.NewSchema([]arrow.Field{{Name: "col", Type: values.DataType(), Nullable: nullable,
    52  		Metadata: arrow.NewMetadata([]string{"PARQUET:field_id"}, []string{"-1"})}}, nil)
    53  	column := arrow.NewColumn(sc.Field(0), values)
    54  	defer column.Release()
    55  	return array.NewTable(sc, []arrow.Column{*column}, -1)
    56  }
    57  
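        // makeDateTimeTypesTable builds a six-row table of date32, timestamp, and time columns
        // (one row is null). When expected is true, the nanosecond timestamp column f3 is swapped
        // for the microsecond column it is expected to read back as; addFieldMeta controls whether
        // PARQUET:field_id metadata is attached to each field.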
    58  func makeDateTimeTypesTable(mem memory.Allocator, expected bool, addFieldMeta bool) arrow.Table {
    59  	isValid := []bool{true, true, true, false, true, true}
    60  
    61  	// roundtrip without modification
    62  	f0 := arrow.Field{Name: "f0", Type: arrow.FixedWidthTypes.Date32, Nullable: true}
    63  	f1 := arrow.Field{Name: "f1", Type: arrow.FixedWidthTypes.Timestamp_ms, Nullable: true}
    64  	f2 := arrow.Field{Name: "f2", Type: arrow.FixedWidthTypes.Timestamp_us, Nullable: true}
    65  	f3 := arrow.Field{Name: "f3", Type: arrow.FixedWidthTypes.Timestamp_ns, Nullable: true}
    66  	f3X := arrow.Field{Name: "f3", Type: arrow.FixedWidthTypes.Timestamp_us, Nullable: true}
    67  	f4 := arrow.Field{Name: "f4", Type: arrow.FixedWidthTypes.Time32ms, Nullable: true}
    68  	f5 := arrow.Field{Name: "f5", Type: arrow.FixedWidthTypes.Time64us, Nullable: true}
    69  	f6 := arrow.Field{Name: "f6", Type: arrow.FixedWidthTypes.Time64ns, Nullable: true}
    70  
    71  	fieldList := []arrow.Field{f0, f1, f2}
    72  	if expected {
    73  		fieldList = append(fieldList, f3X)
    74  	} else {
    75  		fieldList = append(fieldList, f3)
    76  	}
    77  	fieldList = append(fieldList, f4, f5, f6)
    78  
    79  	if addFieldMeta {
    80  		for idx := range fieldList {
    81  			fieldList[idx].Metadata = arrow.NewMetadata([]string{"PARQUET:field_id"}, []string{strconv.Itoa(idx + 1)})
    82  		}
    83  	}
    84  	arrsc := arrow.NewSchema(fieldList, nil)
    85  
    86  	d32Values := []arrow.Date32{1489269000, 1489270000, 1489271000, 1489272000, 1489272000, 1489273000}
    87  	ts64nsValues := []arrow.Timestamp{1489269000000, 1489270000000, 1489271000000, 1489272000000, 1489272000000, 1489273000000}
    88  	ts64usValues := []arrow.Timestamp{1489269000, 1489270000, 1489271000, 1489272000, 1489272000, 1489273000}
    89  	ts64msValues := []arrow.Timestamp{1489269, 1489270, 1489271, 1489272, 1489272, 1489273}
    90  	t32Values := []arrow.Time32{1489269000, 1489270000, 1489271000, 1489272000, 1489272000, 1489273000}
    91  	t64nsValues := []arrow.Time64{1489269000000, 1489270000000, 1489271000000, 1489272000000, 1489272000000, 1489273000000}
    92  	t64usValues := []arrow.Time64{1489269000, 1489270000, 1489271000, 1489272000, 1489272000, 1489273000}
    93  
    94  	builders := make([]array.Builder, 0, len(fieldList))
    95  	for _, f := range fieldList {
    96  		bldr := array.NewBuilder(mem, f.Type)
    97  		defer bldr.Release()
    98  		builders = append(builders, bldr)
    99  	}
   100  
   101  	builders[0].(*array.Date32Builder).AppendValues(d32Values, isValid)
   102  	builders[1].(*array.TimestampBuilder).AppendValues(ts64msValues, isValid)
   103  	builders[2].(*array.TimestampBuilder).AppendValues(ts64usValues, isValid)
   104  	if expected {
   105  		builders[3].(*array.TimestampBuilder).AppendValues(ts64usValues, isValid)
   106  	} else {
   107  		builders[3].(*array.TimestampBuilder).AppendValues(ts64nsValues, isValid)
   108  	}
   109  	builders[4].(*array.Time32Builder).AppendValues(t32Values, isValid)
   110  	builders[5].(*array.Time64Builder).AppendValues(t64usValues, isValid)
   111  	builders[6].(*array.Time64Builder).AppendValues(t64nsValues, isValid)
   112  
   113  	cols := make([]arrow.Column, 0, len(fieldList))
   114  	for idx, field := range fieldList {
   115  		arr := builders[idx].NewArray()
   116  		defer arr.Release()
   117  
   118  		chunked := arrow.NewChunked(field.Type, []arrow.Array{arr})
   119  		defer chunked.Release()
   120  		col := arrow.NewColumn(field, chunked)
   121  		defer col.Release()
   122  		cols = append(cols, *col)
   123  	}
   124  
   125  	return array.NewTable(arrsc, cols, int64(len(isValid)))
   126  }
   127  
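        // makeDateTypeTable builds a single-column table of dates: Date64 input when expected is
        // false, or the Date32 values a round trip through Parquet is expected to produce. With
        // partialDays, the Date64 values are offset past the day boundary to exercise truncation.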
   128  func makeDateTypeTable(mem memory.Allocator, expected bool, partialDays bool) arrow.Table {
   129  	const (
   130  		millisPerHour int64 = 1000 * 60 * 60
   131  		millisPerDay  int64 = millisPerHour * 24
   132  	)
   133  	isValid := []bool{true, true, true, false, true, true}
   134  
   135  	var field arrow.Field
   136  	if expected {
   137  		field = arrow.Field{Name: "date", Type: arrow.FixedWidthTypes.Date32, Nullable: true}
   138  	} else {
   139  		field = arrow.Field{Name: "date", Type: arrow.FixedWidthTypes.Date64, Nullable: true}
   140  	}
   141  
   142  	field.Metadata = arrow.NewMetadata([]string{"PARQUET:field_id"}, []string{"1"})
   143  
   144  	arrsc := arrow.NewSchema([]arrow.Field{field}, nil)
   145  
   146  	d32Values := []arrow.Date32{1489269000, 1489270000, 1489271000, 1489272000, 1489272000, 1489273000}
   147  
   148  	d64Values := make([]arrow.Date64, len(d32Values))
   149  	for i := range d64Values {
   150  		// Calculate number of milliseconds at date boundary
   151  		d64Values[i] = arrow.Date64(int64(d32Values[i]) * millisPerDay)
   152  		if partialDays {
   153  			// Offset 1 or more hours past the date boundary
   154  			hoursIntoDay := int64(i) * millisPerHour
   155  			d64Values[i] += arrow.Date64(hoursIntoDay)
   156  		}
   157  	}
   158  
   159  	bldr := array.NewRecordBuilder(mem, arrsc)
   160  	defer bldr.Release()
   161  
   162  	if expected {
   163  		bldr.Field(0).(*array.Date32Builder).AppendValues(d32Values, isValid)
   164  	} else {
   165  		bldr.Field(0).(*array.Date64Builder).AppendValues(d64Values, isValid)
   166  	}
   167  
   168  	rec := bldr.NewRecord()
   169  	defer rec.Release()
   170  
   171  	return array.NewTableFromRecords(arrsc, []arrow.Record{rec})
   172  }
   173  
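        // makeTimestampTypeTable builds a two-column timestamp table: f0 carries no timezone
        // (local semantics) and f1 is zoned; when expected is true, f1's timezone is normalized
        // to UTC, matching what the reader reports after a round trip.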
   174  func makeTimestampTypeTable(mem memory.Allocator, expected bool) arrow.Table {
   175  	isValid := []bool{true, true, true, false, true, true}
   176  
   177  	// Timestamp with relative (i.e. local) semantics. Make sure it roundtrips without being incorrectly converted to an absolute point in time.
   178  	f0 := arrow.Field{Name: "f0", Type: &arrow.TimestampType{Unit: arrow.Millisecond}, Nullable: true, Metadata: arrow.NewMetadata([]string{"PARQUET:field_id"}, []string{"1"})}
   179  
   180  	// Timestamp with absolute (i.e. instant) semantics. The physical representation is always from Unix epoch in UTC timezone.
   181  	// TimeZone is used for display purposes and can be stripped on roundtrip without changing the actual instant referred to.
   182  	// WithStoreSchema will preserve the original timezone, but the instant in time will be equivalent even if it's not used.
   183  	f1 := arrow.Field{Name: "f1", Type: &arrow.TimestampType{Unit: arrow.Millisecond, TimeZone: "EST"}, Nullable: true, Metadata: arrow.NewMetadata([]string{"PARQUET:field_id"}, []string{"2"})}
   184  	f1X := arrow.Field{Name: "f1", Type: &arrow.TimestampType{Unit: arrow.Millisecond, TimeZone: "UTC"}, Nullable: true, Metadata: arrow.NewMetadata([]string{"PARQUET:field_id"}, []string{"2"})}
   185  
   186  	fieldList := []arrow.Field{f0}
   187  	if expected {
   188  		fieldList = append(fieldList, f1X)
   189  	} else {
   190  		fieldList = append(fieldList, f1)
   191  	}
   192  
   193  	arrsc := arrow.NewSchema(fieldList, nil)
   194  
   195  	ts64msValues := []arrow.Timestamp{1489269, 1489270, 1489271, 1489272, 1489272, 1489273}
   196  
   197  	bldr := array.NewRecordBuilder(mem, arrsc)
   198  	defer bldr.Release()
   199  
   200  	bldr.Field(0).(*array.TimestampBuilder).AppendValues(ts64msValues, isValid)
   201  	bldr.Field(1).(*array.TimestampBuilder).AppendValues(ts64msValues, isValid)
   202  
   203  	rec := bldr.NewRecord()
   204  	defer rec.Release()
   205  
   206  	return array.NewTableFromRecords(arrsc, []arrow.Record{rec})
   207  }
   208  
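        // TestWriteArrowCols writes each column of the date/time table into a single row group,
        // then reads the raw Int32/Int64 column chunks back and checks values and definition levels.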
   209  func TestWriteArrowCols(t *testing.T) {
   210  	mem := memory.NewCheckedAllocator(memory.DefaultAllocator)
   211  	defer mem.AssertSize(t, 0)
   212  
   213  	tbl := makeDateTimeTypesTable(mem, false, false)
   214  	defer tbl.Release()
   215  
   216  	sink := encoding.NewBufferWriter(0, mem)
   217  	defer sink.Release()
   218  
   219  	fileWriter, err := pqarrow.NewFileWriter(
   220  		tbl.Schema(),
   221  		sink,
   222  		parquet.NewWriterProperties(parquet.WithVersion(parquet.V2_4)),
   223  		pqarrow.NewArrowWriterProperties(pqarrow.WithAllocator(mem)),
   224  	)
   225  	require.NoError(t, err)
   226  
   227  	fileWriter.NewRowGroup()
   228  	for i := int64(0); i < tbl.NumCols(); i++ {
   229  		colChunk := tbl.Column(int(i)).Data()
   230  		err := fileWriter.WriteColumnChunked(colChunk, 0, int64(colChunk.Len()))
   231  		require.NoError(t, err)
   232  	}
   233  	require.NoError(t, fileWriter.Close())
   234  
   235  	expected := makeDateTimeTypesTable(mem, true, false)
   236  	defer expected.Release()
   237  
   238  	reader, err := file.NewParquetReader(bytes.NewReader(sink.Bytes()))
   239  	require.NoError(t, err)
   240  
   241  	assert.EqualValues(t, expected.NumCols(), reader.MetaData().Schema.NumColumns())
   242  	assert.EqualValues(t, expected.NumRows(), reader.NumRows())
   243  	assert.EqualValues(t, 1, reader.NumRowGroups())
   244  
   245  	rgr := reader.RowGroup(0)
   246  
   247  	for i := 0; i < int(expected.NumCols()); i++ {
   248  		var (
   249  			total        int64
   250  			read         int
   251  			defLevelsOut = make([]int16, int(expected.NumRows()))
   252  			arr          = expected.Column(i).Data().Chunk(0)
   253  		)
   254  		switch expected.Schema().Field(i).Type.(arrow.FixedWidthDataType).BitWidth() {
   255  		case 32:
   256  			col, err := rgr.Column(i)
   257  			assert.NoError(t, err)
   258  			colReader := col.(*file.Int32ColumnChunkReader)
   259  			vals := make([]int32, int(expected.NumRows()))
   260  			total, read, err = colReader.ReadBatch(expected.NumRows(), vals, defLevelsOut, nil)
   261  			require.NoError(t, err)
   262  
   263  			nulls := 0
   264  			for j := 0; j < arr.Len(); j++ {
   265  				if arr.IsNull(j) {
   266  					nulls++
   267  					continue
   268  				}
   269  
   270  				switch v := arr.(type) {
   271  				case *array.Date32:
   272  					assert.EqualValues(t, v.Value(j), vals[j-nulls])
   273  				case *array.Time32:
   274  					assert.EqualValues(t, v.Value(j), vals[j-nulls])
   275  				}
   276  			}
   277  		case 64:
   278  			col, err := rgr.Column(i)
   279  			assert.NoError(t, err)
   280  			colReader := col.(*file.Int64ColumnChunkReader)
   281  			vals := make([]int64, int(expected.NumRows()))
   282  			total, read, err = colReader.ReadBatch(expected.NumRows(), vals, defLevelsOut, nil)
   283  			require.NoError(t, err)
   284  
   285  			nulls := 0
   286  			for j := 0; j < arr.Len(); j++ {
   287  				if arr.IsNull(j) {
   288  					nulls++
   289  					continue
   290  				}
   291  
   292  				switch v := arr.(type) {
   293  				case *array.Date64:
   294  					assert.EqualValues(t, v.Value(j), vals[j-nulls])
   295  				case *array.Time64:
   296  					assert.EqualValues(t, v.Value(j), vals[j-nulls])
   297  				case *array.Timestamp:
   298  					assert.EqualValues(t, v.Value(j), vals[j-nulls])
   299  				}
   300  			}
   301  		}
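        		// six definition levels are read but only five values are materialized:
        		// the null entry contributes a level of 0 and no value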
   302  		assert.EqualValues(t, expected.NumRows(), total)
   303  		assert.EqualValues(t, expected.NumRows()-1, read)
   304  		assert.Equal(t, []int16{1, 1, 1, 0, 1, 1}, defLevelsOut)
   305  	}
   306  }
   307  
   308  func TestWriteArrowInt96(t *testing.T) {
   309  	mem := memory.NewCheckedAllocator(memory.DefaultAllocator)
   310  	defer mem.AssertSize(t, 0)
   311  
   312  	tbl := makeDateTimeTypesTable(mem, false, false)
   313  	defer tbl.Release()
   314  
   315  	sink := encoding.NewBufferWriter(0, mem)
   316  	defer sink.Release()
   317  
   318  	fileWriter, err := pqarrow.NewFileWriter(
   319  		tbl.Schema(),
   320  		sink,
   321  		parquet.NewWriterProperties(parquet.WithAllocator(mem)),
   322  		pqarrow.NewArrowWriterProperties(pqarrow.WithDeprecatedInt96Timestamps(true), pqarrow.WithAllocator(mem)),
   323  	)
   324  	require.NoError(t, err)
   325  
   326  	fileWriter.NewRowGroup()
   327  	for i := int64(0); i < tbl.NumCols(); i++ {
   328  		colChunk := tbl.Column(int(i)).Data()
   329  		err := fileWriter.WriteColumnChunked(colChunk, 0, int64(colChunk.Len()))
   330  		require.NoError(t, err)
   331  	}
   332  	require.NoError(t, fileWriter.Close())
   333  
   334  	expected := makeDateTimeTypesTable(mem, false, false)
   335  	defer expected.Release()
   336  
   337  	reader, err := file.NewParquetReader(bytes.NewReader(sink.Bytes()))
   338  	require.NoError(t, err)
   339  
   340  	assert.EqualValues(t, expected.NumCols(), reader.MetaData().Schema.NumColumns())
   341  	assert.EqualValues(t, expected.NumRows(), reader.NumRows())
   342  	assert.EqualValues(t, 1, reader.NumRowGroups())
   343  
   344  	rgr := reader.RowGroup(0)
   345  	tsRdr, err := rgr.Column(3)
   346  	assert.NoError(t, err)
   347  	assert.Equal(t, parquet.Types.Int96, tsRdr.Type())
   348  
   349  	rdr := tsRdr.(*file.Int96ColumnChunkReader)
   350  	vals := make([]parquet.Int96, expected.NumRows())
   351  	defLevels := make([]int16, int(expected.NumRows()))
   352  
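        	// one row is null, so six definition levels are read but only five Int96 values are returned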
   353  	total, read, _ := rdr.ReadBatch(expected.NumRows(), vals, defLevels, nil)
   354  	assert.EqualValues(t, expected.NumRows(), total)
   355  	assert.EqualValues(t, expected.NumRows()-1, read)
   356  	assert.Equal(t, []int16{1, 1, 1, 0, 1, 1}, defLevels)
   357  
   358  	data := expected.Column(3).Data().Chunk(0).(*array.Timestamp)
   359  	assert.EqualValues(t, data.Value(0), vals[0].ToTime().UnixNano())
   360  	assert.EqualValues(t, data.Value(1), vals[1].ToTime().UnixNano())
   361  	assert.EqualValues(t, data.Value(2), vals[2].ToTime().UnixNano())
   362  	assert.EqualValues(t, data.Value(4), vals[3].ToTime().UnixNano())
   363  	assert.EqualValues(t, data.Value(5), vals[4].ToTime().UnixNano())
   364  }
   365  
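        // writeTableToBuffer writes tbl to an in-memory Parquet file, splitting it into row groups
        // of at most rowGroupSize rows, and returns the finished buffer.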
   366  func writeTableToBuffer(t *testing.T, mem memory.Allocator, tbl arrow.Table, rowGroupSize int64, props pqarrow.ArrowWriterProperties) *memory.Buffer {
   367  	sink := encoding.NewBufferWriter(0, mem)
   368  	defer sink.Release()
   369  
   370  	fileWriter, err := pqarrow.NewFileWriter(
   371  		tbl.Schema(),
   372  		sink,
   373  		parquet.NewWriterProperties(parquet.WithVersion(parquet.V1_0)),
   374  		props,
   375  	)
   376  	require.NoError(t, err)
   377  
   378  	offset := int64(0)
   379  	for offset < tbl.NumRows() {
   380  		sz := utils.Min(rowGroupSize, tbl.NumRows()-offset)
   381  		fileWriter.NewRowGroup()
   382  		for i := 0; i < int(tbl.NumCols()); i++ {
   383  			colChunk := tbl.Column(i).Data()
   384  			err := fileWriter.WriteColumnChunked(colChunk, offset, sz)
   385  			require.NoError(t, err)
   386  		}
   387  		offset += sz
   388  	}
   389  
   390  	require.NoError(t, fileWriter.Close())
   391  	return sink.Finish()
   392  }
   393  
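        // simpleRoundTrip writes tbl to Parquet with the given row group size, reads every column
        // back in full, and asserts that each original chunk equals the corresponding slice of the
        // round-tripped data.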
   394  func simpleRoundTrip(t *testing.T, tbl arrow.Table, rowGroupSize int64) {
   395  	t.Helper()
   396  	mem := memory.NewCheckedAllocator(memory.DefaultAllocator)
   397  	defer mem.AssertSize(t, 0)
   398  
   399  	buf := writeTableToBuffer(t, mem, tbl, rowGroupSize, pqarrow.NewArrowWriterProperties(pqarrow.WithAllocator(mem)))
   400  	defer buf.Release()
   401  
   402  	rdr, err := file.NewParquetReader(bytes.NewReader(buf.Bytes()))
   403  	require.NoError(t, err)
   404  
   405  	ardr, err := pqarrow.NewFileReader(rdr, pqarrow.ArrowReadProperties{}, mem)
   406  	require.NoError(t, err)
   407  
   408  	for i := 0; i < int(tbl.NumCols()); i++ {
   409  		crdr, err := ardr.GetColumn(context.TODO(), i)
   410  		require.NoError(t, err)
   411  
   412  		chunked, err := crdr.NextBatch(tbl.NumRows())
   413  		require.NoError(t, err)
   414  		defer chunked.Release()
   415  
   416  		require.EqualValues(t, tbl.NumRows(), chunked.Len())
   417  
   418  		chunkList := tbl.Column(i).Data().Chunks()
   419  		offset := int64(0)
   420  		for _, chnk := range chunkList {
   421  			slc := array.NewChunkedSlice(chunked, offset, offset+int64(chnk.Len()))
   422  			defer slc.Release()
   423  
   424  			assert.EqualValues(t, chnk.Len(), slc.Len())
   425  			if len(slc.Chunks()) == 1 {
   426  				offset += int64(chnk.Len())
   427  				assert.True(t, array.Equal(chnk, slc.Chunk(0)))
   428  			}
   429  		}
   430  		crdr.Release()
   431  	}
   432  }
   433  
   434  func TestWriteKeyValueMetadata(t *testing.T) {
   435  	kv := map[string]string{
   436  		"key1": "value1",
   437  		"key2": "value2",
   438  		"key3": "value3",
   439  	}
   440  
   441  	sc := arrow.NewSchema([]arrow.Field{
   442  		{Name: "int32", Type: arrow.PrimitiveTypes.Int32, Nullable: true},
   443  	}, nil)
   444  	bldr := array.NewRecordBuilder(memory.DefaultAllocator, sc)
   445  	defer bldr.Release()
   446  	for _, b := range bldr.Fields() {
   447  		b.AppendNull()
   448  	}
   449  
   450  	rec := bldr.NewRecord()
   451  	defer rec.Release()
   452  
   453  	props := parquet.NewWriterProperties(
   454  		parquet.WithVersion(parquet.V1_0),
   455  	)
   456  	var buf bytes.Buffer
   457  	fw, err := pqarrow.NewFileWriter(sc, &buf, props, pqarrow.DefaultWriterProps())
   458  	require.NoError(t, err)
   459  	err = fw.Write(rec)
   460  	require.NoError(t, err)
   461  
   462  	for key, value := range kv {
   463  		require.NoError(t, fw.AppendKeyValueMetadata(key, value))
   464  	}
   465  
   466  	err = fw.Close()
   467  	require.NoError(t, err)
   468  
   469  	reader, err := file.NewParquetReader(bytes.NewReader(buf.Bytes()))
   470  	require.NoError(t, err)
   471  
   472  	for key, value := range kv {
   473  		got := reader.MetaData().KeyValueMetadata().FindValue(key)
   474  		require.NotNil(t, got)
   475  		assert.Equal(t, value, *got)
   476  	}
   477  }
   478  
   479  func TestWriteEmptyLists(t *testing.T) {
   480  	sc := arrow.NewSchema([]arrow.Field{
   481  		{Name: "f1", Type: arrow.ListOf(arrow.FixedWidthTypes.Date32)},
   482  		{Name: "f2", Type: arrow.ListOf(arrow.FixedWidthTypes.Date64)},
   483  		{Name: "f3", Type: arrow.ListOf(arrow.FixedWidthTypes.Timestamp_us)},
   484  		{Name: "f4", Type: arrow.ListOf(arrow.FixedWidthTypes.Timestamp_ms)},
   485  		{Name: "f5", Type: arrow.ListOf(arrow.FixedWidthTypes.Time32ms)},
   486  		{Name: "f6", Type: arrow.ListOf(arrow.FixedWidthTypes.Time64ns)},
   487  		{Name: "f7", Type: arrow.ListOf(arrow.FixedWidthTypes.Time64us)},
   488  	}, nil)
   489  	bldr := array.NewRecordBuilder(memory.DefaultAllocator, sc)
   490  	defer bldr.Release()
   491  	for _, b := range bldr.Fields() {
   492  		b.AppendNull()
   493  	}
   494  
   495  	rec := bldr.NewRecord()
   496  	defer rec.Release()
   497  
   498  	props := parquet.NewWriterProperties(
   499  		parquet.WithVersion(parquet.V1_0),
   500  	)
   501  	arrprops := pqarrow.DefaultWriterProps()
   502  	var buf bytes.Buffer
   503  	fw, err := pqarrow.NewFileWriter(sc, &buf, props, arrprops)
   504  	require.NoError(t, err)
   505  	err = fw.Write(rec)
   506  	require.NoError(t, err)
   507  	err = fw.Close()
   508  	require.NoError(t, err)
   509  }
   510  
   511  func TestWriteAllNullsWithDeltaEncoding(t *testing.T) {
   512  	sc := arrow.NewSchema([]arrow.Field{
   513  		{Name: "f1", Type: arrow.PrimitiveTypes.Int64, Nullable: true},
   514  		{Name: "f2", Type: arrow.ListOf(arrow.FixedWidthTypes.Date32)},
   515  		{Name: "f3", Type: arrow.BinaryTypes.String, Nullable: true},
   516  		{Name: "f4", Type: arrow.ListOf(arrow.BinaryTypes.String)},
   517  		{Name: "f5", Type: arrow.BinaryTypes.LargeString, Nullable: true},
   518  		{Name: "f6", Type: arrow.ListOf(arrow.BinaryTypes.LargeString)},
   519  		{Name: "f7", Type: arrow.PrimitiveTypes.Float64, Nullable: true},
   520  		{Name: "f8", Type: arrow.ListOf(arrow.FixedWidthTypes.Date64)},
   521  		{Name: "f9", Type: arrow.BinaryTypes.String, Nullable: true},
   522  		{Name: "f10", Type: arrow.ListOf(arrow.BinaryTypes.LargeString)},
   523  		{Name: "f11", Type: arrow.FixedWidthTypes.Boolean, Nullable: true},
   524  		{Name: "f12", Type: arrow.ListOf(arrow.FixedWidthTypes.Boolean)},
   525  		{Name: "f13", Type: arrow.PrimitiveTypes.Int32, Nullable: true},
   526  		{Name: "f14", Type: arrow.ListOf(arrow.PrimitiveTypes.Float32)},
   527  	}, nil)
   528  	bldr := array.NewRecordBuilder(memory.DefaultAllocator, sc)
   529  	defer bldr.Release()
   530  	for _, b := range bldr.Fields() {
   531  		b.AppendNull()
   532  	}
   533  
   534  	rec := bldr.NewRecord()
   535  	defer rec.Release()
   536  
   537  	props := parquet.NewWriterProperties(
   538  		parquet.WithVersion(parquet.V1_0),
   539  		parquet.WithDictionaryDefault(false),
   540  		parquet.WithDictionaryFor("f9", true),
   541  		parquet.WithDictionaryFor("f10", true),
   542  		parquet.WithDictionaryFor("f13", true),
   543  		parquet.WithDictionaryFor("f14", true),
   544  		parquet.WithEncodingFor("f1", parquet.Encodings.DeltaBinaryPacked),
   545  		parquet.WithEncodingFor("f2", parquet.Encodings.DeltaBinaryPacked),
   546  		parquet.WithEncodingFor("f3", parquet.Encodings.DeltaByteArray),
   547  		parquet.WithEncodingFor("f4", parquet.Encodings.DeltaByteArray),
   548  		parquet.WithEncodingFor("f5", parquet.Encodings.DeltaLengthByteArray),
   549  		parquet.WithEncodingFor("f6", parquet.Encodings.DeltaLengthByteArray),
   550  		parquet.WithEncodingFor("f7", parquet.Encodings.Plain),
   551  		parquet.WithEncodingFor("f8", parquet.Encodings.Plain),
   552  		parquet.WithEncodingFor("f9", parquet.Encodings.Plain),
   553  		parquet.WithEncodingFor("f10", parquet.Encodings.Plain),
   554  		parquet.WithEncodingFor("f11", parquet.Encodings.RLE),
   555  		parquet.WithEncodingFor("f12", parquet.Encodings.RLE),
   556  		parquet.WithEncodingFor("f13", parquet.Encodings.RLE),
   557  		parquet.WithEncodingFor("f14", parquet.Encodings.RLE),
   558  	)
   559  	arrprops := pqarrow.DefaultWriterProps()
   560  	var buf bytes.Buffer
   561  	fw, err := pqarrow.NewFileWriter(sc, &buf, props, arrprops)
   562  	require.NoError(t, err)
   563  	err = fw.Write(rec)
   564  	require.NoError(t, err)
   565  	err = fw.Close()
   566  	require.NoError(t, err)
   567  }
   568  
   569  func TestArrowReadWriteTableChunkedCols(t *testing.T) {
   570  	chunkSizes := []int{2, 4, 10, 2}
   571  	const totalLen = int64(18)
   572  
   573  	rng := testutils.NewRandomArrayGenerator(0)
   574  
   575  	arr := rng.Int32(totalLen, 0, math.MaxInt32/2, 0.9)
   576  	defer arr.Release()
   577  
   578  	offset := int64(0)
   579  	chunks := make([]arrow.Array, 0)
   580  	for _, chnksize := range chunkSizes {
   581  		chk := array.NewSlice(arr, offset, offset+int64(chnksize))
   582  		defer chk.Release()
   583  		defer chk.Release() // for NewChunked below
   584  		chunks = append(chunks, chk)
   585  	}
   586  
   587  	sc := arrow.NewSchema([]arrow.Field{{Name: "field", Type: arr.DataType(), Nullable: true}}, nil)
   588  
   589  	chk := arrow.NewChunked(arr.DataType(), chunks)
   590  	defer chk.Release()
   591  
   592  	tbl := array.NewTable(sc, []arrow.Column{*arrow.NewColumn(sc.Field(0), chk)}, -1)
   593  	defer tbl.Release()
   594  
   595  	simpleRoundTrip(t, tbl, 2)
   596  	simpleRoundTrip(t, tbl, 10)
   597  }
   598  
   599  // set this up for checking our expected results so we can test the exported
   600  // functions that generate them
   601  func getLogicalType(typ arrow.DataType) schema.LogicalType {
   602  	switch typ.ID() {
   603  	case arrow.DICTIONARY:
   604  		return getLogicalType(typ.(*arrow.DictionaryType).ValueType)
   605  	case arrow.INT8:
   606  		return schema.NewIntLogicalType(8, true)
   607  	case arrow.UINT8:
   608  		return schema.NewIntLogicalType(8, false)
   609  	case arrow.INT16:
   610  		return schema.NewIntLogicalType(16, true)
   611  	case arrow.UINT16:
   612  		return schema.NewIntLogicalType(16, false)
   613  	case arrow.INT32:
   614  		return schema.NewIntLogicalType(32, true)
   615  	case arrow.UINT32:
   616  		return schema.NewIntLogicalType(32, false)
   617  	case arrow.INT64:
   618  		return schema.NewIntLogicalType(64, true)
   619  	case arrow.UINT64:
   620  		return schema.NewIntLogicalType(64, false)
   621  	case arrow.STRING, arrow.LARGE_STRING:
   622  		return schema.StringLogicalType{}
   623  	case arrow.DATE32:
   624  		return schema.DateLogicalType{}
   625  	case arrow.DATE64:
   626  		return schema.DateLogicalType{}
   627  	case arrow.FLOAT16:
   628  		return schema.Float16LogicalType{}
   629  	case arrow.TIMESTAMP:
   630  		ts := typ.(*arrow.TimestampType)
   631  		adjustedUTC := len(ts.TimeZone) == 0
   632  		switch ts.Unit {
   633  		case arrow.Microsecond:
   634  			return schema.NewTimestampLogicalType(adjustedUTC, schema.TimeUnitMicros)
   635  		case arrow.Millisecond:
   636  			return schema.NewTimestampLogicalType(adjustedUTC, schema.TimeUnitMillis)
   637  		case arrow.Nanosecond:
   638  			return schema.NewTimestampLogicalType(adjustedUTC, schema.TimeUnitNanos)
   639  		default:
   640  			panic("only milli, micro and nano units supported for arrow timestamp")
   641  		}
   642  	case arrow.TIME32:
   643  		return schema.NewTimeLogicalType(false, schema.TimeUnitMillis)
   644  	case arrow.TIME64:
   645  		ts := typ.(*arrow.Time64Type)
   646  		switch ts.Unit {
   647  		case arrow.Microsecond:
   648  			return schema.NewTimeLogicalType(false, schema.TimeUnitMicros)
   649  		case arrow.Nanosecond:
   650  			return schema.NewTimeLogicalType(false, schema.TimeUnitNanos)
   651  		default:
   652  			panic("only micro and nano seconds are supported for arrow TIME64")
   653  		}
   654  	case arrow.DECIMAL, arrow.DECIMAL256:
   655  		dec := typ.(arrow.DecimalType)
   656  		return schema.NewDecimalLogicalType(dec.GetPrecision(), dec.GetScale())
   657  	}
   658  	return schema.NoLogicalType{}
   659  }
   660  
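        // getPhysicalType returns the Parquet physical type the writer is expected to use for the
        // given arrow type.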
   661  func getPhysicalType(typ arrow.DataType) parquet.Type {
   662  	switch typ.ID() {
   663  	case arrow.DICTIONARY:
   664  		return getPhysicalType(typ.(*arrow.DictionaryType).ValueType)
   665  	case arrow.BOOL:
   666  		return parquet.Types.Boolean
   667  	case arrow.UINT8, arrow.INT8, arrow.UINT16, arrow.INT16, arrow.UINT32, arrow.INT32:
   668  		return parquet.Types.Int32
   669  	case arrow.INT64, arrow.UINT64:
   670  		return parquet.Types.Int64
   671  	case arrow.FLOAT32:
   672  		return parquet.Types.Float
   673  	case arrow.FLOAT64:
   674  		return parquet.Types.Double
   675  	case arrow.FLOAT16:
   676  		return parquet.Types.FixedLenByteArray
   677  	case arrow.BINARY, arrow.LARGE_BINARY, arrow.STRING, arrow.LARGE_STRING:
   678  		return parquet.Types.ByteArray
   679  	case arrow.FIXED_SIZE_BINARY, arrow.DECIMAL:
   680  		return parquet.Types.FixedLenByteArray
   681  	case arrow.DATE32:
   682  		return parquet.Types.Int32
   683  	case arrow.DATE64:
   684  		// convert to date32 internally
   685  		return parquet.Types.Int32
   686  	case arrow.TIME32:
   687  		return parquet.Types.Int32
   688  	case arrow.TIME64, arrow.TIMESTAMP:
   689  		return parquet.Types.Int64
   690  	default:
   691  		return parquet.Types.Int32
   692  	}
   693  }
   694  
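        // constant scalar values used to fill the primitive test columns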
   695  const (
   696  	boolTestValue = true
   697  	uint8TestVal  = uint8(64)
   698  	int8TestVal   = int8(-64)
   699  	uint16TestVal = uint16(1024)
   700  	int16TestVal  = int16(-1024)
   701  	uint32TestVal = uint32(1024)
   702  	int32TestVal  = int32(-1024)
   703  	uint64TestVal = uint64(1024)
   704  	int64TestVal  = int64(-1024)
   705  	tsTestValue   = arrow.Timestamp(14695634030000)
   706  	date32TestVal = arrow.Date32(170000)
   707  	floatTestVal  = float32(2.1)
   708  	doubleTestVal = float64(4.2)
   709  	strTestVal    = "Test"
   710  
   711  	smallSize = 100
   712  )
   713  
   714  type ParquetIOTestSuite struct {
   715  	suite.Suite
   716  }
   717  
   718  func (ps *ParquetIOTestSuite) SetupTest() {
   719  	ps.NoError(arrow.RegisterExtensionType(types.NewUUIDType()))
   720  }
   721  
   722  func (ps *ParquetIOTestSuite) TearDownTest() {
   723  	if arrow.GetExtensionType("uuid") != nil {
   724  		ps.NoError(arrow.UnregisterExtensionType("uuid"))
   725  	}
   726  }
   727  
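        // makeSimpleSchema builds a single-column Parquet group node ("column1") using the expected
        // logical/physical mapping for typ, computing the FIXED_LEN_BYTE_ARRAY width for fixed-size
        // binary, decimal, and float16 types (including dictionary value types).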
   728  func (ps *ParquetIOTestSuite) makeSimpleSchema(typ arrow.DataType, rep parquet.Repetition) *schema.GroupNode {
   729  	byteWidth := int32(-1)
   730  
   731  	switch typ := typ.(type) {
   732  	case *arrow.FixedSizeBinaryType:
   733  		byteWidth = int32(typ.ByteWidth)
   734  	case arrow.DecimalType:
   735  		byteWidth = pqarrow.DecimalSize(typ.GetPrecision())
   736  	case *arrow.Float16Type:
   737  		byteWidth = int32(typ.Bytes())
   738  	case *arrow.DictionaryType:
   739  		valuesType := typ.ValueType
   740  		switch dt := valuesType.(type) {
   741  		case *arrow.FixedSizeBinaryType:
   742  			byteWidth = int32(dt.ByteWidth)
   743  		case arrow.DecimalType:
   744  			byteWidth = pqarrow.DecimalSize(dt.GetPrecision())
   745  		case *arrow.Float16Type:
   746  			byteWidth = int32(dt.Bytes())
   747  		}
   748  	}
   749  
   750  	pnode, _ := schema.NewPrimitiveNodeLogical("column1", rep, getLogicalType(typ), getPhysicalType(typ), int(byteWidth), -1)
   751  	return schema.MustGroup(schema.NewGroupNode("schema", parquet.Repetitions.Required, schema.FieldList{pnode}, -1))
   752  }
   753  
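        // makePrimitiveTestCol builds an array of length size filled with the constant test value
        // for the given primitive type; unsupported types return nil.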
   754  func (ps *ParquetIOTestSuite) makePrimitiveTestCol(mem memory.Allocator, size int, typ arrow.DataType) arrow.Array {
   755  	switch typ.ID() {
   756  	case arrow.BOOL:
   757  		bldr := array.NewBooleanBuilder(mem)
   758  		defer bldr.Release()
   759  		for i := 0; i < size; i++ {
   760  			bldr.Append(boolTestValue)
   761  		}
   762  		return bldr.NewArray()
   763  	case arrow.INT8:
   764  		bldr := array.NewInt8Builder(mem)
   765  		defer bldr.Release()
   766  		for i := 0; i < size; i++ {
   767  			bldr.Append(int8TestVal)
   768  		}
   769  		return bldr.NewArray()
   770  	case arrow.UINT8:
   771  		bldr := array.NewUint8Builder(mem)
   772  		defer bldr.Release()
   773  		for i := 0; i < size; i++ {
   774  			bldr.Append(uint8TestVal)
   775  		}
   776  		return bldr.NewArray()
   777  	case arrow.INT16:
   778  		bldr := array.NewInt16Builder(mem)
   779  		defer bldr.Release()
   780  		for i := 0; i < size; i++ {
   781  			bldr.Append(int16TestVal)
   782  		}
   783  		return bldr.NewArray()
   784  	case arrow.UINT16:
   785  		bldr := array.NewUint16Builder(mem)
   786  		defer bldr.Release()
   787  		for i := 0; i < size; i++ {
   788  			bldr.Append(uint16TestVal)
   789  		}
   790  		return bldr.NewArray()
   791  	case arrow.INT32:
   792  		bldr := array.NewInt32Builder(mem)
   793  		defer bldr.Release()
   794  		for i := 0; i < size; i++ {
   795  			bldr.Append(int32TestVal)
   796  		}
   797  		return bldr.NewArray()
   798  	case arrow.UINT32:
   799  		bldr := array.NewUint32Builder(mem)
   800  		defer bldr.Release()
   801  		for i := 0; i < size; i++ {
   802  			bldr.Append(uint32TestVal)
   803  		}
   804  		return bldr.NewArray()
   805  	case arrow.INT64:
   806  		bldr := array.NewInt64Builder(mem)
   807  		defer bldr.Release()
   808  		for i := 0; i < size; i++ {
   809  			bldr.Append(int64TestVal)
   810  		}
   811  		return bldr.NewArray()
   812  	case arrow.UINT64:
   813  		bldr := array.NewUint64Builder(mem)
   814  		defer bldr.Release()
   815  		for i := 0; i < size; i++ {
   816  			bldr.Append(uint64TestVal)
   817  		}
   818  		return bldr.NewArray()
   819  	case arrow.FLOAT32:
   820  		bldr := array.NewFloat32Builder(mem)
   821  		defer bldr.Release()
   822  		for i := 0; i < size; i++ {
   823  			bldr.Append(floatTestVal)
   824  		}
   825  		return bldr.NewArray()
   826  	case arrow.FLOAT64:
   827  		bldr := array.NewFloat64Builder(mem)
   828  		defer bldr.Release()
   829  		for i := 0; i < size; i++ {
   830  			bldr.Append(doubleTestVal)
   831  		}
   832  		return bldr.NewArray()
   833  	}
   834  	return nil
   835  }
   836  
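        // makeTestFile writes arr as a single required column split across numChunks row groups
        // and returns the serialized Parquet file bytes.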
   837  func (ps *ParquetIOTestSuite) makeTestFile(mem memory.Allocator, typ arrow.DataType, arr arrow.Array, numChunks int) []byte {
   838  	sc := ps.makeSimpleSchema(typ, parquet.Repetitions.Required)
   839  	sink := encoding.NewBufferWriter(0, mem)
   840  	defer sink.Release()
   841  	writer := file.NewParquetWriter(sink, sc, file.WithWriterProps(parquet.NewWriterProperties(parquet.WithAllocator(mem))))
   842  
   843  	props := pqarrow.NewArrowWriterProperties(pqarrow.WithAllocator(mem))
   844  	ctx := pqarrow.NewArrowWriteContext(context.TODO(), &props)
   845  	rowGroupSize := arr.Len() / numChunks
   846  
   847  	for i := 0; i < numChunks; i++ {
   848  		rgw := writer.AppendRowGroup()
   849  		cw, err := rgw.NextColumn()
   850  		ps.NoError(err)
   851  
   852  		start := i * rowGroupSize
   853  		slc := array.NewSlice(arr, int64(start), int64(start+rowGroupSize))
   854  		defer slc.Release()
   855  		ps.NoError(pqarrow.WriteArrowToColumn(ctx, cw, slc, nil, nil, false))
   856  		ps.NoError(cw.Close())
   857  		ps.NoError(rgw.Close())
   858  	}
   859  	ps.NoError(writer.Close())
   860  	buf := sink.Finish()
   861  	defer buf.Release()
   862  	return buf.Bytes()
   863  }
   864  
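        // createReader opens the given Parquet file bytes with a pqarrow.FileReader using the
        // provided allocator.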
   865  func (ps *ParquetIOTestSuite) createReader(mem memory.Allocator, data []byte) *pqarrow.FileReader {
   866  	rdr, err := file.NewParquetReader(bytes.NewReader(data), file.WithReadProps(parquet.NewReaderProperties(mem)))
   867  	ps.NoError(err)
   868  
   869  	reader, err := pqarrow.NewFileReader(rdr, pqarrow.ArrowReadProperties{}, mem)
   870  	ps.NoError(err)
   871  	return reader
   872  }
   873  
   874  func (ps *ParquetIOTestSuite) readTable(rdr *pqarrow.FileReader) arrow.Table {
   875  	tbl, err := rdr.ReadTable(context.TODO())
   876  	ps.NoError(err)
   877  	ps.NotNil(tbl)
   878  	return tbl
   879  }
   880  
   881  func (ps *ParquetIOTestSuite) checkSingleColumnRequiredTableRead(mem memory.Allocator, typ arrow.DataType, numChunks int) {
   882  	values := ps.makePrimitiveTestCol(mem, smallSize, typ)
   883  	defer values.Release()
   884  
   885  	data := ps.makeTestFile(mem, typ, values, numChunks)
   886  	reader := ps.createReader(mem, data)
   887  
   888  	tbl := ps.readTable(reader)
   889  	defer tbl.Release()
   890  
   891  	ps.EqualValues(1, tbl.NumCols())
   892  	ps.EqualValues(smallSize, tbl.NumRows())
   893  
   894  	chunked := tbl.Column(0).Data()
   895  	ps.Len(chunked.Chunks(), 1)
   896  	ps.True(array.Equal(values, chunked.Chunk(0)))
   897  }
   898  
   899  func (ps *ParquetIOTestSuite) checkSingleColumnRead(mem memory.Allocator, typ arrow.DataType, numChunks int) {
   900  	values := ps.makePrimitiveTestCol(mem, smallSize, typ)
   901  	defer values.Release()
   902  
   903  	data := ps.makeTestFile(mem, typ, values, numChunks)
   904  	reader := ps.createReader(mem, data)
   905  
   906  	cr, err := reader.GetColumn(context.TODO(), 0)
   907  	ps.NoError(err)
   908  	defer cr.Release()
   909  
   910  	chunked, err := cr.NextBatch(smallSize)
   911  	ps.NoError(err)
   912  	defer chunked.Release()
   913  
   914  	ps.Len(chunked.Chunks(), 1)
   915  	ps.True(array.Equal(values, chunked.Chunk(0)))
   916  }
   917  
   918  func (ps *ParquetIOTestSuite) TestDateTimeTypesReadWriteTable() {
   919  	mem := memory.NewCheckedAllocator(memory.DefaultAllocator)
   920  	defer mem.AssertSize(ps.T(), 0)
   921  
   922  	toWrite := makeDateTimeTypesTable(mem, false, true)
   923  	defer toWrite.Release()
   924  	buf := writeTableToBuffer(ps.T(), mem, toWrite, toWrite.NumRows(), pqarrow.NewArrowWriterProperties(pqarrow.WithAllocator(mem)))
   925  	defer buf.Release()
   926  
   927  	reader := ps.createReader(mem, buf.Bytes())
   928  	tbl := ps.readTable(reader)
   929  	defer tbl.Release()
   930  
   931  	expected := makeDateTimeTypesTable(mem, true, true)
   932  	defer expected.Release()
   933  
   934  	ps.Equal(expected.NumCols(), tbl.NumCols())
   935  	ps.Equal(expected.NumRows(), tbl.NumRows())
   936  	ps.Truef(expected.Schema().Equal(tbl.Schema()), "expected schema: %s\ngot schema: %s", expected.Schema(), tbl.Schema())
   937  
   938  	for i := 0; i < int(expected.NumCols()); i++ {
   939  		exChunk := expected.Column(i).Data()
   940  		tblChunk := tbl.Column(i).Data()
   941  
   942  		ps.Equal(len(exChunk.Chunks()), len(tblChunk.Chunks()))
   943  		ps.Truef(array.Equal(exChunk.Chunk(0), tblChunk.Chunk(0)), "expected %s\ngot %s", exChunk.Chunk(0), tblChunk.Chunk(0))
   944  	}
   945  }
   946  
   947  func (ps *ParquetIOTestSuite) TestDateTimeTypesWithInt96ReadWriteTable() {
   948  	mem := memory.NewCheckedAllocator(memory.DefaultAllocator)
   949  	defer mem.AssertSize(ps.T(), 0)
   950  
   951  	expected := makeDateTimeTypesTable(mem, false, true)
   952  	defer expected.Release()
   953  	buf := writeTableToBuffer(ps.T(), mem, expected, expected.NumRows(), pqarrow.NewArrowWriterProperties(pqarrow.WithDeprecatedInt96Timestamps(true)))
   954  	defer buf.Release()
   955  
   956  	reader := ps.createReader(mem, buf.Bytes())
   957  	tbl := ps.readTable(reader)
   958  	defer tbl.Release()
   959  
   960  	ps.Equal(expected.NumCols(), tbl.NumCols())
   961  	ps.Equal(expected.NumRows(), tbl.NumRows())
   962  	ps.Truef(expected.Schema().Equal(tbl.Schema()), "expected schema: %s\ngot schema: %s", expected.Schema(), tbl.Schema())
   963  
   964  	for i := 0; i < int(expected.NumCols()); i++ {
   965  		exChunk := expected.Column(i).Data()
   966  		tblChunk := tbl.Column(i).Data()
   967  
   968  		ps.Equal(len(exChunk.Chunks()), len(tblChunk.Chunks()))
   969  		ps.Truef(array.Equal(exChunk.Chunk(0), tblChunk.Chunk(0)), "expected %s\ngot %s", exChunk.Chunk(0), tblChunk.Chunk(0))
   970  	}
   971  }
   972  
   973  func (ps *ParquetIOTestSuite) TestDate64ReadWriteTable() {
   974  	mem := memory.NewCheckedAllocator(memory.DefaultAllocator)
   975  	defer mem.AssertSize(ps.T(), 0)
   976  
   977  	date64InputTable := makeDateTypeTable(mem, false, false)
   978  	defer date64InputTable.Release()
   979  	buf := writeTableToBuffer(ps.T(), mem, date64InputTable, date64InputTable.NumRows(), pqarrow.NewArrowWriterProperties(pqarrow.WithAllocator(mem)))
   980  	defer buf.Release()
   981  
   982  	reader := ps.createReader(mem, buf.Bytes())
   983  	roundTripOutputTable := ps.readTable(reader)
   984  	defer roundTripOutputTable.Release()
   985  
   986  	date32ExpectedOutputTable := makeDateTypeTable(mem, true, false)
   987  	defer date32ExpectedOutputTable.Release()
   988  
   989  	ps.Truef(array.TableEqual(date32ExpectedOutputTable, roundTripOutputTable), "expected table: %s\ngot table: %s", date32ExpectedOutputTable, roundTripOutputTable)
   990  }
   991  
   992  func (ps *ParquetIOTestSuite) TestTimestampTZReadWriteTable() {
   993  	mem := memory.NewCheckedAllocator(memory.DefaultAllocator)
   994  	defer mem.AssertSize(ps.T(), 0)
   995  
   996  	inputTable := makeTimestampTypeTable(mem, false)
   997  	defer inputTable.Release()
   998  	buf := writeTableToBuffer(ps.T(), mem, inputTable, inputTable.NumRows(), pqarrow.NewArrowWriterProperties(pqarrow.WithAllocator(mem)))
   999  	defer buf.Release()
  1000  
  1001  	reader := ps.createReader(mem, buf.Bytes())
  1002  	roundTripOutputTable := ps.readTable(reader)
  1003  	defer roundTripOutputTable.Release()
  1004  
  1005  	expectedOutputTable := makeTimestampTypeTable(mem, true)
  1006  	defer expectedOutputTable.Release()
  1007  
  1008  	ps.Truef(array.TableEqual(expectedOutputTable, roundTripOutputTable), "expected table: %s\ngot table: %s", expectedOutputTable, roundTripOutputTable)
  1009  }
  1010  
  1011  func (ps *ParquetIOTestSuite) TestDate64ReadWriteTableWithPartialDays() {
  1012  	mem := memory.NewCheckedAllocator(memory.DefaultAllocator)
  1013  	defer mem.AssertSize(ps.T(), 0)
  1014  
  1015  	date64InputTableNotAlignedToDateBoundary := makeDateTypeTable(mem, false, true)
  1016  	defer date64InputTableNotAlignedToDateBoundary.Release()
  1017  	buf := writeTableToBuffer(ps.T(), mem, date64InputTableNotAlignedToDateBoundary, date64InputTableNotAlignedToDateBoundary.NumRows(), pqarrow.NewArrowWriterProperties(pqarrow.WithAllocator(mem)))
  1018  	defer buf.Release()
  1019  
  1020  	reader := ps.createReader(mem, buf.Bytes())
  1021  	roundTripOutputTable := ps.readTable(reader)
  1022  	defer roundTripOutputTable.Release()
  1023  
  1024  	date32ExpectedOutputTable := makeDateTypeTable(mem, true, true)
  1025  	defer date32ExpectedOutputTable.Release()
  1026  
  1027  	ps.Truef(array.TableEqual(date32ExpectedOutputTable, roundTripOutputTable), "expected table: %s\ngot table: %s", date32ExpectedOutputTable, roundTripOutputTable)
  1028  }
  1029  
  1030  func (ps *ParquetIOTestSuite) TestTimestampTZStoreSchemaReadWriteTable() {
  1031  	mem := memory.NewCheckedAllocator(memory.DefaultAllocator)
  1032  	defer mem.AssertSize(ps.T(), 0)
  1033  
  1034  	inputTable := makeTimestampTypeTable(mem, false)
  1035  	defer inputTable.Release()
  1036  	buf := writeTableToBuffer(ps.T(), mem, inputTable, inputTable.NumRows(), pqarrow.NewArrowWriterProperties(pqarrow.WithAllocator(mem), pqarrow.WithStoreSchema()))
  1037  	defer buf.Release()
  1038  
  1039  	reader := ps.createReader(mem, buf.Bytes())
  1040  	roundTripOutputTable := ps.readTable(reader)
  1041  	defer roundTripOutputTable.Release()
  1042  
  1043  	ps.Truef(array.TableEqual(inputTable, roundTripOutputTable), "expected table: %s\ngot table: %s", inputTable, roundTripOutputTable)
  1044  }
  1045  
  1046  func (ps *ParquetIOTestSuite) TestLargeBinaryReadWriteTable() {
  1047  	mem := memory.NewCheckedAllocator(memory.DefaultAllocator)
  1048  	defer mem.AssertSize(ps.T(), 0)
  1049  
  1050  	// LargeString and LargeBinary are written as Parquet ByteArray columns. Because
  1051  	// roundTripTable is called with storeSchema=true, the original large types are restored on read.
  1052  	lsBldr := array.NewLargeStringBuilder(mem)
  1053  	defer lsBldr.Release()
  1054  	lbBldr := array.NewBinaryBuilder(mem, arrow.BinaryTypes.LargeBinary)
  1055  	defer lbBldr.Release()
  1056  
  1057  	for i := 0; i < smallSize; i++ {
  1058  		s := strconv.FormatInt(int64(i), 10)
  1059  		lsBldr.Append(s)
  1060  		lbBldr.Append([]byte(s))
  1061  	}
  1062  
  1063  	lsValues := lsBldr.NewArray()
  1064  	defer lsValues.Release()
  1065  	lbValues := lbBldr.NewArray()
  1066  	defer lbValues.Release()
  1067  
  1068  	lsField := arrow.Field{Name: "large_string", Type: arrow.BinaryTypes.LargeString, Nullable: true}
  1069  	lbField := arrow.Field{Name: "large_binary", Type: arrow.BinaryTypes.LargeBinary, Nullable: true}
  1070  	expected := array.NewTable(
  1071  		arrow.NewSchema([]arrow.Field{lsField, lbField}, nil),
  1072  		[]arrow.Column{
  1073  			*arrow.NewColumn(lsField, arrow.NewChunked(lsField.Type, []arrow.Array{lsValues})),
  1074  			*arrow.NewColumn(lbField, arrow.NewChunked(lbField.Type, []arrow.Array{lbValues})),
  1075  		},
  1076  		-1,
  1077  	)
  1078  	defer lsValues.Release() // NewChunked
  1079  	defer lbValues.Release() // NewChunked
  1080  	defer expected.Release()
  1081  	ps.roundTripTable(mem, expected, true)
  1082  }
  1083  
  1084  func (ps *ParquetIOTestSuite) TestReadSingleColumnFile() {
  1085  	types := []arrow.DataType{
  1086  		arrow.FixedWidthTypes.Boolean,
  1087  		arrow.PrimitiveTypes.Uint8,
  1088  		arrow.PrimitiveTypes.Int8,
  1089  		arrow.PrimitiveTypes.Uint16,
  1090  		arrow.PrimitiveTypes.Int16,
  1091  		arrow.PrimitiveTypes.Uint32,
  1092  		arrow.PrimitiveTypes.Int32,
  1093  		arrow.PrimitiveTypes.Uint64,
  1094  		arrow.PrimitiveTypes.Int64,
  1095  		arrow.PrimitiveTypes.Float32,
  1096  		arrow.PrimitiveTypes.Float64,
  1097  	}
  1098  
  1099  	nchunks := []int{1, 4}
  1100  
  1101  	for _, n := range nchunks {
  1102  		for _, dt := range types {
  1103  			ps.Run(fmt.Sprintf("%s %d chunks", dt.Name(), n), func() {
  1104  				mem := memory.NewCheckedAllocator(memory.DefaultAllocator)
  1105  				defer mem.AssertSize(ps.T(), 0)
  1106  				ps.checkSingleColumnRead(mem, dt, n)
  1107  			})
  1108  		}
  1109  	}
  1110  }
  1111  
  1112  func (ps *ParquetIOTestSuite) TestSingleColumnRequiredRead() {
  1113  	types := []arrow.DataType{
  1114  		arrow.FixedWidthTypes.Boolean,
  1115  		arrow.PrimitiveTypes.Uint8,
  1116  		arrow.PrimitiveTypes.Int8,
  1117  		arrow.PrimitiveTypes.Uint16,
  1118  		arrow.PrimitiveTypes.Int16,
  1119  		arrow.PrimitiveTypes.Uint32,
  1120  		arrow.PrimitiveTypes.Int32,
  1121  		arrow.PrimitiveTypes.Uint64,
  1122  		arrow.PrimitiveTypes.Int64,
  1123  		arrow.PrimitiveTypes.Float32,
  1124  		arrow.PrimitiveTypes.Float64,
  1125  	}
  1126  
  1127  	nchunks := []int{1, 4}
  1128  
  1129  	for _, n := range nchunks {
  1130  		for _, dt := range types {
  1131  			ps.Run(fmt.Sprintf("%s %d chunks", dt.Name(), n), func() {
  1132  				mem := memory.NewCheckedAllocator(memory.DefaultAllocator)
  1133  				defer mem.AssertSize(ps.T(), 0)
  1134  
  1135  				ps.checkSingleColumnRequiredTableRead(mem, dt, n)
  1136  			})
  1137  		}
  1138  	}
  1139  }
  1140  
  1141  func (ps *ParquetIOTestSuite) TestReadDecimals() {
  1142  	mem := memory.NewCheckedAllocator(memory.DefaultAllocator)
  1143  	defer mem.AssertSize(ps.T(), 0)
  1144  
  1145  	bigEndian := []parquet.ByteArray{
  1146  		// 123456
  1147  		[]byte{1, 226, 64},
  1148  		// 987654
  1149  		[]byte{15, 18, 6},
  1150  		// -123456
  1151  		[]byte{255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 254, 29, 192},
  1152  	}
  1153  
  1154  	bldr := array.NewDecimal128Builder(mem, &arrow.Decimal128Type{Precision: 6, Scale: 3})
  1155  	defer bldr.Release()
  1156  
  1157  	bldr.Append(decimal128.FromU64(123456))
  1158  	bldr.Append(decimal128.FromU64(987654))
  1159  	bldr.Append(decimal128.FromI64(-123456))
  1160  
  1161  	expected := bldr.NewDecimal128Array()
  1162  	defer expected.Release()
  1163  
  1164  	sc := schema.MustGroup(schema.NewGroupNode("schema", parquet.Repetitions.Required, schema.FieldList{
  1165  		schema.Must(schema.NewPrimitiveNodeLogical("decimals", parquet.Repetitions.Required, schema.NewDecimalLogicalType(6, 3), parquet.Types.ByteArray, -1, -1)),
  1166  	}, -1))
  1167  
  1168  	sink := encoding.NewBufferWriter(0, mem)
  1169  	defer sink.Release()
  1170  	writer := file.NewParquetWriter(sink, sc)
  1171  
  1172  	rgw := writer.AppendRowGroup()
  1173  	cw, _ := rgw.NextColumn()
  1174  	cw.(*file.ByteArrayColumnChunkWriter).WriteBatch(bigEndian, nil, nil)
  1175  	cw.Close()
  1176  	rgw.Close()
  1177  	writer.Close()
  1178  
  1179  	rdr := ps.createReader(mem, sink.Bytes())
  1180  	cr, err := rdr.GetColumn(context.TODO(), 0)
  1181  	ps.NoError(err)
  1182  
  1183  	chunked, err := cr.NextBatch(smallSize)
  1184  	ps.NoError(err)
  1185  	defer chunked.Release()
  1186  
  1187  	ps.Len(chunked.Chunks(), 1)
  1188  	ps.True(array.Equal(expected, chunked.Chunk(0)))
  1189  }
  1190  
  1191  func (ps *ParquetIOTestSuite) TestReadDecimal256() {
  1192  	mem := memory.NewCheckedAllocator(memory.DefaultAllocator)
  1193  	defer mem.AssertSize(ps.T(), 0)
  1194  
  1195  	bigEndian := []parquet.ByteArray{
  1196  		// 123456
  1197  		[]byte{1, 226, 64},
  1198  		// 987654
  1199  		[]byte{15, 18, 6},
  1200  		// -123456
  1201  		[]byte{255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 254, 29, 192},
  1202  	}
  1203  
  1204  	bldr := array.NewDecimal256Builder(mem, &arrow.Decimal256Type{Precision: 40, Scale: 3})
  1205  	defer bldr.Release()
  1206  
  1207  	bldr.Append(decimal256.FromU64(123456))
  1208  	bldr.Append(decimal256.FromU64(987654))
  1209  	bldr.Append(decimal256.FromI64(-123456))
  1210  
  1211  	expected := bldr.NewDecimal256Array()
  1212  	defer expected.Release()
  1213  
  1214  	sc := schema.MustGroup(schema.NewGroupNode("schema", parquet.Repetitions.Required, schema.FieldList{
  1215  		schema.Must(schema.NewPrimitiveNodeLogical("decimals", parquet.Repetitions.Required, schema.NewDecimalLogicalType(40, 3), parquet.Types.ByteArray, -1, -1)),
  1216  	}, -1))
  1217  
  1218  	sink := encoding.NewBufferWriter(0, mem)
  1219  	defer sink.Release()
  1220  	writer := file.NewParquetWriter(sink, sc)
  1221  
  1222  	rgw := writer.AppendRowGroup()
  1223  	cw, _ := rgw.NextColumn()
  1224  	cw.(*file.ByteArrayColumnChunkWriter).WriteBatch(bigEndian, nil, nil)
  1225  	cw.Close()
  1226  	rgw.Close()
  1227  	writer.Close()
  1228  
  1229  	rdr := ps.createReader(mem, sink.Bytes())
  1230  	cr, err := rdr.GetColumn(context.TODO(), 0)
  1231  	ps.NoError(err)
  1232  
  1233  	chunked, err := cr.NextBatch(smallSize)
  1234  	ps.NoError(err)
  1235  	defer chunked.Release()
  1236  
  1237  	ps.Len(chunked.Chunks(), 1)
  1238  	ps.Truef(array.Equal(expected, chunked.Chunk(0)), "expected: %s\ngot: %s", expected, chunked.Chunk(0))
  1239  }
  1240  
  1241  func (ps *ParquetIOTestSuite) TestReadNestedStruct() {
  1242  	mem := memory.NewCheckedAllocator(memory.DefaultAllocator)
  1243  	defer mem.AssertSize(ps.T(), 0)
  1244  
  1245  	dt := arrow.StructOf(arrow.Field{
  1246  		Name: "nested",
  1247  		Type: arrow.StructOf(
  1248  			arrow.Field{Name: "bool", Type: arrow.FixedWidthTypes.Boolean},
  1249  			arrow.Field{Name: "int32", Type: arrow.PrimitiveTypes.Int32},
  1250  			arrow.Field{Name: "int64", Type: arrow.PrimitiveTypes.Int64},
  1251  		),
  1252  	})
  1253  	field := arrow.Field{Name: "struct", Type: dt, Nullable: true}
  1254  
  1255  	builder := array.NewStructBuilder(mem, dt)
  1256  	defer builder.Release()
  1257  	nested := builder.FieldBuilder(0).(*array.StructBuilder)
  1258  
  1259  	builder.Append(true)
  1260  	nested.Append(true)
  1261  	nested.FieldBuilder(0).(*array.BooleanBuilder).Append(true)
  1262  	nested.FieldBuilder(1).(*array.Int32Builder).Append(int32(-1))
  1263  	nested.FieldBuilder(2).(*array.Int64Builder).Append(int64(-2))
  1264  	builder.AppendNull()
  1265  
  1266  	arr := builder.NewStructArray()
  1267  	defer arr.Release()
  1268  
  1269  	expected := array.NewTable(
  1270  		arrow.NewSchema([]arrow.Field{field}, nil),
  1271  		[]arrow.Column{*arrow.NewColumn(field, arrow.NewChunked(dt, []arrow.Array{arr}))},
  1272  		-1,
  1273  	)
  1274  	defer arr.Release() // NewChunked
  1275  	defer expected.Release()
  1276  	ps.roundTripTable(mem, expected, true)
  1277  }
  1278  
  1279  func (ps *ParquetIOTestSuite) writeColumn(mem memory.Allocator, sc *schema.GroupNode, values arrow.Array) []byte {
  1280  	var buf bytes.Buffer
  1281  	arrsc, err := pqarrow.FromParquet(schema.NewSchema(sc), nil, nil)
  1282  	ps.NoError(err)
  1283  
  1284  	writer, err := pqarrow.NewFileWriter(arrsc, &buf, parquet.NewWriterProperties(parquet.WithDictionaryDefault(false)), pqarrow.NewArrowWriterProperties(pqarrow.WithAllocator(mem)))
  1285  	ps.NoError(err)
  1286  
  1287  	writer.NewRowGroup()
  1288  	ps.NoError(writer.WriteColumnData(values))
  1289  	//defer values.Release()
  1290  	ps.NoError(writer.Close())
  1292  
  1293  	return buf.Bytes()
  1294  }
  1295  
  1296  func (ps *ParquetIOTestSuite) readAndCheckSingleColumnFile(mem memory.Allocator, data []byte, values arrow.Array) {
  1297  	reader := ps.createReader(mem, data)
  1298  	cr, err := reader.GetColumn(context.TODO(), 0)
  1299  	ps.NoError(err)
  1300  	ps.NotNil(cr)
  1301  	defer cr.Release()
  1302  
  1303  	chunked, err := cr.NextBatch(smallSize)
  1304  	ps.NoError(err)
  1305  	defer chunked.Release()
  1306  
  1307  	ps.Len(chunked.Chunks(), 1)
  1308  	ps.NotNil(chunked.Chunk(0))
  1309  
  1310  	ps.True(array.Equal(values, chunked.Chunk(0)))
  1311  }
  1312  
  1313  var fullTypeList = []arrow.DataType{
  1314  	arrow.FixedWidthTypes.Boolean,
  1315  	arrow.PrimitiveTypes.Uint8,
  1316  	arrow.PrimitiveTypes.Int8,
  1317  	arrow.PrimitiveTypes.Uint16,
  1318  	arrow.PrimitiveTypes.Int16,
  1319  	arrow.PrimitiveTypes.Uint32,
  1320  	arrow.PrimitiveTypes.Int32,
  1321  	arrow.PrimitiveTypes.Uint64,
  1322  	arrow.PrimitiveTypes.Int64,
  1323  	arrow.FixedWidthTypes.Date32,
  1324  	arrow.PrimitiveTypes.Float32,
  1325  	arrow.PrimitiveTypes.Float64,
  1326  	arrow.FixedWidthTypes.Float16,
  1327  	arrow.BinaryTypes.String,
  1328  	arrow.BinaryTypes.Binary,
  1329  	&arrow.FixedSizeBinaryType{ByteWidth: 10},
  1330  	&arrow.Decimal128Type{Precision: 1, Scale: 0},
  1331  	&arrow.Decimal128Type{Precision: 5, Scale: 4},
  1332  	&arrow.Decimal128Type{Precision: 10, Scale: 9},
  1333  	&arrow.Decimal128Type{Precision: 19, Scale: 18},
  1334  	&arrow.Decimal128Type{Precision: 23, Scale: 22},
  1335  	&arrow.Decimal128Type{Precision: 27, Scale: 26},
  1336  	&arrow.Decimal128Type{Precision: 38, Scale: 37},
  1337  }
  1338  
  1339  func (ps *ParquetIOTestSuite) TestSingleColumnRequiredWrite() {
  1340  	for _, dt := range fullTypeList {
  1341  		ps.Run(dt.Name(), func() {
  1342  			mem := memory.NewCheckedAllocator(memory.DefaultAllocator)
  1343  			defer mem.AssertSize(ps.T(), 0)
  1344  
  1345  			values := testutils.RandomNonNull(mem, dt, smallSize)
  1346  			defer values.Release()
  1347  			sc := ps.makeSimpleSchema(dt, parquet.Repetitions.Required)
  1348  			data := ps.writeColumn(mem, sc, values)
  1349  			ps.readAndCheckSingleColumnFile(mem, data, values)
  1350  		})
  1351  	}
  1352  }
  1353  
  1354  func (ps *ParquetIOTestSuite) roundTripTable(mem memory.Allocator, expected arrow.Table, storeSchema bool) {
  1355  	var buf bytes.Buffer
  1356  	var props pqarrow.ArrowWriterProperties
  1357  	if storeSchema {
  1358  		props = pqarrow.NewArrowWriterProperties(pqarrow.WithStoreSchema(), pqarrow.WithAllocator(mem))
  1359  	} else {
  1360  		props = pqarrow.NewArrowWriterProperties(pqarrow.WithAllocator(mem))
  1361  	}
  1362  
  1363  	writeProps := parquet.NewWriterProperties(parquet.WithAllocator(mem))
  1364  	ps.Require().NoError(pqarrow.WriteTable(expected, &buf, expected.NumRows(), writeProps, props))
  1365  
  1366  	reader := ps.createReader(mem, buf.Bytes())
  1367  	defer reader.ParquetReader().Close()
  1368  
  1369  	tbl := ps.readTable(reader)
  1370  	defer tbl.Release()
  1371  
  1372  	ps.Equal(expected.NumCols(), tbl.NumCols())
  1373  	ps.Equal(expected.NumRows(), tbl.NumRows())
  1374  
  1375  	exChunk := expected.Column(0).Data()
  1376  	tblChunk := tbl.Column(0).Data()
  1377  
  1378  	ps.Equal(len(exChunk.Chunks()), len(tblChunk.Chunks()))
  1379  	exc := exChunk.Chunk(0)
  1380  	tbc := tblChunk.Chunk(0)
  1381  	ps.Truef(array.ApproxEqual(exc, tbc), "expected: %T %s\ngot: %T %s", exc, exc, tbc, tbc)
  1382  }
  1383  
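        // makeEmptyListsArray returns a list<float32> array with size entries that are
        // all valid but empty: every offset in the offsets buffer is zero.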
  1384  func makeEmptyListsArray(size int) arrow.Array {
  1385  	// allocate an offsets buffer with only zeros
  1386  	offsetsNbytes := arrow.Int32Traits.BytesRequired(size + 1)
  1387  	offsetsBuffer := make([]byte, offsetsNbytes)
  1388  
  1389  	childBuffers := []*memory.Buffer{nil, nil}
  1390  	childData := array.NewData(arrow.PrimitiveTypes.Float32, 0, childBuffers, nil, 0, 0)
  1391  	defer childData.Release()
  1392  	buffers := []*memory.Buffer{nil, memory.NewBufferBytes(offsetsBuffer)}
  1393  	arrayData := array.NewData(arrow.ListOf(childData.DataType()), size, buffers, []arrow.ArrayData{childData}, 0, 0)
  1394  	defer arrayData.Release()
  1395  	return array.MakeFromData(arrayData)
  1396  }
  1397  
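        // makeListArray wraps values in a list array of length size: the first nullcount
        // even-indexed entries are null, the entry at index 1 is an empty list, and the
        // remaining entries split the values evenly between them.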
  1398  func makeListArray(values arrow.Array, size, nullcount int) arrow.Array {
  1399  	nonNullEntries := size - nullcount - 1
  1400  	lengthPerEntry := values.Len() / nonNullEntries
  1401  
  1402  	offsets := make([]byte, arrow.Int32Traits.BytesRequired(size+1))
  1403  	offsetsArr := arrow.Int32Traits.CastFromBytes(offsets)
  1404  
  1405  	nullBitmap := make([]byte, int(bitutil.BytesForBits(int64(size))))
  1406  
  1407  	curOffset := 0
  1408  	for i := 0; i < size; i++ {
  1409  		offsetsArr[i] = int32(curOffset)
  1410  		if !(((i % 2) == 0) && ((i / 2) < nullcount)) {
  1411  			// non-null list (list with index 1 is always empty)
  1412  			bitutil.SetBit(nullBitmap, i)
  1413  			if i != 1 {
  1414  				curOffset += lengthPerEntry
  1415  			}
  1416  		}
  1417  	}
  1418  	offsetsArr[size] = int32(values.Len())
  1419  
  1420  	listData := array.NewData(arrow.ListOf(values.DataType()), size,
  1421  		[]*memory.Buffer{memory.NewBufferBytes(nullBitmap), memory.NewBufferBytes(offsets)},
  1422  		[]arrow.ArrayData{values.Data()}, nullcount, 0)
  1423  	defer listData.Release()
  1424  	return array.NewListData(listData)
  1425  }
  1426  
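        // prepareEmptyListsTable builds a single-column table whose lists are all empty.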
  1427  func prepareEmptyListsTable(size int) arrow.Table {
  1428  	lists := makeEmptyListsArray(size)
  1429  	defer lists.Release()
  1430  	chunked := arrow.NewChunked(lists.DataType(), []arrow.Array{lists})
  1431  	defer chunked.Release()
  1432  	return makeSimpleTable(chunked, true)
  1433  }
  1434  
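        // prepareListTable builds a single-column table of list<dt> values; the value
        // array and the resulting chunked column are both sliced so non-zero offsets are
        // exercised, and null counts are dropped for levels that are not nullable.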
  1435  func prepareListTable(dt arrow.DataType, size int, nullableLists bool, nullableElems bool, nullCount int) arrow.Table {
  1436  	nc := nullCount
  1437  	if !nullableElems {
  1438  		nc = 0
  1439  	}
  1440  	values := testutils.RandomNullable(dt, size*size, nc)
  1441  	defer values.Release()
  1442  	// also test that slice offsets are respected
  1443  	values = array.NewSlice(values, 5, int64(values.Len()))
  1444  	defer values.Release()
  1445  
  1446  	if !nullableLists {
  1447  		nullCount = 0
  1448  	}
  1449  	lists := makeListArray(values, size, nullCount)
  1450  	defer lists.Release()
  1451  
  1452  	chunked := arrow.NewChunked(lists.DataType(), []arrow.Array{lists})
  1453  	defer chunked.Release()
  1454  
  1455  	return makeSimpleTable(array.NewChunkedSlice(chunked, 3, int64(size)), nullableLists)
  1456  }
  1457  
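        // prepareListOfListTable builds a single-column table of list<list<dt>> values,
        // applying nullCount at each nesting level only where that level is nullable.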
  1458  func prepareListOfListTable(dt arrow.DataType, size, nullCount int, nullableParentLists, nullableLists, nullableElems bool) arrow.Table {
  1459  	nc := nullCount
  1460  	if !nullableElems {
  1461  		nc = 0
  1462  	}
  1463  
  1464  	values := testutils.RandomNullable(dt, size*6, nc)
  1465  	defer values.Release()
  1466  
  1467  	if nullableLists {
  1468  		nc = nullCount
  1469  	} else {
  1470  		nc = 0
  1471  	}
  1472  
  1473  	lists := makeListArray(values, size*3, nc)
  1474  	defer lists.Release()
  1475  
  1476  	if !nullableParentLists {
  1477  		nullCount = 0
  1478  	}
  1479  
  1480  	parentLists := makeListArray(lists, size, nullCount)
  1481  	defer parentLists.Release()
  1482  
  1483  	chunked := arrow.NewChunked(parentLists.DataType(), []arrow.Array{parentLists})
  1484  	defer chunked.Release()
  1485  
  1486  	return makeSimpleTable(chunked, nullableParentLists)
  1487  }
  1488  
  1489  func (ps *ParquetIOTestSuite) TestSingleEmptyListsColumnReadWrite() {
  1490  	mem := memory.NewCheckedAllocator(memory.DefaultAllocator)
  1491  	defer mem.AssertSize(ps.T(), 0)
  1492  
  1493  	expected := prepareEmptyListsTable(smallSize)
  1494  	defer expected.Release()
  1495  	buf := writeTableToBuffer(ps.T(), mem, expected, smallSize, pqarrow.NewArrowWriterProperties(pqarrow.WithAllocator(mem)))
  1496  	defer buf.Release()
  1497  
  1498  	reader := ps.createReader(mem, buf.Bytes())
  1499  	tbl := ps.readTable(reader)
  1500  	defer tbl.Release()
  1501  
  1502  	ps.EqualValues(expected.NumCols(), tbl.NumCols())
  1503  	ps.EqualValues(expected.NumRows(), tbl.NumRows())
  1504  
  1505  	exChunk := expected.Column(0).Data()
  1506  	tblChunk := tbl.Column(0).Data()
  1507  
  1508  	ps.Equal(len(exChunk.Chunks()), len(tblChunk.Chunks()))
  1509  	ps.True(array.Equal(exChunk.Chunk(0), tblChunk.Chunk(0)))
  1510  }
  1511  
  1512  func (ps *ParquetIOTestSuite) TestSingleColumnOptionalReadWrite() {
  1513  	for _, dt := range fullTypeList {
  1514  		ps.Run(dt.Name(), func() {
  1515  			mem := memory.NewCheckedAllocator(memory.DefaultAllocator)
  1516  			defer mem.AssertSize(ps.T(), 0)
  1517  
  1518  			values := testutils.RandomNullable(dt, smallSize, 10)
  1519  			defer values.Release()
  1520  			sc := ps.makeSimpleSchema(dt, parquet.Repetitions.Optional)
  1521  			data := ps.writeColumn(mem, sc, values)
  1522  			ps.readAndCheckSingleColumnFile(mem, data, values)
  1523  		})
  1524  	}
  1525  }
  1526  
  1527  func (ps *ParquetIOTestSuite) TestSingleNullableListNullableColumnReadWrite() {
  1528  	for _, dt := range fullTypeList {
  1529  		ps.Run(dt.Name(), func() {
  1530  			expected := prepareListTable(dt, smallSize, true, true, 10)
  1531  			defer expected.Release()
  1532  			ps.roundTripTable(memory.DefaultAllocator, expected, false)
  1533  		})
  1534  	}
  1535  }
  1536  
  1537  func (ps *ParquetIOTestSuite) TestSingleRequiredListNullableColumnReadWrite() {
  1538  	for _, dt := range fullTypeList {
  1539  		ps.Run(dt.Name(), func() {
  1540  			expected := prepareListTable(dt, smallSize, false, true, 10)
  1541  			defer expected.Release()
  1542  			ps.roundTripTable(memory.DefaultAllocator, expected, false)
  1543  		})
  1544  	}
  1545  }
  1546  
  1547  func (ps *ParquetIOTestSuite) TestSingleNullableListRequiredColumnReadWrite() {
  1548  	for _, dt := range fullTypeList {
  1549  		ps.Run(dt.Name(), func() {
  1550  			expected := prepareListTable(dt, smallSize, true, false, 10)
  1551  			defer expected.Release()
  1552  			ps.roundTripTable(memory.DefaultAllocator, expected, false)
  1553  		})
  1554  	}
  1555  }
  1556  
  1557  func (ps *ParquetIOTestSuite) TestSingleRequiredListRequiredColumnReadWrite() {
  1558  	for _, dt := range fullTypeList {
  1559  		ps.Run(dt.Name(), func() {
  1560  			expected := prepareListTable(dt, smallSize, false, false, 0)
  1561  			defer expected.Release()
  1562  			ps.roundTripTable(memory.DefaultAllocator, expected, false)
  1563  		})
  1564  	}
  1565  }
  1566  
  1567  func (ps *ParquetIOTestSuite) TestSingleNullableListRequiredListRequiredColumnReadWrite() {
  1568  	for _, dt := range fullTypeList {
  1569  		ps.Run(dt.Name(), func() {
  1570  			expected := prepareListOfListTable(dt, smallSize, 2, true, false, false)
  1571  			defer expected.Release()
  1572  			ps.roundTripTable(memory.DefaultAllocator, expected, false)
  1573  		})
  1574  	}
  1575  }
  1576  
  1577  func (ps *ParquetIOTestSuite) TestSimpleStruct() {
  1578  	mem := memory.NewCheckedAllocator(memory.DefaultAllocator)
  1579  	defer mem.AssertSize(ps.T(), 0)
  1580  
  1581  	links := arrow.StructOf(arrow.Field{Name: "Backward", Type: arrow.PrimitiveTypes.Int64, Nullable: true},
  1582  		arrow.Field{Name: "Forward", Type: arrow.PrimitiveTypes.Int64, Nullable: true})
  1583  
  1584  	bldr := array.NewStructBuilder(mem, links)
  1585  	defer bldr.Release()
  1586  
  1587  	backBldr := bldr.FieldBuilder(0).(*array.Int64Builder)
  1588  	forwardBldr := bldr.FieldBuilder(1).(*array.Int64Builder)
  1589  
  1590  	bldr.Append(true)
  1591  	backBldr.AppendNull()
  1592  	forwardBldr.Append(20)
  1593  
  1594  	bldr.Append(true)
  1595  	backBldr.Append(10)
  1596  	forwardBldr.Append(40)
  1597  
  1598  	data := bldr.NewArray()
  1599  	defer data.Release()
  1600  
  1601  	tbl := array.NewTable(arrow.NewSchema([]arrow.Field{{Name: "links", Type: links}}, nil),
  1602  		[]arrow.Column{*arrow.NewColumn(arrow.Field{Name: "links", Type: links}, arrow.NewChunked(links, []arrow.Array{data}))}, -1)
  1603  	defer data.Release() // NewChunked
  1604  	defer tbl.Release()
  1605  
  1606  	ps.roundTripTable(mem, tbl, false)
  1607  }
  1608  
  1609  func (ps *ParquetIOTestSuite) TestSingleColumnNullableStruct() {
  1610  	mem := memory.NewCheckedAllocator(memory.DefaultAllocator)
  1611  	defer mem.AssertSize(ps.T(), 0)
  1612  
  1613  	links := arrow.StructOf(arrow.Field{Name: "Backward", Type: arrow.PrimitiveTypes.Int64, Nullable: true})
  1614  	bldr := array.NewStructBuilder(mem, links)
  1615  	defer bldr.Release()
  1616  
  1617  	backBldr := bldr.FieldBuilder(0).(*array.Int64Builder)
  1618  
  1619  	bldr.AppendNull()
  1620  	bldr.Append(true)
  1621  	backBldr.Append(10)
  1622  
  1623  	data := bldr.NewArray()
  1624  	defer data.Release()
  1625  
  1626  	tbl := array.NewTable(arrow.NewSchema([]arrow.Field{{Name: "links", Type: links, Nullable: true}}, nil),
  1627  		[]arrow.Column{*arrow.NewColumn(arrow.Field{Name: "links", Type: links, Nullable: true}, arrow.NewChunked(links, []arrow.Array{data}))}, -1)
  1628  	defer data.Release() // NewChunked
  1629  	defer tbl.Release()
  1630  
  1631  	ps.roundTripTable(mem, tbl, false)
  1632  }
  1633  
  1634  func (ps *ParquetIOTestSuite) TestNestedRequiredFieldStruct() {
  1635  	mem := memory.NewCheckedAllocator(memory.DefaultAllocator)
  1636  	defer mem.AssertSize(ps.T(), 0)
  1637  
  1638  	intField := arrow.Field{Name: "int_array", Type: arrow.PrimitiveTypes.Int32}
  1639  	intBldr := array.NewInt32Builder(mem)
  1640  	defer intBldr.Release()
  1641  	intBldr.AppendValues([]int32{0, 1, 2, 3, 4, 5, 7, 8}, nil)
  1642  
  1643  	intArr := intBldr.NewArray()
  1644  	defer intArr.Release()
  1645  
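        	// validity bitmap 0xCC = 0b11001100: rows 2, 3, 6 and 7 are valid, the remaining 4 rows are null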
  1646  	validity := memory.NewBufferBytes([]byte{0xCC})
  1647  	defer validity.Release()
  1648  
  1649  	structField := arrow.Field{Name: "root", Type: arrow.StructOf(intField), Nullable: true}
  1650  	structData := array.NewData(structField.Type, 8, []*memory.Buffer{validity}, []arrow.ArrayData{intArr.Data()}, 4, 0)
  1651  	defer structData.Release()
  1652  	stData := array.NewStructData(structData)
  1653  	defer stData.Release()
  1654  
  1655  	tbl := array.NewTable(arrow.NewSchema([]arrow.Field{structField}, nil),
  1656  		[]arrow.Column{*arrow.NewColumn(structField,
  1657  			arrow.NewChunked(structField.Type, []arrow.Array{stData}))}, -1)
  1658  	defer stData.Release() // NewChunked
  1659  	defer tbl.Release()
  1660  
  1661  	ps.roundTripTable(mem, tbl, false)
  1662  }
  1663  
  1664  func (ps *ParquetIOTestSuite) TestNestedNullableField() {
  1665  	mem := memory.NewCheckedAllocator(memory.DefaultAllocator)
  1666  	defer mem.AssertSize(ps.T(), 0)
  1667  
  1668  	intField := arrow.Field{Name: "int_array", Type: arrow.PrimitiveTypes.Int32, Nullable: true}
  1669  	intBldr := array.NewInt32Builder(mem)
  1670  	defer intBldr.Release()
  1671  	intBldr.AppendValues([]int32{0, 1, 2, 3, 4, 5, 7, 8}, []bool{true, false, true, false, true, true, false, true})
  1672  
  1673  	intArr := intBldr.NewArray()
  1674  	defer intArr.Release()
  1675  
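        	// same 0xCC validity layout as in TestNestedRequiredFieldStruct: 4 valid rows, 4 nulls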
  1676  	validity := memory.NewBufferBytes([]byte{0xCC})
  1677  	defer validity.Release()
  1678  
  1679  	structField := arrow.Field{Name: "root", Type: arrow.StructOf(intField), Nullable: true}
  1680  	data := array.NewData(structField.Type, 8, []*memory.Buffer{validity}, []arrow.ArrayData{intArr.Data()}, 4, 0)
  1681  	defer data.Release()
  1682  	stData := array.NewStructData(data)
  1683  	defer stData.Release()
  1684  
  1685  	tbl := array.NewTable(arrow.NewSchema([]arrow.Field{structField}, nil),
  1686  		[]arrow.Column{*arrow.NewColumn(structField,
  1687  			arrow.NewChunked(structField.Type, []arrow.Array{stData}))}, -1)
  1688  	defer stData.Release() // NewChunked
  1689  	defer tbl.Release()
  1690  
  1691  	ps.roundTripTable(mem, tbl, false)
  1692  }
  1693  
  1694  func (ps *ParquetIOTestSuite) TestNestedEmptyList() {
  1695  	mem := memory.NewCheckedAllocator(memory.DefaultAllocator)
  1696  	defer mem.AssertSize(ps.T(), 0)
  1697  
  1698  	bldr := array.NewStructBuilder(mem, arrow.StructOf(
  1699  		arrow.Field{
  1700  			Name: "root",
  1701  			Type: arrow.StructOf(
  1702  				arrow.Field{
  1703  					Name: "child1",
  1704  					Type: arrow.ListOf(arrow.StructOf(
  1705  						arrow.Field{
  1706  							Name: "child2",
  1707  							Type: arrow.ListOf(arrow.StructOf(
  1708  								arrow.Field{
  1709  									Name: "name",
  1710  									Type: arrow.BinaryTypes.String,
  1711  								},
  1712  							)),
  1713  						},
  1714  					)),
  1715  				},
  1716  			),
  1717  		},
  1718  	))
  1719  	defer bldr.Release()
  1720  
  1721  	rootBldr := bldr.FieldBuilder(0).(*array.StructBuilder)
  1722  	child1Bldr := rootBldr.FieldBuilder(0).(*array.ListBuilder)
  1723  	child1ElBldr := child1Bldr.ValueBuilder().(*array.StructBuilder)
  1724  	child2Bldr := child1ElBldr.FieldBuilder(0).(*array.ListBuilder)
  1725  	leafBldr := child2Bldr.ValueBuilder().(*array.StructBuilder)
  1726  	nameBldr := leafBldr.FieldBuilder(0).(*array.StringBuilder)
  1727  
  1728  	// build the following target structure 8 times:
  1729  	// {
  1730  	//   "root": {
  1731  	//     "child1": [
  1732  	//       { "child2": [{ "name": "foo" }] },
  1733  	//       { "child2": [] }
  1734  	//     ]
  1735  	//   }
  1736  	// }
  1737  
  1738  	for i := 0; i < 8; i++ {
  1739  		bldr.Append(true)
  1740  		rootBldr.Append(true)
  1741  		child1Bldr.Append(true)
  1742  
  1743  		child1ElBldr.Append(true)
  1744  		child2Bldr.Append(true)
  1745  		leafBldr.Append(true)
  1746  		nameBldr.Append("foo")
  1747  
  1748  		child1ElBldr.Append(true)
  1749  		child2Bldr.Append(true)
  1750  	}
  1751  
  1752  	arr := bldr.NewArray()
  1753  	defer arr.Release()
  1754  
  1755  	field := arrow.Field{Name: "x", Type: arr.DataType(), Nullable: true}
  1756  	expected := array.NewTableFromSlice(arrow.NewSchema([]arrow.Field{field}, nil), [][]arrow.Array{{arr}})
  1757  	defer expected.Release()
  1758  
  1759  	ps.roundTripTable(mem, expected, false)
  1760  }
  1761  
  1762  func (ps *ParquetIOTestSuite) TestCanonicalNestedRoundTrip() {
  1763  	mem := memory.NewCheckedAllocator(memory.DefaultAllocator)
  1764  	defer mem.AssertSize(ps.T(), 0)
  1765  
  1766  	docIdField := arrow.Field{Name: "DocID", Type: arrow.PrimitiveTypes.Int64}
  1767  	linksField := arrow.Field{Name: "Links", Type: arrow.StructOf(
  1768  		arrow.Field{Name: "Backward", Type: arrow.ListOf(arrow.PrimitiveTypes.Int64)},
  1769  		arrow.Field{Name: "Forward", Type: arrow.ListOf(arrow.PrimitiveTypes.Int64)},
  1770  	), Nullable: true}
  1771  
  1772  	nameStruct := arrow.StructOf(
  1773  		arrow.Field{Name: "Language", Nullable: true, Type: arrow.ListOf(
  1774  			arrow.StructOf(arrow.Field{Name: "Code", Type: arrow.BinaryTypes.String},
  1775  				arrow.Field{Name: "Country", Type: arrow.BinaryTypes.String, Nullable: true}))},
  1776  		arrow.Field{Name: "Url", Type: arrow.BinaryTypes.String, Nullable: true})
  1777  
  1778  	nameField := arrow.Field{Name: "Name", Type: arrow.ListOf(nameStruct)}
  1779  	sc := arrow.NewSchema([]arrow.Field{docIdField, linksField, nameField}, nil)
  1780  
  1781  	docIDArr, _, err := array.FromJSON(mem, docIdField.Type, strings.NewReader("[10, 20]"))
  1782  	ps.Require().NoError(err)
  1783  	defer docIDArr.Release()
  1784  
  1785  	linksIDArr, _, err := array.FromJSON(mem, linksField.Type, strings.NewReader(`[{"Backward":[], "Forward":[20, 40, 60]}, {"Backward":[10, 30], "Forward": [80]}]`))
  1786  	ps.Require().NoError(err)
  1787  	defer linksIDArr.Release()
  1788  
  1789  	nameArr, _, err := array.FromJSON(mem, nameField.Type, strings.NewReader(`
  1790  			[[{"Language": [{"Code": "en_us", "Country": "us"},
  1791  							{"Code": "en_us", "Country": null}],
  1792  			   "Url": "http://A"},
  1793  			  {"Url": "http://B", "Language": null},
  1794  			  {"Language": [{"Code": "en-gb", "Country": "gb"}], "Url": null}],
  1795  			  [{"Url": "http://C", "Language": null}]]`))
  1796  	ps.Require().NoError(err)
  1797  	defer nameArr.Release()
  1798  
  1799  	expected := array.NewTable(sc, []arrow.Column{
  1800  		*arrow.NewColumn(docIdField, arrow.NewChunked(docIdField.Type, []arrow.Array{docIDArr})),
  1801  		*arrow.NewColumn(linksField, arrow.NewChunked(linksField.Type, []arrow.Array{linksIDArr})),
  1802  		*arrow.NewColumn(nameField, arrow.NewChunked(nameField.Type, []arrow.Array{nameArr})),
  1803  	}, 2)
  1804  	defer docIDArr.Release()   // NewChunked
  1805  	defer linksIDArr.Release() // NewChunked
  1806  	defer nameArr.Release()    // NewChunked
  1807  	defer expected.Release()
  1808  
  1809  	ps.roundTripTable(mem, expected, false)
  1810  }
  1811  
  1812  func (ps *ParquetIOTestSuite) TestFixedSizeList() {
  1813  	mem := memory.NewCheckedAllocator(memory.DefaultAllocator)
  1814  	defer mem.AssertSize(ps.T(), 0)
  1815  
  1816  	bldr := array.NewFixedSizeListBuilder(mem, 3, arrow.PrimitiveTypes.Int16)
  1817  	defer bldr.Release()
  1818  
  1819  	vb := bldr.ValueBuilder().(*array.Int16Builder)
  1820  
  1821  	bldr.AppendValues([]bool{true, true, true})
  1822  	vb.AppendValues([]int16{1, 2, 3, 4, 5, 6, 7, 8, 9}, nil)
  1823  
  1824  	data := bldr.NewArray()
  1825  	defer data.Release() // NewArray
  1826  
  1827  	field := arrow.Field{Name: "root", Type: data.DataType(), Nullable: true}
  1828  	cnk := arrow.NewChunked(field.Type, []arrow.Array{data})
  1829  	defer data.Release() // NewChunked
  1830  
  1831  	tbl := array.NewTable(arrow.NewSchema([]arrow.Field{field}, nil), []arrow.Column{*arrow.NewColumn(field, cnk)}, -1)
  1832  	defer cnk.Release() // NewColumn
  1833  	defer tbl.Release()
  1834  
  1835  	ps.roundTripTable(mem, tbl, true)
  1836  }
  1837  
  1838  func (ps *ParquetIOTestSuite) TestNull() {
  1839  	mem := memory.NewCheckedAllocator(memory.DefaultAllocator)
  1840  	defer mem.AssertSize(ps.T(), 0)
  1841  
  1842  	bldr := array.NewNullBuilder(mem)
  1843  	defer bldr.Release()
  1844  
  1845  	bldr.AppendNull()
  1846  	bldr.AppendNull()
  1847  	bldr.AppendNull()
  1848  
  1849  	data := bldr.NewArray()
  1850  	defer data.Release()
  1851  
  1852  	field := arrow.Field{Name: "x", Type: data.DataType(), Nullable: true}
  1853  	expected := array.NewTable(
  1854  		arrow.NewSchema([]arrow.Field{field}, nil),
  1855  		[]arrow.Column{*arrow.NewColumn(field, arrow.NewChunked(field.Type, []arrow.Array{data}))},
  1856  		-1,
  1857  	)
        	defer data.Release() // NewChunked
        	defer expected.Release()
  1858  
  1859  	ps.roundTripTable(mem, expected, true)
  1860  }
  1861  
  1862  // ARROW-17169
  1863  func (ps *ParquetIOTestSuite) TestNullableListOfStruct() {
  1864  	mem := memory.NewCheckedAllocator(memory.DefaultAllocator)
  1865  	defer mem.AssertSize(ps.T(), 0)
  1866  
  1867  	bldr := array.NewListBuilder(mem, arrow.StructOf(
  1868  		arrow.Field{Name: "a", Type: arrow.PrimitiveTypes.Int32},
  1869  		arrow.Field{Name: "b", Type: arrow.BinaryTypes.String},
  1870  	))
  1871  	defer bldr.Release()
  1872  
  1873  	stBldr := bldr.ValueBuilder().(*array.StructBuilder)
  1874  	aBldr := stBldr.FieldBuilder(0).(*array.Int32Builder)
  1875  	bBldr := stBldr.FieldBuilder(1).(*array.StringBuilder)
  1876  
  1877  	for i := 0; i < 320; i++ {
  1878  		if i%5 == 0 {
  1879  			bldr.AppendNull()
  1880  			continue
  1881  		}
  1882  		bldr.Append(true)
  1883  		for j := 0; j < 4; j++ {
  1884  			stBldr.Append(true)
  1885  			aBldr.Append(int32(i + j))
  1886  			bBldr.Append(strconv.Itoa(i + j))
  1887  		}
  1888  	}
  1889  
  1890  	arr := bldr.NewArray()
  1891  	defer arr.Release()
  1892  
  1893  	field := arrow.Field{Name: "x", Type: arr.DataType(), Nullable: true}
  1894  	expected := array.NewTable(arrow.NewSchema([]arrow.Field{field}, nil),
  1895  		[]arrow.Column{*arrow.NewColumn(field, arrow.NewChunked(field.Type, []arrow.Array{arr}))}, -1)
  1896  	defer arr.Release() // NewChunked
  1897  	defer expected.Release()
  1898  
  1899  	ps.roundTripTable(mem, expected, false)
  1900  }
  1901  
  1902  func (ps *ParquetIOTestSuite) TestStructWithListOfNestedStructs() {
  1903  	mem := memory.NewCheckedAllocator(memory.DefaultAllocator)
  1904  	defer mem.AssertSize(ps.T(), 0)
  1905  
  1906  	bldr := array.NewStructBuilder(mem, arrow.StructOf(
  1907  		arrow.Field{
  1908  			Nullable: true,
  1909  			Name:     "l",
  1910  			Type: arrow.ListOf(arrow.StructOf(
  1911  				arrow.Field{
  1912  					Nullable: true,
  1913  					Name:     "a",
  1914  					Type: arrow.StructOf(
  1915  						arrow.Field{
  1916  							Nullable: true,
  1917  							Name:     "b",
  1918  							Type:     arrow.BinaryTypes.String,
  1919  						},
  1920  					),
  1921  				},
  1922  			)),
  1923  		},
  1924  	))
  1925  	defer bldr.Release()
  1926  
  1927  	lBldr := bldr.FieldBuilder(0).(*array.ListBuilder)
  1928  	stBldr := lBldr.ValueBuilder().(*array.StructBuilder)
  1929  	aBldr := stBldr.FieldBuilder(0).(*array.StructBuilder)
  1930  	bBldr := aBldr.FieldBuilder(0).(*array.StringBuilder)
  1931  
  1932  	bldr.AppendNull()
  1933  	bldr.Append(true)
  1934  	lBldr.Append(true)
  1935  	for i := 0; i < 8; i++ {
  1936  		stBldr.Append(true)
  1937  		aBldr.Append(true)
  1938  		bBldr.Append(strconv.Itoa(i))
  1939  	}
  1940  
  1941  	arr := bldr.NewArray()
  1942  	defer arr.Release()
  1943  
  1944  	field := arrow.Field{Name: "x", Type: arr.DataType(), Nullable: true}
  1945  	expected := array.NewTable(arrow.NewSchema([]arrow.Field{field}, nil),
  1946  		[]arrow.Column{*arrow.NewColumn(field, arrow.NewChunked(field.Type, []arrow.Array{arr}))}, -1)
  1947  	defer arr.Release() // NewChunked
  1948  	defer expected.Release()
  1949  
  1950  	ps.roundTripTable(mem, expected, false)
  1951  }
  1952  
  1953  func TestParquetArrowIO(t *testing.T) {
  1954  	suite.Run(t, new(ParquetIOTestSuite))
  1955  }
  1956  
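        // TestBufferedRecWrite writes a record in two buffered slices and verifies the
        // result is a single row group containing every row.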
  1957  func TestBufferedRecWrite(t *testing.T) {
  1958  	mem := memory.NewCheckedAllocator(memory.DefaultAllocator)
  1959  	defer mem.AssertSize(t, 0)
  1960  
  1961  	sc := arrow.NewSchema([]arrow.Field{
  1962  		{Name: "f32", Type: arrow.PrimitiveTypes.Float32, Nullable: true},
  1963  		{Name: "i32", Type: arrow.PrimitiveTypes.Int32, Nullable: true},
  1964  		{Name: "struct_i64_f64", Type: arrow.StructOf(
  1965  			arrow.Field{Name: "i64", Type: arrow.PrimitiveTypes.Int64, Nullable: true},
  1966  			arrow.Field{Name: "f64", Type: arrow.PrimitiveTypes.Float64, Nullable: true})},
  1967  	}, nil)
  1968  
  1969  	structData := array.NewData(sc.Field(2).Type, SIZELEN,
  1970  		[]*memory.Buffer{nil, nil},
  1971  		[]arrow.ArrayData{testutils.RandomNullable(arrow.PrimitiveTypes.Int64, SIZELEN, 0).Data(), testutils.RandomNullable(arrow.PrimitiveTypes.Float64, SIZELEN, 0).Data()}, 0, 0)
  1972  	defer structData.Release()
  1973  	cols := []arrow.Array{
  1974  		testutils.RandomNullable(sc.Field(0).Type, SIZELEN, SIZELEN/5),
  1975  		testutils.RandomNullable(sc.Field(1).Type, SIZELEN, SIZELEN/5),
  1976  		array.NewStructData(structData),
  1977  	}
  1978  
  1979  	rec := array.NewRecord(sc, cols, SIZELEN)
  1980  	defer rec.Release()
  1981  
  1982  	var (
  1983  		buf bytes.Buffer
  1984  	)
  1985  
  1986  	wr, err := pqarrow.NewFileWriter(sc, &buf,
  1987  		parquet.NewWriterProperties(parquet.WithCompression(compress.Codecs.Snappy), parquet.WithDictionaryDefault(false), parquet.WithDataPageSize(100*1024)),
  1988  		pqarrow.NewArrowWriterProperties(pqarrow.WithAllocator(mem)))
  1989  	require.NoError(t, err)
  1990  
  1991  	p1 := rec.NewSlice(0, SIZELEN/2)
  1992  	defer p1.Release()
  1993  	require.NoError(t, wr.WriteBuffered(p1))
  1994  
  1995  	p2 := rec.NewSlice(SIZELEN/2, SIZELEN)
  1996  	defer p2.Release()
  1997  	require.NoError(t, wr.WriteBuffered(p2))
  1998  
  1999  	require.NoError(t, wr.Close())
  2000  
  2001  	rdr, err := file.NewParquetReader(bytes.NewReader(buf.Bytes()))
  2002  	assert.NoError(t, err)
  2003  
  2004  	assert.EqualValues(t, 1, rdr.NumRowGroups())
  2005  	assert.EqualValues(t, SIZELEN, rdr.NumRows())
  2006  	assert.NoError(t, rdr.Close())
  2007  
  2008  	tbl, err := pqarrow.ReadTable(context.Background(), bytes.NewReader(buf.Bytes()), nil, pqarrow.ArrowReadProperties{}, nil)
  2009  	assert.NoError(t, err)
  2010  	defer tbl.Release()
  2011  
  2012  	assert.EqualValues(t, SIZELEN, tbl.NumRows())
  2013  }
  2014  
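        // TestArrowMapTypeRoundTrip round-trips a map<string, int32> column that
        // includes a null map entry and a null item value.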
  2015  func (ps *ParquetIOTestSuite) TestArrowMapTypeRoundTrip() {
  2016  	mem := memory.NewCheckedAllocator(memory.DefaultAllocator)
  2017  	defer mem.AssertSize(ps.T(), 0)
  2018  
  2019  	bldr := array.NewMapBuilder(mem, arrow.BinaryTypes.String, arrow.PrimitiveTypes.Int32, false)
  2020  	defer bldr.Release()
  2021  
  2022  	kb := bldr.KeyBuilder().(*array.StringBuilder)
  2023  	ib := bldr.ItemBuilder().(*array.Int32Builder)
  2024  
  2025  	bldr.Append(true)
  2026  	kb.AppendValues([]string{"Fee", "Fi", "Fo", "Fum"}, nil)
  2027  	ib.AppendValues([]int32{1, 2, 3, 4}, nil)
  2028  
  2029  	bldr.Append(true)
  2030  	kb.AppendValues([]string{"Fee", "Fi", "Fo"}, nil)
  2031  	ib.AppendValues([]int32{5, 4, 3}, nil)
  2032  
  2033  	bldr.AppendNull()
  2034  
  2035  	bldr.Append(true)
  2036  	kb.AppendValues([]string{"Fo", "Fi", "Fee"}, nil)
  2037  	ib.AppendValues([]int32{-1, 2, 3}, []bool{false, true, true})
  2038  
  2039  	arr := bldr.NewArray()
  2040  	defer arr.Release()
  2041  
  2042  	fld := arrow.Field{Name: "mapped", Type: arr.DataType(), Nullable: true}
  2043  	cnk := arrow.NewChunked(arr.DataType(), []arrow.Array{arr})
  2044  	defer arr.Release() // NewChunked
  2045  	tbl := array.NewTable(arrow.NewSchema([]arrow.Field{fld}, nil), []arrow.Column{*arrow.NewColumn(fld, cnk)}, -1)
  2046  	defer cnk.Release() // NewColumn
  2047  	defer tbl.Release()
  2048  
  2049  	ps.roundTripTable(mem, tbl, true)
  2050  }
  2051  
  2052  func (ps *ParquetIOTestSuite) TestArrowExtensionTypeRoundTrip() {
  2053  	mem := memory.NewCheckedAllocator(memory.DefaultAllocator)
  2054  	defer mem.AssertSize(ps.T(), 0)
  2055  
  2056  	extBuilder := array.NewExtensionBuilder(mem, types.NewUUIDType())
  2057  	defer extBuilder.Release()
  2058  	builder := types.NewUUIDBuilder(extBuilder)
  2059  	builder.Append(uuid.New())
  2060  	arr := builder.NewArray()
  2061  	defer arr.Release()
  2062  
  2063  	fld := arrow.Field{Name: "uuid", Type: arr.DataType(), Nullable: true}
  2064  	cnk := arrow.NewChunked(arr.DataType(), []arrow.Array{arr})
  2065  	defer arr.Release() // NewChunked
  2066  	tbl := array.NewTable(arrow.NewSchema([]arrow.Field{fld}, nil), []arrow.Column{*arrow.NewColumn(fld, cnk)}, -1)
  2067  	defer cnk.Release() // NewColumn
  2068  	defer tbl.Release()
  2069  
  2070  	ps.roundTripTable(mem, tbl, true)
  2071  }
  2072  
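        // TestArrowUnknownExtensionTypeRoundTrip writes a UUID extension column, unregisters
        // the extension type, and verifies the data reads back as the underlying storage type
        // with the extension name and metadata preserved in the field metadata.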
  2073  func (ps *ParquetIOTestSuite) TestArrowUnknownExtensionTypeRoundTrip() {
  2074  	mem := memory.NewCheckedAllocator(memory.DefaultAllocator)
  2075  	defer mem.AssertSize(ps.T(), 0)
  2076  
  2077  	var written, expected arrow.Table
  2078  
  2079  	{
  2080  		// Prepare `written` table with the extension type registered.
  2081  		extType := types.NewUUIDType()
  2082  		bldr := array.NewExtensionBuilder(mem, extType)
  2083  		defer bldr.Release()
  2084  
  2085  		bldr.Builder.(*array.FixedSizeBinaryBuilder).AppendValues(
  2086  			[][]byte{nil, []byte("abcdefghijklmno0"), []byte("abcdefghijklmno1"), []byte("abcdefghijklmno2")},
  2087  			[]bool{false, true, true, true})
  2088  
  2089  		arr := bldr.NewArray()
  2090  		defer arr.Release()
  2091  
  2092  		if arrow.GetExtensionType("uuid") != nil {
  2093  			ps.NoError(arrow.UnregisterExtensionType("uuid"))
  2094  		}
  2095  
  2096  		fld := arrow.Field{Name: "uuid", Type: arr.DataType(), Nullable: true}
  2097  		cnk := arrow.NewChunked(arr.DataType(), []arrow.Array{arr})
  2098  		defer arr.Release() // NewChunked
  2099  		written = array.NewTable(arrow.NewSchema([]arrow.Field{fld}, nil), []arrow.Column{*arrow.NewColumn(fld, cnk)}, -1)
  2100  		defer cnk.Release() // NewColumn
  2101  		defer written.Release()
  2102  	}
  2103  
  2104  	{
  2105  		// Prepare `expected` table using the underlying storage type, matching what a reader sees when the extension type is not registered.
  2106  		bldr := array.NewFixedSizeBinaryBuilder(mem, &arrow.FixedSizeBinaryType{ByteWidth: 16})
  2107  		defer bldr.Release()
  2108  		bldr.AppendValues(
  2109  			[][]byte{nil, []byte("abcdefghijklmno0"), []byte("abcdefghijklmno1"), []byte("abcdefghijklmno2")},
  2110  			[]bool{false, true, true, true})
  2111  
  2112  		arr := bldr.NewArray()
  2113  		defer arr.Release()
  2114  
  2115  		fld := arrow.Field{Name: "uuid", Type: arr.DataType(), Nullable: true}
  2116  		cnk := arrow.NewChunked(arr.DataType(), []arrow.Array{arr})
  2117  		defer arr.Release() // NewChunked
  2118  		expected = array.NewTable(arrow.NewSchema([]arrow.Field{fld}, nil), []arrow.Column{*arrow.NewColumn(fld, cnk)}, -1)
  2119  		defer cnk.Release() // NewColumn
  2120  		defer expected.Release()
  2121  	}
  2122  
  2123  	// sanity check before going deeper
  2124  	ps.Equal(expected.NumCols(), written.NumCols())
  2125  	ps.Equal(expected.NumRows(), written.NumRows())
  2126  
  2127  	// same flow as roundTripTable(), but comparing the written table against a different expected table
  2128  	var buf bytes.Buffer
  2129  	props := pqarrow.NewArrowWriterProperties(pqarrow.WithStoreSchema(), pqarrow.WithAllocator(mem))
  2130  
  2131  	writeProps := parquet.NewWriterProperties(parquet.WithAllocator(mem))
  2132  	ps.Require().NoError(pqarrow.WriteTable(written, &buf, written.NumRows(), writeProps, props))
  2133  
  2134  	reader := ps.createReader(mem, buf.Bytes())
  2135  	defer reader.ParquetReader().Close()
  2136  
  2137  	tbl := ps.readTable(reader)
  2138  	defer tbl.Release()
  2139  
  2140  	ps.Equal(expected.NumCols(), tbl.NumCols())
  2141  	ps.Equal(expected.NumRows(), tbl.NumRows())
  2142  
  2143  	exChunk := expected.Column(0).Data()
  2144  	tblChunk := tbl.Column(0).Data()
  2145  
  2146  	ps.Equal(len(exChunk.Chunks()), len(tblChunk.Chunks()))
  2147  	exc := exChunk.Chunk(0)
  2148  	tbc := tblChunk.Chunk(0)
  2149  	ps.Truef(array.Equal(exc, tbc), "expected: %T %s\ngot: %T %s", exc, exc, tbc, tbc)
  2150  
  2151  	expectedMd := arrow.MetadataFrom(map[string]string{
  2152  		ipc.ExtensionTypeKeyName:     "uuid",
  2153  		ipc.ExtensionMetadataKeyName: "uuid-serialized",
  2154  		"PARQUET:field_id":           "-1",
  2155  	})
  2156  	ps.Truef(expectedMd.Equal(tbl.Column(0).Field().Metadata), "expected: %v\ngot: %v", expectedMd, tbl.Column(0).Field().Metadata)
  2157  }
  2158  
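        // TestWriteTableMemoryAllocation verifies that writing a record and closing the
        // writer leaves no memory outstanding on the checked allocator.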
  2159  func TestWriteTableMemoryAllocation(t *testing.T) {
  2160  	mem := memory.NewCheckedAllocator(memory.DefaultAllocator)
  2161  	sc := arrow.NewSchema([]arrow.Field{
  2162  		{Name: "f32", Type: arrow.PrimitiveTypes.Float32, Nullable: true},
  2163  		{Name: "i32", Type: arrow.PrimitiveTypes.Int32, Nullable: true},
  2164  		{Name: "struct_i64_f64", Type: arrow.StructOf(
  2165  			arrow.Field{Name: "i64", Type: arrow.PrimitiveTypes.Int64, Nullable: true},
  2166  			arrow.Field{Name: "f64", Type: arrow.PrimitiveTypes.Float64, Nullable: true})},
  2167  		{Name: "arr_i64", Type: arrow.ListOf(arrow.PrimitiveTypes.Int64)},
  2168  		{Name: "uuid", Type: types.NewUUIDType(), Nullable: true},
  2169  	}, nil)
  2170  
  2171  	bld := array.NewRecordBuilder(mem, sc)
  2172  	bld.Field(0).(*array.Float32Builder).Append(1.0)
  2173  	bld.Field(1).(*array.Int32Builder).Append(1)
  2174  	sbld := bld.Field(2).(*array.StructBuilder)
  2175  	sbld.Append(true)
  2176  	sbld.FieldBuilder(0).(*array.Int64Builder).Append(1)
  2177  	sbld.FieldBuilder(1).(*array.Float64Builder).Append(1.0)
  2178  	abld := bld.Field(3).(*array.ListBuilder)
  2179  	abld.Append(true)
  2180  	abld.ValueBuilder().(*array.Int64Builder).Append(2)
  2181  	bld.Field(4).(*types.UUIDBuilder).Append(uuid.MustParse("00000000-0000-0000-0000-000000000001"))
  2182  
  2183  	rec := bld.NewRecord()
  2184  	bld.Release()
  2185  
  2186  	var buf bytes.Buffer
  2187  	wr, err := pqarrow.NewFileWriter(sc, &buf,
  2188  		parquet.NewWriterProperties(parquet.WithCompression(compress.Codecs.Snappy)),
  2189  		pqarrow.NewArrowWriterProperties(pqarrow.WithAllocator(mem)))
  2190  	require.NoError(t, err)
  2191  
  2192  	require.NoError(t, wr.Write(rec))
  2193  	rec.Release()
  2194  	require.NoError(t, wr.Close())
  2195  
  2196  	require.Zero(t, mem.CurrentAlloc())
  2197  }
  2198  
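        // TestEmptyListDeltaBinaryPacked round-trips a single empty list column whose
        // elements use DELTA_BINARY_PACKED encoding with dictionary encoding disabled.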
  2199  func TestEmptyListDeltaBinaryPacked(t *testing.T) {
  2200  	schema := arrow.NewSchema([]arrow.Field{
  2201  		{Name: "ts", Type: arrow.ListOf(arrow.PrimitiveTypes.Uint64),
  2202  			Metadata: arrow.NewMetadata([]string{"PARQUET:field_id"}, []string{"-1"})}}, nil)
  2203  	builder := array.NewRecordBuilder(memory.DefaultAllocator, schema)
  2204  	defer builder.Release()
  2205  
  2206  	listBuilder := builder.Field(0).(*array.ListBuilder)
  2207  	listBuilder.Append(true)
  2208  	arrowRec := builder.NewRecord()
  2209  	defer arrowRec.Release()
  2210  
  2211  	var buf bytes.Buffer
  2212  	wr, err := pqarrow.NewFileWriter(schema, &buf,
  2213  		parquet.NewWriterProperties(
  2214  			parquet.WithDictionaryFor("ts.list.element", false),
  2215  			parquet.WithEncodingFor("ts.list.element", parquet.Encodings.DeltaBinaryPacked)),
  2216  		pqarrow.DefaultWriterProps())
  2217  	require.NoError(t, err)
  2218  
  2219  	require.NoError(t, wr.WriteBuffered(arrowRec))
  2220  	require.NoError(t, wr.Close())
  2221  
  2222  	rdr, err := file.NewParquetReader(bytes.NewReader(buf.Bytes()))
  2223  	require.NoError(t, err)
  2224  	reader, err := pqarrow.NewFileReader(rdr, pqarrow.ArrowReadProperties{}, memory.DefaultAllocator)
  2225  	require.NoError(t, err)
  2226  	defer rdr.Close()
  2227  
  2228  	tbl, err := reader.ReadTable(context.Background())
  2229  	require.NoError(t, err)
  2230  	defer tbl.Release()
  2231  
  2232  	assert.True(t, schema.Equal(tbl.Schema()))
  2233  	assert.EqualValues(t, 1, tbl.NumRows())
  2234  }