github.com/apache/arrow/go/v14@v14.0.1/parquet/pqarrow/encode_dictionary_test.go

// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

//go:build go1.18

package pqarrow_test

import (
	"bytes"
	"context"
	"fmt"
	"math"
	"strings"
	"testing"

	"github.com/apache/arrow/go/v14/arrow"
	"github.com/apache/arrow/go/v14/arrow/array"
	"github.com/apache/arrow/go/v14/arrow/compute"
	"github.com/apache/arrow/go/v14/arrow/memory"
	"github.com/apache/arrow/go/v14/parquet"
	"github.com/apache/arrow/go/v14/parquet/file"
	"github.com/apache/arrow/go/v14/parquet/internal/testutils"
	"github.com/apache/arrow/go/v14/parquet/pqarrow"
	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"
	"github.com/stretchr/testify/suite"
)

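// TestSingleColumnOptionalDictionaryWrite round-trips a dictionary-encoded
// array of each supported value type through a single optional column and
// checks the decoded result against the original dense values.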
func (ps *ParquetIOTestSuite) TestSingleColumnOptionalDictionaryWrite() {
	for _, dt := range fullTypeList {
		// skip tests for bool as we don't do dictionaries for it
		if dt.ID() == arrow.BOOL {
			continue
		}

		ps.Run(dt.Name(), func() {
			mem := memory.NewCheckedAllocator(memory.DefaultAllocator)
			defer mem.AssertSize(ps.T(), 0)

			bldr := array.NewDictionaryBuilder(mem, &arrow.DictionaryType{IndexType: arrow.PrimitiveTypes.Int16, ValueType: dt})
			defer bldr.Release()

			values := testutils.RandomNullable(dt, smallSize, 10)
			defer values.Release()
			ps.Require().NoError(bldr.AppendArray(values))

			arr := bldr.NewDictionaryArray()
			defer arr.Release()

			sc := ps.makeSimpleSchema(arr.DataType(), parquet.Repetitions.Optional)
			data := ps.writeColumn(mem, sc, arr)
			ps.readAndCheckSingleColumnFile(mem, data, values)
		})
	}
}

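// TestPqarrowDictionaries runs the write suite for both data page versions
// and the read suite once per null probability.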
func TestPqarrowDictionaries(t *testing.T) {
	suite.Run(t, &ArrowWriteDictionarySuite{dataPageVersion: parquet.DataPageV1})
	suite.Run(t, &ArrowWriteDictionarySuite{dataPageVersion: parquet.DataPageV2})
	testSuite := &ArrowReadDictSuite{}
	for _, np := range testSuite.NullProbabilities() {
		testSuite.nullProb = np
		t.Run(fmt.Sprintf("nullprob=%.2f", np), func(t *testing.T) {
			suite.Run(t, testSuite)
		})
	}
}

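// ArrowWriteDictionarySuite exercises writing dictionary-encoded arrow data
// with a particular parquet data page version.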
type ArrowWriteDictionarySuite struct {
	suite.Suite

	dataPageVersion parquet.DataPageVersion
}

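// fromJSON builds an array of the given type from a JSON string, failing the
// test on error.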
func (ad *ArrowWriteDictionarySuite) fromJSON(mem memory.Allocator, dt arrow.DataType, data string) arrow.Array {
	arr, _, err := array.FromJSON(mem, dt, strings.NewReader(data))
	ad.Require().NoError(err)
	return arr
}

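// TestStatisticsWithFallback verifies row-group and per-page statistics for
// dictionaries that contain duplicate values, which forces the writer to fall
// back from dictionary encoding (see the WriteDense note in
// TestStatisticsUnifiedDictionary below).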
func (ad *ArrowWriteDictionarySuite) TestStatisticsWithFallback() {
	mem := memory.NewCheckedAllocator(memory.DefaultAllocator)
	defer mem.AssertSize(ad.T(), 0)

	testDictionaries := []arrow.Array{
		ad.fromJSON(mem, arrow.BinaryTypes.String, `["b", "c", "d", "a", "b", "c", "d", "a"]`),
		ad.fromJSON(mem, arrow.BinaryTypes.String, `["b", "c", "d", "a", "b", "c", "d", "a"]`),
		ad.fromJSON(mem, arrow.BinaryTypes.Binary, `["ZA==", "Yw==", "Yg==", "YQ==", "ZA==", "Yw==", "Yg==", "YQ=="]`),
		ad.fromJSON(mem, arrow.BinaryTypes.LargeString, `["a", "b", "c", "a", "b", "c"]`),
	}

	testIndices := []arrow.Array{
		// ["b", null, "a", "b", null, "a"]
		ad.fromJSON(mem, arrow.PrimitiveTypes.Int32, `[0, null, 3, 0, null, 3]`),
		// ["b", "c", null, "b", "c", null]
		ad.fromJSON(mem, arrow.PrimitiveTypes.Int32, `[0, 1, null, 0, 1, null]`),
		// ["ZA==", "Yw==", "YQ==", "ZA==", "Yw==", "YQ=="]
		ad.fromJSON(mem, arrow.PrimitiveTypes.Int32, `[0, 1, 3, 0, 1, 3]`),
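		// [null, null, null, null, null, null]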
		ad.fromJSON(mem, arrow.PrimitiveTypes.Int32, `[null, null, null, null, null, null]`),
	}

	defer func() {
		for _, d := range testDictionaries {
			d.Release()
		}
		for _, i := range testIndices {
			i.Release()
		}
	}()

	// arrays will be written with 3 values per row group, 2 values per data page
	// the row groups are identical for ease of testing
	expectedValidCounts := []int32{2, 2, 3, 0}
	expectedNullCounts := []int32{1, 1, 0, 3}
	expectedNumDataPages := []int{2, 2, 2, 1}
	expectedValidByPage := [][]int32{
		{1, 1},
		{2, 0},
		{2, 1},
		{0}}
	expectedNullByPage := [][]int64{
		{1, 0},
		{0, 1},
		{0, 0},
		{3}}
	expectedDictCounts := []int32{4, 4, 4, 3}
	// pairs of (min, max)
	expectedMinMax := [][2]string{
		{"a", "b"},
		{"b", "c"},
		{"a", "d"},
		{"", ""}}

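	// the all-null fourth case is omitted from the per-page min/max tables,
	// since none of its pages carry min/max statistics
	// (see expectedHasMinMaxByPage)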
	expectedMinByPage := [][][]string{
		{{"b", "a"}, {"b", "a"}},
		{{"b", "b"}, {"b", "b"}},
		{{"c", "a"}, {"c", "a"}}}
	expectedMaxByPage := [][][]string{
		{{"b", "a"}, {"b", "a"}},
		{{"c", "c"}, {"c", "c"}},
		{{"d", "a"}, {"d", "a"}}}
	expectedHasMinMaxByPage := [][][]bool{
		{{true, true}, {true, true}},
		// second page of each rowgroup only contains a null,
		// so there's no stat on that page
		{{true, false}, {true, false}},
		{{true, true}, {true, true}},
		{{false}, {false}}}

	for caseIndex, dict := range testDictionaries {
		ad.Run(dict.DataType().String(), func() {
			dictType := &arrow.DictionaryType{
				IndexType: testIndices[caseIndex].DataType(),
				ValueType: dict.DataType(),
			}
			dictEncoded := array.NewDictionaryArray(dictType, testIndices[caseIndex], dict)
			defer dictEncoded.Release()
			schema := arrow.NewSchema([]arrow.Field{
				{Name: "values", Type: dictEncoded.DataType(), Nullable: true}}, nil)
			col := arrow.NewColumnFromArr(schema.Field(0), dictEncoded)
			defer col.Release()
			tbl := array.NewTable(schema, []arrow.Column{col}, int64(dictEncoded.Len()))
			defer tbl.Release()

			writerProperties := parquet.NewWriterProperties(
				parquet.WithMaxRowGroupLength(3),
				parquet.WithDataPageVersion(ad.dataPageVersion),
				parquet.WithBatchSize(2),
				parquet.WithDictionaryDefault(true),
				parquet.WithDataPageSize(2),
				parquet.WithStats(true),
			)

			var buf bytes.Buffer
			ad.Require().NoError(pqarrow.WriteTable(tbl, &buf, math.MaxInt64, writerProperties,
				pqarrow.DefaultWriterProps()))

			rdr, err := file.NewParquetReader(bytes.NewReader(buf.Bytes()))
			ad.Require().NoError(err)
			defer rdr.Close()

			metadata := rdr.MetaData()
			ad.Len(metadata.RowGroups, 2)

			for i := 0; i < rdr.NumRowGroups(); i++ {
				rg := metadata.RowGroup(i)
				ad.EqualValues(1, rg.NumColumns())
				col, err := rg.ColumnChunk(0)
				ad.Require().NoError(err)
				stats, err := col.Statistics()
				ad.Require().NoError(err)

				ad.EqualValues(expectedValidCounts[caseIndex], stats.NumValues())
				ad.EqualValues(expectedNullCounts[caseIndex], stats.NullCount())

				caseExpectedMinMax := expectedMinMax[caseIndex]
				ad.Equal(caseExpectedMinMax[0], string(stats.EncodeMin()))
				ad.Equal(caseExpectedMinMax[1], string(stats.EncodeMax()))
			}

			// the two row groups are identical, but read each one through
			// its own page reader rather than re-reading row group 0
			for rowGroup := 0; rowGroup < 2; rowGroup++ {
				pr, err := rdr.RowGroup(rowGroup).GetColumnPageReader(0)
				ad.Require().NoError(err)
				ad.True(pr.Next())
				page := pr.Page()
				ad.NotNil(page)
				ad.NoError(pr.Err())
				ad.Require().IsType((*file.DictionaryPage)(nil), page)
				dictPage := page.(*file.DictionaryPage)
				ad.EqualValues(expectedDictCounts[caseIndex], dictPage.NumValues())

				for pageIdx := 0; pageIdx < expectedNumDataPages[caseIndex]; pageIdx++ {
					ad.True(pr.Next())
					page = pr.Page()
					ad.NotNil(page)
					ad.NoError(pr.Err())

					dataPage, ok := page.(file.DataPage)
					ad.Require().True(ok)
					stats := dataPage.Statistics()
					ad.EqualValues(expectedNullByPage[caseIndex][pageIdx], stats.NullCount)

					expectHasMinMax := expectedHasMinMaxByPage[caseIndex][rowGroup][pageIdx]
					ad.Equal(expectHasMinMax, stats.HasMin)
					ad.Equal(expectHasMinMax, stats.HasMax)

					if expectHasMinMax {
						ad.Equal(expectedMinByPage[caseIndex][rowGroup][pageIdx], string(stats.Min))
						ad.Equal(expectedMaxByPage[caseIndex][rowGroup][pageIdx], string(stats.Max))
					}

					ad.EqualValues(expectedValidByPage[caseIndex][pageIdx]+int32(expectedNullByPage[caseIndex][pageIdx]),
						dataPage.NumValues())
				}

				ad.False(pr.Next())
			}
		})
	}
}

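// TestStatisticsUnifiedDictionary checks that column chunk statistics remain
// correct when two chunks share a single dictionary and the data is split
// across two row groups.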
func (ad *ArrowWriteDictionarySuite) TestStatisticsUnifiedDictionary() {
	mem := memory.NewCheckedAllocator(memory.DefaultAllocator)
	defer mem.AssertSize(ad.T(), 0)

	// two chunks with a shared dictionary
	var (
		tbl      arrow.Table
		dictType = &arrow.DictionaryType{IndexType: arrow.PrimitiveTypes.Int32,
			ValueType: arrow.BinaryTypes.String}
		schema = arrow.NewSchema([]arrow.Field{
			{Name: "values", Type: dictType, Nullable: true}}, nil)
	)

	{
		// it's important there are no duplicate values in the dictionary,
		// otherwise we trigger the WriteDense() code path which side-steps
		// dictionary encoding.
		testDictionary := ad.fromJSON(mem, arrow.BinaryTypes.String, `["b", "c", "d", "a"]`)
		defer testDictionary.Release()

		testIndices := []arrow.Array{
			// ["a", null, "a", "a", null, "a"]
			ad.fromJSON(mem, arrow.PrimitiveTypes.Int32, `[3, null, 3, 3, null, 3]`),
			// ["b", "a", null, "b", null, "c"]
			ad.fromJSON(mem, arrow.PrimitiveTypes.Int32, `[0, 3, null, 0, null, 1]`),
		}
		chunks := []arrow.Array{
			array.NewDictionaryArray(dictType, testIndices[0], testDictionary),
			array.NewDictionaryArray(dictType, testIndices[1], testDictionary),
		}
		testIndices[0].Release()
		testIndices[1].Release()

		tbl = array.NewTableFromSlice(schema, [][]arrow.Array{chunks})
		defer tbl.Release()

		chunks[0].Release()
		chunks[1].Release()
	}

	var buf bytes.Buffer
	{
		// write data as two row groups, one with 9 rows and one with 3
		props := parquet.NewWriterProperties(
			parquet.WithMaxRowGroupLength(9),
			parquet.WithDataPageVersion(ad.dataPageVersion),
			parquet.WithBatchSize(3),
			parquet.WithDataPageSize(3),
			parquet.WithDictionaryDefault(true),
			parquet.WithStats(true))

		ad.Require().NoError(pqarrow.WriteTable(tbl, &buf, math.MaxInt64, props, pqarrow.DefaultWriterProps()))
	}

	rdr, err := file.NewParquetReader(bytes.NewReader(buf.Bytes()))
	ad.Require().NoError(err)
	defer rdr.Close()

	metadata := rdr.MetaData()
	ad.Len(metadata.RowGroups, 2)
	ad.EqualValues(9, metadata.RowGroup(0).NumRows())
	ad.EqualValues(3, metadata.RowGroup(1).NumRows())

	col0, err := metadata.RowGroup(0).ColumnChunk(0)
	ad.Require().NoError(err)
	col1, err := metadata.RowGroup(1).ColumnChunk(0)
	ad.Require().NoError(err)

	stats0, err := col0.Statistics()
	ad.Require().NoError(err)
	stats1, err := col1.Statistics()
	ad.Require().NoError(err)

	ad.EqualValues(6, stats0.NumValues())
	ad.EqualValues(2, stats1.NumValues())
	ad.EqualValues(3, stats0.NullCount())
	ad.EqualValues(1, stats1.NullCount())
	ad.Equal([]byte("a"), stats0.EncodeMin())
	ad.Equal([]byte("b"), stats1.EncodeMin())
	ad.Equal([]byte("b"), stats0.EncodeMax())
	ad.Equal([]byte("c"), stats1.EncodeMax())
}

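// numRowGroups is the default number of row groups written by the read suite.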
const numRowGroups = 16

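// ArrowReadDictSuite exercises reading dictionary-encoded columns back out of
// parquet files, either as dictionary arrays or as dense arrays, at varying
// null probabilities.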
type ArrowReadDictSuite struct {
	suite.Suite

	mem *memory.CheckedAllocator

	denseVals     arrow.Array
	expectedDense arrow.Table
	props         pqarrow.ArrowReadProperties
	nullProb      float64

	buf bytes.Buffer

	options struct {
		numRows      int
		numRowGroups int
		numUniques   int
	}
}

func (ar *ArrowReadDictSuite) generateData(nullProb float64) {
	const minLen = 2
	const maxLen = 100
	rag := testutils.NewRandomArrayGenerator(0)

	ar.denseVals = rag.StringWithRepeats(ar.mem, int64(ar.options.numRows),
		int64(ar.options.numUniques), minLen, maxLen, nullProb)

	chunked := arrow.NewChunked(arrow.BinaryTypes.String, []arrow.Array{ar.denseVals})
	defer chunked.Release()
	ar.expectedDense = makeSimpleTable(chunked, true)
}

func (ar *ArrowReadDictSuite) SetupTest() {
	ar.mem = memory.NewCheckedAllocator(memory.DefaultAllocator)
	ar.buf.Reset()

	ar.options = struct {
		numRows      int
		numRowGroups int
		numUniques   int
	}{1024 * numRowGroups, numRowGroups, 128}

	ar.props = pqarrow.ArrowReadProperties{}
	ar.generateData(ar.nullProb)
}

func (ar *ArrowReadDictSuite) TearDownTest() {
	if ar.denseVals != nil {
		ar.denseVals.Release()
	}
	ar.expectedDense.Release()

	ar.mem.AssertSize(ar.T(), 0)
}

func (ar *ArrowReadDictSuite) writeSimple() {
	// write num_row_groups row groups; each row group will have a
	// different dictionary
	ar.Require().NoError(pqarrow.WriteTable(ar.expectedDense, &ar.buf, int64(ar.options.numRows/ar.options.numRowGroups),
		parquet.NewWriterProperties(parquet.WithDictionaryDefault(true), parquet.WithStats(true)),
		pqarrow.DefaultWriterProps()))
}

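// NullProbabilities returns the set of null probabilities the suite is run
// with by TestPqarrowDictionaries.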
func (*ArrowReadDictSuite) NullProbabilities() []float64 {
	return []float64{0.0, 0.5, 1.0}
}

func (ar *ArrowReadDictSuite) checkReadWholeFile(expected arrow.Table) {
	tbl, err := pqarrow.ReadTable(context.Background(),
		bytes.NewReader(ar.buf.Bytes()), nil, ar.props, ar.mem)
	ar.Require().NoError(err)
	defer tbl.Release()

	ar.Truef(array.TableEqual(expected, tbl), "expected: %s\ngot: %s", expected, tbl)
}

func (ar *ArrowReadDictSuite) checkStreamReadWholeFile(expected arrow.Table) {
	reader, err := file.NewParquetReader(bytes.NewReader(ar.buf.Bytes()))
	ar.Require().NoError(err)
	defer reader.Close()

	rdr, err := pqarrow.NewFileReader(reader, ar.props, ar.mem)
	ar.Require().NoError(err)

	rrdr, err := rdr.GetRecordReader(context.Background(), nil, nil)
	ar.Require().NoError(err)
	defer rrdr.Release()

	recs := make([]arrow.Record, 0)
	for rrdr.Next() {
		rec := rrdr.Record()
		rec.Retain()
		defer rec.Release()
		recs = append(recs, rec)
	}

	tbl := array.NewTableFromRecords(rrdr.Schema(), recs)
	defer tbl.Release()

	ar.Truef(array.TableEqual(expected, tbl), "expected: %s\ngot: %s", expected, tbl)
}

func (ar *ArrowReadDictSuite) getReader() *pqarrow.FileReader {
	reader, err := file.NewParquetReader(bytes.NewReader(ar.buf.Bytes()))
	ar.Require().NoError(err)

	rdr, err := pqarrow.NewFileReader(reader, ar.props, ar.mem)
	ar.Require().NoError(err)
	return rdr
}

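// asDict32Encoded dictionary-encodes a dense string array with int32 indices.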
func asDict32Encoded(mem memory.Allocator, arr arrow.Array) arrow.Array {
	bldr := array.NewDictionaryBuilder(mem, &arrow.DictionaryType{IndexType: arrow.PrimitiveTypes.Int32, ValueType: arrow.BinaryTypes.String})
	defer bldr.Release()
	// this helper has no *testing.T, so surface append failures via panic
	if err := bldr.AppendArray(arr); err != nil {
		panic(err)
	}
	return bldr.NewArray()
}

func (ar *ArrowReadDictSuite) TestReadWholeFileDict() {
	ar.props.SetReadDict(0, true)
	ar.writeSimple()

	numRowGroups := ar.options.numRowGroups
	chunkSize := ar.options.numRows / ar.options.numRowGroups

	chunks := make([]arrow.Array, numRowGroups)
	for i := 0; i < numRowGroups; i++ {
		start := int64(chunkSize * i)
		sl := array.NewSlice(ar.denseVals, start, start+int64(chunkSize))
		defer sl.Release()
		chunks[i] = asDict32Encoded(ar.mem, sl)
		defer chunks[i].Release()
	}

	chunked := arrow.NewChunked(chunks[0].DataType(), chunks)
	defer chunked.Release()

	exTable := makeSimpleTable(chunked, true)
	defer exTable.Release()

	ar.checkReadWholeFile(exTable)
}

func (ar *ArrowReadDictSuite) TestZeroChunksListOfDictionary() {
	ar.props.SetReadDict(0, true)
	ar.denseVals.Release()
	ar.denseVals = nil

	values := arrow.NewChunked(arrow.ListOf(arrow.BinaryTypes.String), []arrow.Array{})
	defer values.Release()

	ar.options.numRowGroups = 1
	ar.options.numRows = 0
	ar.options.numUniques = 0
	ar.expectedDense.Release()
	ar.expectedDense = makeSimpleTable(values, false)

	ar.writeSimple()

	rdr := ar.getReader()
	defer rdr.ParquetReader().Close()

	colReader, err := rdr.GetColumn(context.Background(), 0)
	ar.Require().NoError(err)
	defer colReader.Release()

	chnked, err := colReader.NextBatch(1 << 15)
	ar.Require().NoError(err)
	defer chnked.Release()
	ar.Zero(chnked.Len())
	ar.Len(chnked.Chunks(), 1)
}

func (ar *ArrowReadDictSuite) TestIncrementalReads() {
	ar.options.numRows = 100
	ar.options.numUniques = 10

	ar.denseVals.Release()
	ar.expectedDense.Release()
	ar.generateData(ar.nullProb)

	ar.props.SetReadDict(0, true)
	// just write a single row group
	ar.Require().NoError(pqarrow.WriteTable(ar.expectedDense, &ar.buf, int64(ar.options.numRows),
		parquet.NewWriterProperties(parquet.WithDictionaryDefault(true), parquet.WithStats(true)),
		pqarrow.DefaultWriterProps()))

	// read in one shot
	expected, err := pqarrow.ReadTable(context.Background(), bytes.NewReader(ar.buf.Bytes()), nil, ar.props, ar.mem)
	ar.Require().NoError(err)
	defer expected.Release()

	rdr := ar.getReader()
	defer rdr.ParquetReader().Close()
	col, err := rdr.GetColumn(context.Background(), 0)
	ar.Require().NoError(err)
	defer col.Release()

	const numReads = 4
	batchSize := ar.options.numRows / numReads

	ctx := compute.WithAllocator(context.Background(), ar.mem)

	for i := 0; i < numReads; i++ {
		chunk, err := col.NextBatch(int64(batchSize))
		ar.Require().NoError(err)
		// the chunked result returned by NextBatch is owned by the caller
		// and must be released; the deferred releases run when the test
		// function returns
		defer chunk.Release()
		resultDense, err := compute.CastArray(ctx, chunk.Chunk(0),
			compute.SafeCastOptions(arrow.BinaryTypes.String))
		ar.Require().NoError(err)
		defer resultDense.Release()

		sl := array.NewSlice(ar.denseVals, int64(i*batchSize), int64((i*batchSize)+batchSize))
		defer sl.Release()

		ar.Truef(array.Equal(sl, resultDense), "expected: %s\ngot: %s", sl, resultDense)
	}
}

func (ar *ArrowReadDictSuite) TestStreamReadWholeFileDict() {
	ar.options.numRows = 100
	ar.options.numUniques = 10

	ar.denseVals.Release()
	ar.expectedDense.Release()
	ar.generateData(ar.nullProb)

	ar.writeSimple()
	ar.props.BatchSize = int64(ar.options.numRows * 2)
	ar.checkStreamReadWholeFile(ar.expectedDense)
}

func (ar *ArrowReadDictSuite) TestReadWholeFileDense() {
	ar.props.SetReadDict(0, false)
	ar.writeSimple()
	ar.checkReadWholeFile(ar.expectedDense)
}

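// doRoundTrip writes tbl to an in-memory parquet file and reads it back with
// the given read properties.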
func doRoundTrip(t *testing.T, tbl arrow.Table, rowGroupSize int64, wrProps *parquet.WriterProperties, arrWrProps *pqarrow.ArrowWriterProperties, arrReadProps pqarrow.ArrowReadProperties) arrow.Table {
	var buf bytes.Buffer
	require.NoError(t, pqarrow.WriteTable(tbl, &buf, rowGroupSize, wrProps, *arrWrProps))

	out, err := pqarrow.ReadTable(context.Background(), bytes.NewReader(buf.Bytes()), nil, arrReadProps, wrProps.Allocator())
	require.NoError(t, err)
	return out
}

func TestArrowWriteChangingDictionaries(t *testing.T) {
	const (
		numUnique            = 50
		repeat               = 5000
		minLen, maxLen int32 = 2, 20
	)

	rag := testutils.NewRandomArrayGenerator(0)
	mem := memory.NewCheckedAllocator(memory.DefaultAllocator)
	defer mem.AssertSize(t, 0)

	values := rag.StringWithRepeats(mem, repeat*numUnique, numUnique, minLen, maxLen, 0.1)
	defer values.Release()

	valuesChunk := arrow.NewChunked(values.DataType(), []arrow.Array{values})
	defer valuesChunk.Release()

	expected := makeSimpleTable(valuesChunk, true)
	defer expected.Release()

	const numChunks = 10
	chunks := make([]arrow.Array, numChunks)
	chunkSize := valuesChunk.Len() / numChunks
	for i := 0; i < numChunks; i++ {
		start := int64(chunkSize * i)
		sl := array.NewSlice(values, start, start+int64(chunkSize))
		defer sl.Release()
		chunks[i] = asDict32Encoded(mem, sl)
		defer chunks[i].Release()
	}

	dictChunked := arrow.NewChunked(chunks[0].DataType(), chunks)
	defer dictChunked.Release()
	dictTable := makeSimpleTable(dictChunked, true)
	defer dictTable.Release()

	props := pqarrow.NewArrowWriterProperties(pqarrow.WithAllocator(mem))
	actual := doRoundTrip(t, dictTable, int64(values.Len())/2, parquet.NewWriterProperties(parquet.WithAllocator(mem)),
		&props, pqarrow.ArrowReadProperties{})
	defer actual.Release()

	assert.Truef(t, array.TableEqual(expected, actual), "expected: %s\ngot: %s", expected, actual)
}

func TestArrowAutoReadAsDictionary(t *testing.T) {
	const (
		numUnique            = 50
		repeat               = 100
		minLen, maxLen int32 = 2, 20
	)

	rag := testutils.NewRandomArrayGenerator(0)
	mem := memory.NewCheckedAllocator(memory.DefaultAllocator)
	defer mem.AssertSize(t, 0)

	values := rag.StringWithRepeats(mem, repeat*numUnique, numUnique, minLen, maxLen, 0.1)
	defer values.Release()

	dictValues := asDict32Encoded(mem, values)
	defer dictValues.Release()

	dictChunk := arrow.NewChunked(dictValues.DataType(), []arrow.Array{dictValues})
	defer dictChunk.Release()

	valuesChunk := arrow.NewChunked(values.DataType(), []arrow.Array{values})
	defer valuesChunk.Release()

	expected := makeSimpleTable(dictChunk, true)
	defer expected.Release()
	expectedDense := makeSimpleTable(valuesChunk, true)
	defer expectedDense.Release()

	wrProps := parquet.NewWriterProperties(parquet.WithAllocator(mem), parquet.WithDictionaryDefault(true))
	propsStoreSchema := pqarrow.NewArrowWriterProperties(pqarrow.WithStoreSchema())
	actual := doRoundTrip(t, expected, int64(valuesChunk.Len()), wrProps, &propsStoreSchema, pqarrow.ArrowReadProperties{})
	defer actual.Release()

	assert.Truef(t, array.TableEqual(expected, actual), "expected: %s\ngot: %s", expected, actual)

	propsNoStoreSchema := pqarrow.NewArrowWriterProperties()
	actualDense := doRoundTrip(t, expected, int64(valuesChunk.Len()), wrProps, &propsNoStoreSchema, pqarrow.ArrowReadProperties{})
	defer actualDense.Release()

	assert.Truef(t, array.TableEqual(expectedDense, actualDense), "expected: %s\ngot: %s", expectedDense, actualDense)
}

func TestArrowWriteNestedSubfieldDictionary(t *testing.T) {
	mem := memory.NewCheckedAllocator(memory.DefaultAllocator)
	defer mem.AssertSize(t, 0)

	offsets, _, err := array.FromJSON(mem, arrow.PrimitiveTypes.Int32, strings.NewReader(`[0, 0, 2, 3]`))
	require.NoError(t, err)
	defer offsets.Release()
	indices, _, err := array.FromJSON(mem, arrow.PrimitiveTypes.Int32, strings.NewReader(`[0, 0, 0]`))
	require.NoError(t, err)
	defer indices.Release()
	dict, _, err := array.FromJSON(mem, arrow.BinaryTypes.String, strings.NewReader(`["foo"]`))
	require.NoError(t, err)
	defer dict.Release()

	dictType := &arrow.DictionaryType{IndexType: arrow.PrimitiveTypes.Int32, ValueType: arrow.BinaryTypes.String}
	dictValues := array.NewDictionaryArray(dictType, indices, dict)
	defer dictValues.Release()

	data := array.NewData(arrow.ListOf(dictType), 3, []*memory.Buffer{nil, offsets.Data().Buffers()[1]},
		[]arrow.ArrayData{dictValues.Data()}, 0, 0)
	defer data.Release()
	values := array.NewListData(data)
	defer values.Release()

	chk := arrow.NewChunked(values.DataType(), []arrow.Array{values})
	defer chk.Release()

	tbl := makeSimpleTable(chk, true)
	defer tbl.Release()
	propsStoreSchema := pqarrow.NewArrowWriterProperties(pqarrow.WithStoreSchema())
	actual := doRoundTrip(t, tbl, int64(values.Len()), parquet.NewWriterProperties(), &propsStoreSchema, pqarrow.ArrowReadProperties{})
	defer actual.Release()

	assert.Truef(t, array.TableEqual(tbl, actual), "expected: %s\ngot: %s", tbl, actual)
}

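// TestDictOfEmptyStringsRoundtrip ensures that a string column written with
// dictionary encoding enabled round-trips correctly when every value is an
// empty string.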
func TestDictOfEmptyStringsRoundtrip(t *testing.T) {
	mem := memory.NewCheckedAllocator(memory.DefaultAllocator)
	defer mem.AssertSize(t, 0)

	schema := arrow.NewSchema([]arrow.Field{
		{Name: "reserved1", Type: arrow.BinaryTypes.String, Nullable: true},
	}, nil)

	bldr := array.NewStringBuilder(mem)
	defer bldr.Release()

	for i := 0; i < 6; i++ {
		bldr.AppendEmptyValue()
	}

	arr := bldr.NewArray()
	defer arr.Release()
	col1 := arrow.NewColumnFromArr(schema.Field(0), arr)
	defer col1.Release()
	tbl := array.NewTable(schema, []arrow.Column{col1}, 6)
	defer tbl.Release()

	var buf bytes.Buffer
	require.NoError(t, pqarrow.WriteTable(tbl, &buf, 6,
		parquet.NewWriterProperties(parquet.WithDictionaryDefault(true)),
		pqarrow.NewArrowWriterProperties()))

	result, err := pqarrow.ReadTable(context.Background(), bytes.NewReader(buf.Bytes()), nil, pqarrow.ArrowReadProperties{}, mem)
	require.NoError(t, err)
	defer result.Release()

	assert.EqualValues(t, 6, result.NumRows())
	assert.EqualValues(t, 1, result.NumCols())
	col := result.Column(0).Data().Chunk(0)
	assert.Equal(t, arrow.STRING, col.DataType().ID())

	for i := 0; i < 6; i++ {
		assert.Zero(t, col.(*array.String).Value(i))
	}
}