github.com/apache/arrow/go/v14@v14.0.2/parquet/pqarrow/file_reader_test.go

// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package pqarrow_test

import (
	"bytes"
	"context"
	"fmt"
	"io"
	"os"
	"path/filepath"
	"strings"
	"testing"

	"github.com/apache/arrow/go/v14/arrow"
	"github.com/apache/arrow/go/v14/arrow/array"
	"github.com/apache/arrow/go/v14/arrow/decimal128"
	"github.com/apache/arrow/go/v14/arrow/memory"
	"github.com/apache/arrow/go/v14/parquet"
	"github.com/apache/arrow/go/v14/parquet/file"
	"github.com/apache/arrow/go/v14/parquet/pqarrow"
	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"
)

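// getDataDir returns the directory holding the shared Parquet test files.
// PARQUET_TEST_DATA typically points at the data/ directory of an
// apache/parquet-testing checkout (in the Arrow repo, the
// cpp/submodules/parquet-testing/data submodule).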
func getDataDir() string {
	datadir := os.Getenv("PARQUET_TEST_DATA")
	if datadir == "" {
		panic("please point the PARQUET_TEST_DATA env var at the test data directory")
	}
	return datadir
}

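// TestArrowReaderAdHocReadDecimals reads the reference decimal files from
// parquet-testing and verifies that each physical representation (int32,
// int64, fixed-length byte array, and byte array) decodes to the same
// Decimal128 values: 24 rows whose unscaled value for row i is (i+1)*100,
// i.e. 1.00 through 24.00 at scale 2.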
func TestArrowReaderAdHocReadDecimals(t *testing.T) {
	tests := []struct {
		file string
		typ  *arrow.Decimal128Type
	}{
		{"int32_decimal", &arrow.Decimal128Type{Precision: 4, Scale: 2}},
		{"int64_decimal", &arrow.Decimal128Type{Precision: 10, Scale: 2}},
		{"fixed_length_decimal", &arrow.Decimal128Type{Precision: 25, Scale: 2}},
		{"fixed_length_decimal_legacy", &arrow.Decimal128Type{Precision: 13, Scale: 2}},
		{"byte_array_decimal", &arrow.Decimal128Type{Precision: 4, Scale: 2}},
	}

	dataDir := getDataDir()
	for _, tt := range tests {
		t.Run(tt.file, func(t *testing.T) {
			mem := memory.NewCheckedAllocator(memory.DefaultAllocator)
			defer mem.AssertSize(t, 0)

			filename := filepath.Join(dataDir, tt.file+".parquet")
			require.FileExists(t, filename)

			rdr, err := file.OpenParquetFile(filename, false, file.WithReadProps(parquet.NewReaderProperties(mem)))
			require.NoError(t, err)
			defer rdr.Close()
			arrowRdr, err := pqarrow.NewFileReader(rdr, pqarrow.ArrowReadProperties{}, mem)
			require.NoError(t, err)

			tbl, err := arrowRdr.ReadTable(context.Background())
			require.NoError(t, err)
			defer tbl.Release()

			assert.EqualValues(t, 1, tbl.NumCols())
			assert.Truef(t, arrow.TypeEqual(tbl.Schema().Field(0).Type, tt.typ), "expected: %s\ngot: %s", tt.typ, tbl.Schema().Field(0).Type)

			const expectedLen = 24
			valCol := tbl.Column(0)

			assert.EqualValues(t, expectedLen, valCol.Len())
			assert.Len(t, valCol.Data().Chunks(), 1)

			chunk := valCol.Data().Chunk(0)
			bldr := array.NewDecimal128Builder(mem, tt.typ)
			defer bldr.Release()
			for i := 0; i < expectedLen; i++ {
				bldr.Append(decimal128.FromI64(int64((i + 1) * 100)))
			}

			expectedArr := bldr.NewDecimal128Array()
			defer expectedArr.Release()

			assert.Truef(t, array.Equal(expectedArr, chunk), "expected: %s\ngot: %s", expectedArr, chunk)
		})
	}
}

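// TestRecordReaderParallel round-trips a date/time table through an
// in-memory Parquet file and reads it back with parallel column decoding
// enabled (BatchSize 3), comparing the streamed records against a
// TableReader over the original table.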
func TestRecordReaderParallel(t *testing.T) {
	mem := memory.NewCheckedAllocator(memory.DefaultAllocator)
	defer mem.AssertSize(t, 0)

	tbl := makeDateTimeTypesTable(mem, true, true)
	defer tbl.Release()

	var buf bytes.Buffer
	require.NoError(t, pqarrow.WriteTable(tbl, &buf, tbl.NumRows(), nil, pqarrow.NewArrowWriterProperties(pqarrow.WithAllocator(mem))))

	pf, err := file.NewParquetReader(bytes.NewReader(buf.Bytes()), file.WithReadProps(parquet.NewReaderProperties(mem)))
	require.NoError(t, err)

	reader, err := pqarrow.NewFileReader(pf, pqarrow.ArrowReadProperties{BatchSize: 3, Parallel: true}, mem)
	require.NoError(t, err)

	sc, err := reader.Schema()
	assert.NoError(t, err)
	assert.Truef(t, tbl.Schema().Equal(sc), "expected: %s\ngot: %s", tbl.Schema(), sc)

	rr, err := reader.GetRecordReader(context.Background(), nil, nil)
	assert.NoError(t, err)
	assert.NotNil(t, rr)
	defer rr.Release()

	records := make([]arrow.Record, 0)
	for rr.Next() {
		rec := rr.Record()
		// Record returns a reference owned by the reader, so retain it to
		// keep it alive past the next call to Next; the matching Release
		// runs when the test returns.
		rec.Retain()
		defer rec.Release()

		assert.Truef(t, sc.Equal(rec.Schema()), "expected: %s\ngot: %s", sc, rec.Schema())
		records = append(records, rec)
	}

	assert.False(t, rr.Next())

	tr := array.NewTableReader(tbl, 3)
	defer tr.Release()

	assert.True(t, tr.Next())
	assert.Truef(t, array.RecordEqual(tr.Record(), records[0]), "expected: %s\ngot: %s", tr.Record(), records[0])
	assert.True(t, tr.Next())
	assert.Truef(t, array.RecordEqual(tr.Record(), records[1]), "expected: %s\ngot: %s", tr.Record(), records[1])
}

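// TestRecordReaderSerial exercises the non-parallel record reader path:
// with BatchSize 2 it expects three records from Read followed by io.EOF.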
func TestRecordReaderSerial(t *testing.T) {
	mem := memory.NewCheckedAllocator(memory.DefaultAllocator)
	defer mem.AssertSize(t, 0)

	tbl := makeDateTimeTypesTable(mem, true, true)
	defer tbl.Release()

	var buf bytes.Buffer
	require.NoError(t, pqarrow.WriteTable(tbl, &buf, tbl.NumRows(), nil, pqarrow.NewArrowWriterProperties(pqarrow.WithAllocator(mem))))

	pf, err := file.NewParquetReader(bytes.NewReader(buf.Bytes()), file.WithReadProps(parquet.NewReaderProperties(mem)))
	require.NoError(t, err)

	reader, err := pqarrow.NewFileReader(pf, pqarrow.ArrowReadProperties{BatchSize: 2}, mem)
	require.NoError(t, err)

	sc, err := reader.Schema()
	assert.NoError(t, err)
	assert.Truef(t, tbl.Schema().Equal(sc), "expected: %s\ngot: %s", tbl.Schema(), sc)

	rr, err := reader.GetRecordReader(context.Background(), nil, nil)
	assert.NoError(t, err)
	assert.NotNil(t, rr)
	defer rr.Release()

	tr := array.NewTableReader(tbl, 2)
	defer tr.Release()

	rec, err := rr.Read()
	assert.NoError(t, err)
	tr.Next()
	assert.Truef(t, array.RecordEqual(tr.Record(), rec), "expected: %s\ngot: %s", tr.Record(), rec)

	rec, err = rr.Read()
	assert.NoError(t, err)
	tr.Next()
	assert.Truef(t, array.RecordEqual(tr.Record(), rec), "expected: %s\ngot: %s", tr.Record(), rec)

	rec, err = rr.Read()
	assert.NoError(t, err)
	tr.Next()
	assert.Truef(t, array.RecordEqual(tr.Record(), rec), "expected: %s\ngot: %s", tr.Record(), rec)

	rec, err = rr.Read()
	assert.Same(t, io.EOF, err)
	assert.Nil(t, rec)
}

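// TestFileReaderWriterMetadata verifies that Arrow schema-level metadata
// written through pqarrow.NewFileWriter surfaces as Parquet key/value
// metadata in the file footer.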
func TestFileReaderWriterMetadata(t *testing.T) {
	mem := memory.NewCheckedAllocator(memory.DefaultAllocator)
	defer mem.AssertSize(t, 0)

	tbl := makeDateTimeTypesTable(mem, true, true)
	defer tbl.Release()

	meta := arrow.NewMetadata([]string{"foo", "bar"}, []string{"bar", "baz"})
	sc := arrow.NewSchema(tbl.Schema().Fields(), &meta)

	var buf bytes.Buffer
	writer, err := pqarrow.NewFileWriter(sc, &buf, nil, pqarrow.NewArrowWriterProperties(pqarrow.WithAllocator(mem)))
	require.NoError(t, err)
	require.NoError(t, writer.WriteTable(tbl, tbl.NumRows()))
	require.NoError(t, writer.Close())

	pf, err := file.NewParquetReader(bytes.NewReader(buf.Bytes()), file.WithReadProps(parquet.NewReaderProperties(mem)))
	require.NoError(t, err)
	defer pf.Close()

	kvMeta := pf.MetaData().KeyValueMetadata()
	assert.Equal(t, []string{"foo", "bar"}, kvMeta.Keys())
	assert.Equal(t, []string{"bar", "baz"}, kvMeta.Values())
}

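// TestFileReaderColumnChunkBoundsErrors checks that RowGroupReader.Column
// validates indices against the logical Arrow fields (two here) rather
// than the underlying physical Parquet columns (four here), returning an
// error for any out-of-range index.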
func TestFileReaderColumnChunkBoundsErrors(t *testing.T) {
	schema := arrow.NewSchema([]arrow.Field{
		{Name: "zero", Type: arrow.PrimitiveTypes.Float64},
		{Name: "g", Type: arrow.StructOf(
			arrow.Field{Name: "one", Type: arrow.PrimitiveTypes.Float64},
			arrow.Field{Name: "two", Type: arrow.PrimitiveTypes.Float64},
			arrow.Field{Name: "three", Type: arrow.PrimitiveTypes.Float64},
		)},
	}, nil)

	// generate Parquet data with four physical columns
	// that are represented by two logical fields
	data := `[
		{
			"zero": 1,
			"g": {
				"one": 1,
				"two": 1,
				"three": 1
			}
		},
		{
			"zero": 2,
			"g": {
				"one": 2,
				"two": 2,
				"three": 2
			}
		}
	]`

	record, _, err := array.RecordFromJSON(memory.DefaultAllocator, schema, strings.NewReader(data))
	require.NoError(t, err)

	output := &bytes.Buffer{}
	writer, err := pqarrow.NewFileWriter(schema, output, parquet.NewWriterProperties(), pqarrow.DefaultWriterProps())
	require.NoError(t, err)

	require.NoError(t, writer.Write(record))
	require.NoError(t, writer.Close())

	fileReader, err := file.NewParquetReader(bytes.NewReader(output.Bytes()))
	require.NoError(t, err)

	arrowReader, err := pqarrow.NewFileReader(fileReader, pqarrow.ArrowReadProperties{BatchSize: 1024}, memory.DefaultAllocator)
	require.NoError(t, err)

	// assert that errors are returned for indices outside the bounds of the logical fields (instead of the physical columns)
	ctx := pqarrow.NewArrowWriteContext(context.Background(), nil)
	assert.Greater(t, fileReader.NumRowGroups(), 0)
	for rowGroupIndex := 0; rowGroupIndex < fileReader.NumRowGroups(); rowGroupIndex++ {
		rowGroupReader := arrowReader.RowGroup(rowGroupIndex)
		for fieldNum := 0; fieldNum < schema.NumFields(); fieldNum++ {
			_, err := rowGroupReader.Column(fieldNum).Read(ctx)
			assert.NoError(t, err, "reading field num: %d", fieldNum)
		}

		_, subZeroErr := rowGroupReader.Column(-1).Read(ctx)
		assert.Error(t, subZeroErr)

		_, tooHighErr := rowGroupReader.Column(schema.NumFields()).Read(ctx)
		assert.ErrorContains(t, tooHighErr, fmt.Sprintf("there are only %d columns", schema.NumFields()))
	}
}