github.com/apache/arrow/go/v10@v10.0.1/parquet/pqarrow/file_reader_test.go (about)

     1  // Licensed to the Apache Software Foundation (ASF) under one
     2  // or more contributor license agreements.  See the NOTICE file
     3  // distributed with this work for additional information
     4  // regarding copyright ownership.  The ASF licenses this file
     5  // to you under the Apache License, Version 2.0 (the
     6  // "License"); you may not use this file except in compliance
     7  // with the License.  You may obtain a copy of the License at
     8  //
     9  // http://www.apache.org/licenses/LICENSE-2.0
    10  //
    11  // Unless required by applicable law or agreed to in writing, software
    12  // distributed under the License is distributed on an "AS IS" BASIS,
    13  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  // See the License for the specific language governing permissions and
    15  // limitations under the License.
    16  
    17  package pqarrow_test
    18  
    19  import (
    20  	"bytes"
    21  	"context"
    22  	"io"
    23  	"os"
    24  	"path/filepath"
    25  	"testing"
    26  
    27  	"github.com/apache/arrow/go/v10/arrow"
    28  	"github.com/apache/arrow/go/v10/arrow/array"
    29  	"github.com/apache/arrow/go/v10/arrow/decimal128"
    30  	"github.com/apache/arrow/go/v10/arrow/memory"
    31  	"github.com/apache/arrow/go/v10/parquet"
    32  	"github.com/apache/arrow/go/v10/parquet/file"
    33  	"github.com/apache/arrow/go/v10/parquet/pqarrow"
    34  	"github.com/stretchr/testify/assert"
    35  	"github.com/stretchr/testify/require"
    36  )
    37  
    38  func getDataDir() string {
    39  	datadir := os.Getenv("PARQUET_TEST_DATA")
    40  	if datadir == "" {
    41  		panic("please point PARQUET_TEST_DATA env var to the test data directory")
    42  	}
    43  	return datadir
    44  }
    45  
    46  func TestArrowReaderAdHocReadDecimals(t *testing.T) {
    47  	tests := []struct {
    48  		file string
    49  		typ  *arrow.Decimal128Type
    50  	}{
    51  		{"int32_decimal", &arrow.Decimal128Type{Precision: 4, Scale: 2}},
    52  		{"int64_decimal", &arrow.Decimal128Type{Precision: 10, Scale: 2}},
    53  		{"fixed_length_decimal", &arrow.Decimal128Type{Precision: 25, Scale: 2}},
    54  		{"fixed_length_decimal_legacy", &arrow.Decimal128Type{Precision: 13, Scale: 2}},
    55  		{"byte_array_decimal", &arrow.Decimal128Type{Precision: 4, Scale: 2}},
    56  	}
    57  
    58  	dataDir := getDataDir()
    59  	for _, tt := range tests {
    60  		t.Run(tt.file, func(t *testing.T) {
    61  			mem := memory.NewCheckedAllocator(memory.DefaultAllocator)
    62  			defer mem.AssertSize(t, 0)
    63  
    64  			filename := filepath.Join(dataDir, tt.file+".parquet")
    65  			require.FileExists(t, filename)
    66  
    67  			rdr, err := file.OpenParquetFile(filename, false, file.WithReadProps(parquet.NewReaderProperties(mem)))
    68  			require.NoError(t, err)
    69  			defer rdr.Close()
    70  			arrowRdr, err := pqarrow.NewFileReader(rdr, pqarrow.ArrowReadProperties{}, mem)
    71  			require.NoError(t, err)
    72  
    73  			tbl, err := arrowRdr.ReadTable(context.Background())
    74  			require.NoError(t, err)
    75  			defer tbl.Release()
    76  
    77  			assert.EqualValues(t, 1, tbl.NumCols())
    78  			assert.Truef(t, arrow.TypeEqual(tbl.Schema().Field(0).Type, tt.typ), "expected: %s\ngot: %s", tbl.Schema().Field(0).Type, tt.typ)
    79  
    80  			const expectedLen = 24
    81  			valCol := tbl.Column(0)
    82  
    83  			assert.EqualValues(t, expectedLen, valCol.Len())
    84  			assert.Len(t, valCol.Data().Chunks(), 1)
    85  
    86  			chunk := valCol.Data().Chunk(0)
    87  			bldr := array.NewDecimal128Builder(mem, tt.typ)
    88  			defer bldr.Release()
    89  			for i := 0; i < expectedLen; i++ {
    90  				bldr.Append(decimal128.FromI64(int64((i + 1) * 100)))
    91  			}
    92  
    93  			expectedArr := bldr.NewDecimal128Array()
    94  			defer expectedArr.Release()
    95  
    96  			assert.Truef(t, array.Equal(expectedArr, chunk), "expected: %s\ngot: %s", expectedArr, chunk)
    97  		})
    98  	}
    99  }
   100  
   101  func TestRecordReaderParallel(t *testing.T) {
   102  	mem := memory.NewCheckedAllocator(memory.DefaultAllocator)
   103  	defer mem.AssertSize(t, 0)
   104  
   105  	tbl := makeDateTimeTypesTable(mem, true, true)
   106  	defer tbl.Release()
   107  
   108  	var buf bytes.Buffer
   109  	require.NoError(t, pqarrow.WriteTable(tbl, &buf, tbl.NumRows(), nil, pqarrow.NewArrowWriterProperties(pqarrow.WithAllocator(mem))))
   110  
   111  	pf, err := file.NewParquetReader(bytes.NewReader(buf.Bytes()), file.WithReadProps(parquet.NewReaderProperties(mem)))
   112  	require.NoError(t, err)
   113  
   114  	reader, err := pqarrow.NewFileReader(pf, pqarrow.ArrowReadProperties{BatchSize: 3, Parallel: true}, mem)
   115  	require.NoError(t, err)
   116  
   117  	sc, err := reader.Schema()
   118  	assert.NoError(t, err)
   119  	assert.Truef(t, tbl.Schema().Equal(sc), "expected: %s\ngot: %s", tbl.Schema(), sc)
   120  
   121  	rr, err := reader.GetRecordReader(context.Background(), nil, nil)
   122  	assert.NoError(t, err)
   123  	assert.NotNil(t, rr)
   124  	defer rr.Release()
   125  
   126  	records := make([]arrow.Record, 0)
   127  	for rr.Next() {
   128  		rec := rr.Record()
   129  		defer rec.Release()
   130  
   131  		assert.Truef(t, sc.Equal(rec.Schema()), "expected: %s\ngot: %s", sc, rec.Schema())
   132  		rec.Retain()
   133  		records = append(records, rec)
   134  	}
   135  
   136  	assert.False(t, rr.Next())
   137  
   138  	tr := array.NewTableReader(tbl, 3)
   139  	defer tr.Release()
   140  
   141  	assert.True(t, tr.Next())
   142  	assert.Truef(t, array.RecordEqual(tr.Record(), records[0]), "expected: %s\ngot: %s", tr.Record(), records[0])
   143  	assert.True(t, tr.Next())
   144  	assert.Truef(t, array.RecordEqual(tr.Record(), records[1]), "expected: %s\ngot: %s", tr.Record(), records[1])
   145  }
   146  
   147  func TestRecordReaderSerial(t *testing.T) {
   148  	mem := memory.NewCheckedAllocator(memory.DefaultAllocator)
   149  	defer mem.AssertSize(t, 0)
   150  
   151  	tbl := makeDateTimeTypesTable(mem, true, true)
   152  	defer tbl.Release()
   153  
   154  	var buf bytes.Buffer
   155  	require.NoError(t, pqarrow.WriteTable(tbl, &buf, tbl.NumRows(), nil, pqarrow.NewArrowWriterProperties(pqarrow.WithAllocator(mem))))
   156  
   157  	pf, err := file.NewParquetReader(bytes.NewReader(buf.Bytes()), file.WithReadProps(parquet.NewReaderProperties(mem)))
   158  	require.NoError(t, err)
   159  
   160  	reader, err := pqarrow.NewFileReader(pf, pqarrow.ArrowReadProperties{BatchSize: 2}, mem)
   161  	require.NoError(t, err)
   162  
   163  	sc, err := reader.Schema()
   164  	assert.NoError(t, err)
   165  	assert.Truef(t, tbl.Schema().Equal(sc), "expected: %s\ngot: %s", tbl.Schema(), sc)
   166  
   167  	rr, err := reader.GetRecordReader(context.Background(), nil, nil)
   168  	assert.NoError(t, err)
   169  	assert.NotNil(t, rr)
   170  	defer rr.Release()
   171  
   172  	tr := array.NewTableReader(tbl, 2)
   173  	defer tr.Release()
   174  
   175  	rec, err := rr.Read()
   176  	assert.NoError(t, err)
   177  	tr.Next()
   178  	assert.Truef(t, array.RecordEqual(tr.Record(), rec), "expected: %s\ngot: %s", tr.Record(), rec)
   179  
   180  	rec, err = rr.Read()
   181  	assert.NoError(t, err)
   182  	tr.Next()
   183  	assert.Truef(t, array.RecordEqual(tr.Record(), rec), "expected: %s\ngot: %s", tr.Record(), rec)
   184  
   185  	rec, err = rr.Read()
   186  	assert.NoError(t, err)
   187  	tr.Next()
   188  	assert.Truef(t, array.RecordEqual(tr.Record(), rec), "expected: %s\ngot: %s", tr.Record(), rec)
   189  
   190  	rec, err = rr.Read()
   191  	assert.Same(t, io.EOF, err)
   192  	assert.Nil(t, rec)
   193  }
   194  
   195  func TestFileReaderWriterMetadata(t *testing.T) {
   196  	mem := memory.NewCheckedAllocator(memory.DefaultAllocator)
   197  	defer mem.AssertSize(t, 0)
   198  
   199  	tbl := makeDateTimeTypesTable(mem, true, true)
   200  	defer tbl.Release()
   201  
   202  	meta := arrow.NewMetadata([]string{"foo", "bar"}, []string{"bar", "baz"})
   203  	sc := arrow.NewSchema(tbl.Schema().Fields(), &meta)
   204  
   205  	var buf bytes.Buffer
   206  	writer, err := pqarrow.NewFileWriter(sc, &buf, nil, pqarrow.NewArrowWriterProperties(pqarrow.WithAllocator(mem)))
   207  	require.NoError(t, err)
   208  	require.NoError(t, writer.WriteTable(tbl, tbl.NumRows()))
   209  	require.NoError(t, writer.Close())
   210  
   211  	pf, err := file.NewParquetReader(bytes.NewReader(buf.Bytes()), file.WithReadProps(parquet.NewReaderProperties(mem)))
   212  	require.NoError(t, err)
   213  	defer pf.Close()
   214  
   215  	kvMeta := pf.MetaData().KeyValueMetadata()
   216  	assert.Equal(t, []string{"foo", "bar"}, kvMeta.Keys())
   217  	assert.Equal(t, []string{"bar", "baz"}, kvMeta.Values())
   218  }