github.com/apache/arrow/go/v10@v10.0.1/parquet/pqarrow/file_reader_test.go (about) 1 // Licensed to the Apache Software Foundation (ASF) under one 2 // or more contributor license agreements. See the NOTICE file 3 // distributed with this work for additional information 4 // regarding copyright ownership. The ASF licenses this file 5 // to you under the Apache License, Version 2.0 (the 6 // "License"); you may not use this file except in compliance 7 // with the License. You may obtain a copy of the License at 8 // 9 // http://www.apache.org/licenses/LICENSE-2.0 10 // 11 // Unless required by applicable law or agreed to in writing, software 12 // distributed under the License is distributed on an "AS IS" BASIS, 13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 // See the License for the specific language governing permissions and 15 // limitations under the License. 16 17 package pqarrow_test 18 19 import ( 20 "bytes" 21 "context" 22 "io" 23 "os" 24 "path/filepath" 25 "testing" 26 27 "github.com/apache/arrow/go/v10/arrow" 28 "github.com/apache/arrow/go/v10/arrow/array" 29 "github.com/apache/arrow/go/v10/arrow/decimal128" 30 "github.com/apache/arrow/go/v10/arrow/memory" 31 "github.com/apache/arrow/go/v10/parquet" 32 "github.com/apache/arrow/go/v10/parquet/file" 33 "github.com/apache/arrow/go/v10/parquet/pqarrow" 34 "github.com/stretchr/testify/assert" 35 "github.com/stretchr/testify/require" 36 ) 37 38 func getDataDir() string { 39 datadir := os.Getenv("PARQUET_TEST_DATA") 40 if datadir == "" { 41 panic("please point PARQUET_TEST_DATA env var to the test data directory") 42 } 43 return datadir 44 } 45 46 func TestArrowReaderAdHocReadDecimals(t *testing.T) { 47 tests := []struct { 48 file string 49 typ *arrow.Decimal128Type 50 }{ 51 {"int32_decimal", &arrow.Decimal128Type{Precision: 4, Scale: 2}}, 52 {"int64_decimal", &arrow.Decimal128Type{Precision: 10, Scale: 2}}, 53 {"fixed_length_decimal", &arrow.Decimal128Type{Precision: 25, Scale: 2}}, 54 {"fixed_length_decimal_legacy", &arrow.Decimal128Type{Precision: 13, Scale: 2}}, 55 {"byte_array_decimal", &arrow.Decimal128Type{Precision: 4, Scale: 2}}, 56 } 57 58 dataDir := getDataDir() 59 for _, tt := range tests { 60 t.Run(tt.file, func(t *testing.T) { 61 mem := memory.NewCheckedAllocator(memory.DefaultAllocator) 62 defer mem.AssertSize(t, 0) 63 64 filename := filepath.Join(dataDir, tt.file+".parquet") 65 require.FileExists(t, filename) 66 67 rdr, err := file.OpenParquetFile(filename, false, file.WithReadProps(parquet.NewReaderProperties(mem))) 68 require.NoError(t, err) 69 defer rdr.Close() 70 arrowRdr, err := pqarrow.NewFileReader(rdr, pqarrow.ArrowReadProperties{}, mem) 71 require.NoError(t, err) 72 73 tbl, err := arrowRdr.ReadTable(context.Background()) 74 require.NoError(t, err) 75 defer tbl.Release() 76 77 assert.EqualValues(t, 1, tbl.NumCols()) 78 assert.Truef(t, arrow.TypeEqual(tbl.Schema().Field(0).Type, tt.typ), "expected: %s\ngot: %s", tbl.Schema().Field(0).Type, tt.typ) 79 80 const expectedLen = 24 81 valCol := tbl.Column(0) 82 83 assert.EqualValues(t, expectedLen, valCol.Len()) 84 assert.Len(t, valCol.Data().Chunks(), 1) 85 86 chunk := valCol.Data().Chunk(0) 87 bldr := array.NewDecimal128Builder(mem, tt.typ) 88 defer bldr.Release() 89 for i := 0; i < expectedLen; i++ { 90 bldr.Append(decimal128.FromI64(int64((i + 1) * 100))) 91 } 92 93 expectedArr := bldr.NewDecimal128Array() 94 defer expectedArr.Release() 95 96 assert.Truef(t, array.Equal(expectedArr, chunk), "expected: %s\ngot: %s", expectedArr, chunk) 97 }) 98 } 99 } 100 101 func TestRecordReaderParallel(t *testing.T) { 102 mem := memory.NewCheckedAllocator(memory.DefaultAllocator) 103 defer mem.AssertSize(t, 0) 104 105 tbl := makeDateTimeTypesTable(mem, true, true) 106 defer tbl.Release() 107 108 var buf bytes.Buffer 109 require.NoError(t, pqarrow.WriteTable(tbl, &buf, tbl.NumRows(), nil, pqarrow.NewArrowWriterProperties(pqarrow.WithAllocator(mem)))) 110 111 pf, err := file.NewParquetReader(bytes.NewReader(buf.Bytes()), file.WithReadProps(parquet.NewReaderProperties(mem))) 112 require.NoError(t, err) 113 114 reader, err := pqarrow.NewFileReader(pf, pqarrow.ArrowReadProperties{BatchSize: 3, Parallel: true}, mem) 115 require.NoError(t, err) 116 117 sc, err := reader.Schema() 118 assert.NoError(t, err) 119 assert.Truef(t, tbl.Schema().Equal(sc), "expected: %s\ngot: %s", tbl.Schema(), sc) 120 121 rr, err := reader.GetRecordReader(context.Background(), nil, nil) 122 assert.NoError(t, err) 123 assert.NotNil(t, rr) 124 defer rr.Release() 125 126 records := make([]arrow.Record, 0) 127 for rr.Next() { 128 rec := rr.Record() 129 defer rec.Release() 130 131 assert.Truef(t, sc.Equal(rec.Schema()), "expected: %s\ngot: %s", sc, rec.Schema()) 132 rec.Retain() 133 records = append(records, rec) 134 } 135 136 assert.False(t, rr.Next()) 137 138 tr := array.NewTableReader(tbl, 3) 139 defer tr.Release() 140 141 assert.True(t, tr.Next()) 142 assert.Truef(t, array.RecordEqual(tr.Record(), records[0]), "expected: %s\ngot: %s", tr.Record(), records[0]) 143 assert.True(t, tr.Next()) 144 assert.Truef(t, array.RecordEqual(tr.Record(), records[1]), "expected: %s\ngot: %s", tr.Record(), records[1]) 145 } 146 147 func TestRecordReaderSerial(t *testing.T) { 148 mem := memory.NewCheckedAllocator(memory.DefaultAllocator) 149 defer mem.AssertSize(t, 0) 150 151 tbl := makeDateTimeTypesTable(mem, true, true) 152 defer tbl.Release() 153 154 var buf bytes.Buffer 155 require.NoError(t, pqarrow.WriteTable(tbl, &buf, tbl.NumRows(), nil, pqarrow.NewArrowWriterProperties(pqarrow.WithAllocator(mem)))) 156 157 pf, err := file.NewParquetReader(bytes.NewReader(buf.Bytes()), file.WithReadProps(parquet.NewReaderProperties(mem))) 158 require.NoError(t, err) 159 160 reader, err := pqarrow.NewFileReader(pf, pqarrow.ArrowReadProperties{BatchSize: 2}, mem) 161 require.NoError(t, err) 162 163 sc, err := reader.Schema() 164 assert.NoError(t, err) 165 assert.Truef(t, tbl.Schema().Equal(sc), "expected: %s\ngot: %s", tbl.Schema(), sc) 166 167 rr, err := reader.GetRecordReader(context.Background(), nil, nil) 168 assert.NoError(t, err) 169 assert.NotNil(t, rr) 170 defer rr.Release() 171 172 tr := array.NewTableReader(tbl, 2) 173 defer tr.Release() 174 175 rec, err := rr.Read() 176 assert.NoError(t, err) 177 tr.Next() 178 assert.Truef(t, array.RecordEqual(tr.Record(), rec), "expected: %s\ngot: %s", tr.Record(), rec) 179 180 rec, err = rr.Read() 181 assert.NoError(t, err) 182 tr.Next() 183 assert.Truef(t, array.RecordEqual(tr.Record(), rec), "expected: %s\ngot: %s", tr.Record(), rec) 184 185 rec, err = rr.Read() 186 assert.NoError(t, err) 187 tr.Next() 188 assert.Truef(t, array.RecordEqual(tr.Record(), rec), "expected: %s\ngot: %s", tr.Record(), rec) 189 190 rec, err = rr.Read() 191 assert.Same(t, io.EOF, err) 192 assert.Nil(t, rec) 193 } 194 195 func TestFileReaderWriterMetadata(t *testing.T) { 196 mem := memory.NewCheckedAllocator(memory.DefaultAllocator) 197 defer mem.AssertSize(t, 0) 198 199 tbl := makeDateTimeTypesTable(mem, true, true) 200 defer tbl.Release() 201 202 meta := arrow.NewMetadata([]string{"foo", "bar"}, []string{"bar", "baz"}) 203 sc := arrow.NewSchema(tbl.Schema().Fields(), &meta) 204 205 var buf bytes.Buffer 206 writer, err := pqarrow.NewFileWriter(sc, &buf, nil, pqarrow.NewArrowWriterProperties(pqarrow.WithAllocator(mem))) 207 require.NoError(t, err) 208 require.NoError(t, writer.WriteTable(tbl, tbl.NumRows())) 209 require.NoError(t, writer.Close()) 210 211 pf, err := file.NewParquetReader(bytes.NewReader(buf.Bytes()), file.WithReadProps(parquet.NewReaderProperties(mem))) 212 require.NoError(t, err) 213 defer pf.Close() 214 215 kvMeta := pf.MetaData().KeyValueMetadata() 216 assert.Equal(t, []string{"foo", "bar"}, kvMeta.Keys()) 217 assert.Equal(t, []string{"bar", "baz"}, kvMeta.Values()) 218 }