github.com/apache/arrow/go/v14@v14.0.1/parquet/pqarrow/file_reader_test.go (about) 1 // Licensed to the Apache Software Foundation (ASF) under one 2 // or more contributor license agreements. See the NOTICE file 3 // distributed with this work for additional information 4 // regarding copyright ownership. The ASF licenses this file 5 // to you under the Apache License, Version 2.0 (the 6 // "License"); you may not use this file except in compliance 7 // with the License. You may obtain a copy of the License at 8 // 9 // http://www.apache.org/licenses/LICENSE-2.0 10 // 11 // Unless required by applicable law or agreed to in writing, software 12 // distributed under the License is distributed on an "AS IS" BASIS, 13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 // See the License for the specific language governing permissions and 15 // limitations under the License. 16 17 package pqarrow_test 18 19 import ( 20 "bytes" 21 "context" 22 "fmt" 23 "io" 24 "os" 25 "path/filepath" 26 "strings" 27 "testing" 28 29 "github.com/apache/arrow/go/v14/arrow" 30 "github.com/apache/arrow/go/v14/arrow/array" 31 "github.com/apache/arrow/go/v14/arrow/decimal128" 32 "github.com/apache/arrow/go/v14/arrow/memory" 33 "github.com/apache/arrow/go/v14/parquet" 34 "github.com/apache/arrow/go/v14/parquet/file" 35 "github.com/apache/arrow/go/v14/parquet/pqarrow" 36 "github.com/stretchr/testify/assert" 37 "github.com/stretchr/testify/require" 38 ) 39 40 func getDataDir() string { 41 datadir := os.Getenv("PARQUET_TEST_DATA") 42 if datadir == "" { 43 panic("please point PARQUET_TEST_DATA env var to the test data directory") 44 } 45 return datadir 46 } 47 48 func TestArrowReaderAdHocReadDecimals(t *testing.T) { 49 tests := []struct { 50 file string 51 typ *arrow.Decimal128Type 52 }{ 53 {"int32_decimal", &arrow.Decimal128Type{Precision: 4, Scale: 2}}, 54 {"int64_decimal", &arrow.Decimal128Type{Precision: 10, Scale: 2}}, 55 {"fixed_length_decimal", &arrow.Decimal128Type{Precision: 25, Scale: 2}}, 56 {"fixed_length_decimal_legacy", &arrow.Decimal128Type{Precision: 13, Scale: 2}}, 57 {"byte_array_decimal", &arrow.Decimal128Type{Precision: 4, Scale: 2}}, 58 } 59 60 dataDir := getDataDir() 61 for _, tt := range tests { 62 t.Run(tt.file, func(t *testing.T) { 63 mem := memory.NewCheckedAllocator(memory.DefaultAllocator) 64 defer mem.AssertSize(t, 0) 65 66 filename := filepath.Join(dataDir, tt.file+".parquet") 67 require.FileExists(t, filename) 68 69 rdr, err := file.OpenParquetFile(filename, false, file.WithReadProps(parquet.NewReaderProperties(mem))) 70 require.NoError(t, err) 71 defer rdr.Close() 72 arrowRdr, err := pqarrow.NewFileReader(rdr, pqarrow.ArrowReadProperties{}, mem) 73 require.NoError(t, err) 74 75 tbl, err := arrowRdr.ReadTable(context.Background()) 76 require.NoError(t, err) 77 defer tbl.Release() 78 79 assert.EqualValues(t, 1, tbl.NumCols()) 80 assert.Truef(t, arrow.TypeEqual(tbl.Schema().Field(0).Type, tt.typ), "expected: %s\ngot: %s", tbl.Schema().Field(0).Type, tt.typ) 81 82 const expectedLen = 24 83 valCol := tbl.Column(0) 84 85 assert.EqualValues(t, expectedLen, valCol.Len()) 86 assert.Len(t, valCol.Data().Chunks(), 1) 87 88 chunk := valCol.Data().Chunk(0) 89 bldr := array.NewDecimal128Builder(mem, tt.typ) 90 defer bldr.Release() 91 for i := 0; i < expectedLen; i++ { 92 bldr.Append(decimal128.FromI64(int64((i + 1) * 100))) 93 } 94 95 expectedArr := bldr.NewDecimal128Array() 96 defer expectedArr.Release() 97 98 assert.Truef(t, array.Equal(expectedArr, chunk), "expected: %s\ngot: %s", expectedArr, chunk) 99 }) 100 } 101 } 102 103 func TestRecordReaderParallel(t *testing.T) { 104 mem := memory.NewCheckedAllocator(memory.DefaultAllocator) 105 defer mem.AssertSize(t, 0) 106 107 tbl := makeDateTimeTypesTable(mem, true, true) 108 defer tbl.Release() 109 110 var buf bytes.Buffer 111 require.NoError(t, pqarrow.WriteTable(tbl, &buf, tbl.NumRows(), nil, pqarrow.NewArrowWriterProperties(pqarrow.WithAllocator(mem)))) 112 113 pf, err := file.NewParquetReader(bytes.NewReader(buf.Bytes()), file.WithReadProps(parquet.NewReaderProperties(mem))) 114 require.NoError(t, err) 115 116 reader, err := pqarrow.NewFileReader(pf, pqarrow.ArrowReadProperties{BatchSize: 3, Parallel: true}, mem) 117 require.NoError(t, err) 118 119 sc, err := reader.Schema() 120 assert.NoError(t, err) 121 assert.Truef(t, tbl.Schema().Equal(sc), "expected: %s\ngot: %s", tbl.Schema(), sc) 122 123 rr, err := reader.GetRecordReader(context.Background(), nil, nil) 124 assert.NoError(t, err) 125 assert.NotNil(t, rr) 126 defer rr.Release() 127 128 records := make([]arrow.Record, 0) 129 for rr.Next() { 130 rec := rr.Record() 131 defer rec.Release() 132 133 assert.Truef(t, sc.Equal(rec.Schema()), "expected: %s\ngot: %s", sc, rec.Schema()) 134 rec.Retain() 135 records = append(records, rec) 136 } 137 138 assert.False(t, rr.Next()) 139 140 tr := array.NewTableReader(tbl, 3) 141 defer tr.Release() 142 143 assert.True(t, tr.Next()) 144 assert.Truef(t, array.RecordEqual(tr.Record(), records[0]), "expected: %s\ngot: %s", tr.Record(), records[0]) 145 assert.True(t, tr.Next()) 146 assert.Truef(t, array.RecordEqual(tr.Record(), records[1]), "expected: %s\ngot: %s", tr.Record(), records[1]) 147 } 148 149 func TestRecordReaderSerial(t *testing.T) { 150 mem := memory.NewCheckedAllocator(memory.DefaultAllocator) 151 defer mem.AssertSize(t, 0) 152 153 tbl := makeDateTimeTypesTable(mem, true, true) 154 defer tbl.Release() 155 156 var buf bytes.Buffer 157 require.NoError(t, pqarrow.WriteTable(tbl, &buf, tbl.NumRows(), nil, pqarrow.NewArrowWriterProperties(pqarrow.WithAllocator(mem)))) 158 159 pf, err := file.NewParquetReader(bytes.NewReader(buf.Bytes()), file.WithReadProps(parquet.NewReaderProperties(mem))) 160 require.NoError(t, err) 161 162 reader, err := pqarrow.NewFileReader(pf, pqarrow.ArrowReadProperties{BatchSize: 2}, mem) 163 require.NoError(t, err) 164 165 sc, err := reader.Schema() 166 assert.NoError(t, err) 167 assert.Truef(t, tbl.Schema().Equal(sc), "expected: %s\ngot: %s", tbl.Schema(), sc) 168 169 rr, err := reader.GetRecordReader(context.Background(), nil, nil) 170 assert.NoError(t, err) 171 assert.NotNil(t, rr) 172 defer rr.Release() 173 174 tr := array.NewTableReader(tbl, 2) 175 defer tr.Release() 176 177 rec, err := rr.Read() 178 assert.NoError(t, err) 179 tr.Next() 180 assert.Truef(t, array.RecordEqual(tr.Record(), rec), "expected: %s\ngot: %s", tr.Record(), rec) 181 182 rec, err = rr.Read() 183 assert.NoError(t, err) 184 tr.Next() 185 assert.Truef(t, array.RecordEqual(tr.Record(), rec), "expected: %s\ngot: %s", tr.Record(), rec) 186 187 rec, err = rr.Read() 188 assert.NoError(t, err) 189 tr.Next() 190 assert.Truef(t, array.RecordEqual(tr.Record(), rec), "expected: %s\ngot: %s", tr.Record(), rec) 191 192 rec, err = rr.Read() 193 assert.Same(t, io.EOF, err) 194 assert.Nil(t, rec) 195 } 196 197 func TestFileReaderWriterMetadata(t *testing.T) { 198 mem := memory.NewCheckedAllocator(memory.DefaultAllocator) 199 defer mem.AssertSize(t, 0) 200 201 tbl := makeDateTimeTypesTable(mem, true, true) 202 defer tbl.Release() 203 204 meta := arrow.NewMetadata([]string{"foo", "bar"}, []string{"bar", "baz"}) 205 sc := arrow.NewSchema(tbl.Schema().Fields(), &meta) 206 207 var buf bytes.Buffer 208 writer, err := pqarrow.NewFileWriter(sc, &buf, nil, pqarrow.NewArrowWriterProperties(pqarrow.WithAllocator(mem))) 209 require.NoError(t, err) 210 require.NoError(t, writer.WriteTable(tbl, tbl.NumRows())) 211 require.NoError(t, writer.Close()) 212 213 pf, err := file.NewParquetReader(bytes.NewReader(buf.Bytes()), file.WithReadProps(parquet.NewReaderProperties(mem))) 214 require.NoError(t, err) 215 defer pf.Close() 216 217 kvMeta := pf.MetaData().KeyValueMetadata() 218 assert.Equal(t, []string{"foo", "bar"}, kvMeta.Keys()) 219 assert.Equal(t, []string{"bar", "baz"}, kvMeta.Values()) 220 } 221 222 func TestFileReaderColumnChunkBoundsErrors(t *testing.T) { 223 schema := arrow.NewSchema([]arrow.Field{ 224 {Name: "zero", Type: arrow.PrimitiveTypes.Float64}, 225 {Name: "g", Type: arrow.StructOf( 226 arrow.Field{Name: "one", Type: arrow.PrimitiveTypes.Float64}, 227 arrow.Field{Name: "two", Type: arrow.PrimitiveTypes.Float64}, 228 arrow.Field{Name: "three", Type: arrow.PrimitiveTypes.Float64}, 229 )}, 230 }, nil) 231 232 // generate Parquet data with four columns 233 // that are represented by two logical fields 234 data := `[ 235 { 236 "zero": 1, 237 "g": { 238 "one": 1, 239 "two": 1, 240 "three": 1 241 } 242 }, 243 { 244 "zero": 2, 245 "g": { 246 "one": 2, 247 "two": 2, 248 "three": 2 249 } 250 } 251 ]` 252 253 record, _, err := array.RecordFromJSON(memory.DefaultAllocator, schema, strings.NewReader(data)) 254 require.NoError(t, err) 255 256 output := &bytes.Buffer{} 257 writer, err := pqarrow.NewFileWriter(schema, output, parquet.NewWriterProperties(), pqarrow.DefaultWriterProps()) 258 require.NoError(t, err) 259 260 require.NoError(t, writer.Write(record)) 261 require.NoError(t, writer.Close()) 262 263 fileReader, err := file.NewParquetReader(bytes.NewReader(output.Bytes())) 264 require.NoError(t, err) 265 266 arrowReader, err := pqarrow.NewFileReader(fileReader, pqarrow.ArrowReadProperties{BatchSize: 1024}, memory.DefaultAllocator) 267 require.NoError(t, err) 268 269 // assert that errors are returned for indexes outside the bounds of the logical fields (instead of the physical columns) 270 ctx := pqarrow.NewArrowWriteContext(context.Background(), nil) 271 assert.Greater(t, fileReader.NumRowGroups(), 0) 272 for rowGroupIndex := 0; rowGroupIndex < fileReader.NumRowGroups(); rowGroupIndex += 1 { 273 rowGroupReader := arrowReader.RowGroup(rowGroupIndex) 274 for fieldNum := 0; fieldNum < schema.NumFields(); fieldNum += 1 { 275 _, err := rowGroupReader.Column(fieldNum).Read(ctx) 276 assert.NoError(t, err, "reading field num: %d", fieldNum) 277 } 278 279 _, subZeroErr := rowGroupReader.Column(-1).Read(ctx) 280 assert.Error(t, subZeroErr) 281 282 _, tooHighErr := rowGroupReader.Column(schema.NumFields()).Read(ctx) 283 assert.ErrorContains(t, tooHighErr, fmt.Sprintf("there are only %d columns", schema.NumFields())) 284 } 285 }