github.com/apache/arrow/go/v16@v16.1.0/parquet/pqarrow/encode_arrow_test.go (about) 1 // Licensed to the Apache Software Foundation (ASF) under one 2 // or more contributor license agreements. See the NOTICE file 3 // distributed with this work for additional information 4 // regarding copyright ownership. The ASF licenses this file 5 // to you under the Apache License, Version 2.0 (the 6 // "License"); you may not use this file except in compliance 7 // with the License. You may obtain a copy of the License at 8 // 9 // http://www.apache.org/licenses/LICENSE-2.0 10 // 11 // Unless required by applicable law or agreed to in writing, software 12 // distributed under the License is distributed on an "AS IS" BASIS, 13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 // See the License for the specific language governing permissions and 15 // limitations under the License. 16 17 package pqarrow_test 18 19 import ( 20 "bytes" 21 "context" 22 "fmt" 23 "math" 24 "strconv" 25 "strings" 26 "testing" 27 28 "github.com/apache/arrow/go/v16/arrow" 29 "github.com/apache/arrow/go/v16/arrow/array" 30 "github.com/apache/arrow/go/v16/arrow/bitutil" 31 "github.com/apache/arrow/go/v16/arrow/decimal128" 32 "github.com/apache/arrow/go/v16/arrow/decimal256" 33 "github.com/apache/arrow/go/v16/arrow/ipc" 34 "github.com/apache/arrow/go/v16/arrow/memory" 35 "github.com/apache/arrow/go/v16/internal/types" 36 "github.com/apache/arrow/go/v16/internal/utils" 37 "github.com/apache/arrow/go/v16/parquet" 38 "github.com/apache/arrow/go/v16/parquet/compress" 39 "github.com/apache/arrow/go/v16/parquet/file" 40 "github.com/apache/arrow/go/v16/parquet/internal/encoding" 41 "github.com/apache/arrow/go/v16/parquet/internal/testutils" 42 "github.com/apache/arrow/go/v16/parquet/pqarrow" 43 "github.com/apache/arrow/go/v16/parquet/schema" 44 "github.com/google/uuid" 45 "github.com/stretchr/testify/assert" 46 "github.com/stretchr/testify/require" 47 
"github.com/stretchr/testify/suite" 48 ) 49 50 func makeSimpleTable(values *arrow.Chunked, nullable bool) arrow.Table { 51 sc := arrow.NewSchema([]arrow.Field{{Name: "col", Type: values.DataType(), Nullable: nullable, 52 Metadata: arrow.NewMetadata([]string{"PARQUET:field_id"}, []string{"-1"})}}, nil) 53 column := arrow.NewColumn(sc.Field(0), values) 54 defer column.Release() 55 return array.NewTable(sc, []arrow.Column{*column}, -1) 56 } 57 58 func makeDateTimeTypesTable(mem memory.Allocator, expected bool, addFieldMeta bool) arrow.Table { 59 isValid := []bool{true, true, true, false, true, true} 60 61 // roundtrip without modification 62 f0 := arrow.Field{Name: "f0", Type: arrow.FixedWidthTypes.Date32, Nullable: true} 63 f1 := arrow.Field{Name: "f1", Type: arrow.FixedWidthTypes.Timestamp_ms, Nullable: true} 64 f2 := arrow.Field{Name: "f2", Type: arrow.FixedWidthTypes.Timestamp_us, Nullable: true} 65 f3 := arrow.Field{Name: "f3", Type: arrow.FixedWidthTypes.Timestamp_ns, Nullable: true} 66 f3X := arrow.Field{Name: "f3", Type: arrow.FixedWidthTypes.Timestamp_us, Nullable: true} 67 f4 := arrow.Field{Name: "f4", Type: arrow.FixedWidthTypes.Time32ms, Nullable: true} 68 f5 := arrow.Field{Name: "f5", Type: arrow.FixedWidthTypes.Time64us, Nullable: true} 69 f6 := arrow.Field{Name: "f6", Type: arrow.FixedWidthTypes.Time64ns, Nullable: true} 70 71 fieldList := []arrow.Field{f0, f1, f2} 72 if expected { 73 fieldList = append(fieldList, f3X) 74 } else { 75 fieldList = append(fieldList, f3) 76 } 77 fieldList = append(fieldList, f4, f5, f6) 78 79 if addFieldMeta { 80 for idx := range fieldList { 81 fieldList[idx].Metadata = arrow.NewMetadata([]string{"PARQUET:field_id"}, []string{strconv.Itoa(idx + 1)}) 82 } 83 } 84 arrsc := arrow.NewSchema(fieldList, nil) 85 86 d32Values := []arrow.Date32{1489269000, 1489270000, 1489271000, 1489272000, 1489272000, 1489273000} 87 ts64nsValues := []arrow.Timestamp{1489269000000, 1489270000000, 1489271000000, 1489272000000, 1489272000000, 
1489273000000} 88 ts64usValues := []arrow.Timestamp{1489269000, 1489270000, 1489271000, 1489272000, 1489272000, 1489273000} 89 ts64msValues := []arrow.Timestamp{1489269, 1489270, 1489271, 1489272, 1489272, 1489273} 90 t32Values := []arrow.Time32{1489269000, 1489270000, 1489271000, 1489272000, 1489272000, 1489273000} 91 t64nsValues := []arrow.Time64{1489269000000, 1489270000000, 1489271000000, 1489272000000, 1489272000000, 1489273000000} 92 t64usValues := []arrow.Time64{1489269000, 1489270000, 1489271000, 1489272000, 1489272000, 1489273000} 93 94 builders := make([]array.Builder, 0, len(fieldList)) 95 for _, f := range fieldList { 96 bldr := array.NewBuilder(mem, f.Type) 97 defer bldr.Release() 98 builders = append(builders, bldr) 99 } 100 101 builders[0].(*array.Date32Builder).AppendValues(d32Values, isValid) 102 builders[1].(*array.TimestampBuilder).AppendValues(ts64msValues, isValid) 103 builders[2].(*array.TimestampBuilder).AppendValues(ts64usValues, isValid) 104 if expected { 105 builders[3].(*array.TimestampBuilder).AppendValues(ts64usValues, isValid) 106 } else { 107 builders[3].(*array.TimestampBuilder).AppendValues(ts64nsValues, isValid) 108 } 109 builders[4].(*array.Time32Builder).AppendValues(t32Values, isValid) 110 builders[5].(*array.Time64Builder).AppendValues(t64usValues, isValid) 111 builders[6].(*array.Time64Builder).AppendValues(t64nsValues, isValid) 112 113 cols := make([]arrow.Column, 0, len(fieldList)) 114 for idx, field := range fieldList { 115 arr := builders[idx].NewArray() 116 defer arr.Release() 117 118 chunked := arrow.NewChunked(field.Type, []arrow.Array{arr}) 119 defer chunked.Release() 120 col := arrow.NewColumn(field, chunked) 121 defer col.Release() 122 cols = append(cols, *col) 123 } 124 125 return array.NewTable(arrsc, cols, int64(len(isValid))) 126 } 127 128 func makeDateTypeTable(mem memory.Allocator, expected bool, partialDays bool) arrow.Table { 129 const ( 130 millisPerHour int64 = 1000 * 60 * 60 131 millisPerDay int64 = 
millisPerHour * 24 132 ) 133 isValid := []bool{true, true, true, false, true, true} 134 135 var field arrow.Field 136 if expected { 137 field = arrow.Field{Name: "date", Type: arrow.FixedWidthTypes.Date32, Nullable: true} 138 } else { 139 field = arrow.Field{Name: "date", Type: arrow.FixedWidthTypes.Date64, Nullable: true} 140 } 141 142 field.Metadata = arrow.NewMetadata([]string{"PARQUET:field_id"}, []string{"1"}) 143 144 arrsc := arrow.NewSchema([]arrow.Field{field}, nil) 145 146 d32Values := []arrow.Date32{1489269000, 1489270000, 1489271000, 1489272000, 1489272000, 1489273000} 147 148 d64Values := make([]arrow.Date64, len(d32Values)) 149 for i := range d64Values { 150 // Calculate number of milliseconds at date boundary 151 d64Values[i] = arrow.Date64(int64(d32Values[i]) * millisPerDay) 152 if partialDays { 153 // Offset 1 or more hours past the date boundary 154 hoursIntoDay := int64(i) * millisPerHour 155 d64Values[i] += arrow.Date64(hoursIntoDay) 156 } 157 } 158 159 bldr := array.NewRecordBuilder(mem, arrsc) 160 defer bldr.Release() 161 162 if expected { 163 bldr.Field(0).(*array.Date32Builder).AppendValues(d32Values, isValid) 164 } else { 165 bldr.Field(0).(*array.Date64Builder).AppendValues(d64Values, isValid) 166 } 167 168 rec := bldr.NewRecord() 169 defer rec.Release() 170 171 return array.NewTableFromRecords(arrsc, []arrow.Record{rec}) 172 } 173 174 func makeTimestampTypeTable(mem memory.Allocator, expected bool) arrow.Table { 175 isValid := []bool{true, true, true, false, true, true} 176 177 // Timestamp with relative (i.e. local) semantics. Make sure it roundtrips without being incorrectly converted to an absolute point in time. 178 f0 := arrow.Field{Name: "f0", Type: &arrow.TimestampType{Unit: arrow.Millisecond}, Nullable: true, Metadata: arrow.NewMetadata([]string{"PARQUET:field_id"}, []string{"1"})} 179 180 // Timestamp with absolute (i.e. instant) semantics. The physical representation is always from Unix epoch in UTC timezone. 
181 // TimeZone is used for display purposes and can be stripped on roundtrip without changing the actual instant referred to. 182 // WithStoreSchema will preserve the original timezone, but the instant in will be equivalent even if it's not used. 183 f1 := arrow.Field{Name: "f1", Type: &arrow.TimestampType{Unit: arrow.Millisecond, TimeZone: "EST"}, Nullable: true, Metadata: arrow.NewMetadata([]string{"PARQUET:field_id"}, []string{"2"})} 184 f1X := arrow.Field{Name: "f1", Type: &arrow.TimestampType{Unit: arrow.Millisecond, TimeZone: "UTC"}, Nullable: true, Metadata: arrow.NewMetadata([]string{"PARQUET:field_id"}, []string{"2"})} 185 186 fieldList := []arrow.Field{f0} 187 if expected { 188 fieldList = append(fieldList, f1X) 189 } else { 190 fieldList = append(fieldList, f1) 191 } 192 193 arrsc := arrow.NewSchema(fieldList, nil) 194 195 ts64msValues := []arrow.Timestamp{1489269, 1489270, 1489271, 1489272, 1489272, 1489273} 196 197 bldr := array.NewRecordBuilder(mem, arrsc) 198 defer bldr.Release() 199 200 bldr.Field(0).(*array.TimestampBuilder).AppendValues(ts64msValues, isValid) 201 bldr.Field(1).(*array.TimestampBuilder).AppendValues(ts64msValues, isValid) 202 203 rec := bldr.NewRecord() 204 defer rec.Release() 205 206 return array.NewTableFromRecords(arrsc, []arrow.Record{rec}) 207 } 208 209 func TestWriteArrowCols(t *testing.T) { 210 mem := memory.NewCheckedAllocator(memory.DefaultAllocator) 211 defer mem.AssertSize(t, 0) 212 213 tbl := makeDateTimeTypesTable(mem, false, false) 214 defer tbl.Release() 215 216 sink := encoding.NewBufferWriter(0, mem) 217 defer sink.Release() 218 219 fileWriter, err := pqarrow.NewFileWriter( 220 tbl.Schema(), 221 sink, 222 parquet.NewWriterProperties(parquet.WithVersion(parquet.V2_4)), 223 pqarrow.NewArrowWriterProperties(pqarrow.WithAllocator(mem)), 224 ) 225 require.NoError(t, err) 226 227 fileWriter.NewRowGroup() 228 for i := int64(0); i < tbl.NumCols(); i++ { 229 colChunk := tbl.Column(int(i)).Data() 230 err := 
fileWriter.WriteColumnChunked(colChunk, 0, int64(colChunk.Len())) 231 require.NoError(t, err) 232 } 233 require.NoError(t, fileWriter.Close()) 234 235 expected := makeDateTimeTypesTable(mem, true, false) 236 defer expected.Release() 237 238 reader, err := file.NewParquetReader(bytes.NewReader(sink.Bytes())) 239 require.NoError(t, err) 240 241 assert.EqualValues(t, expected.NumCols(), reader.MetaData().Schema.NumColumns()) 242 assert.EqualValues(t, expected.NumRows(), reader.NumRows()) 243 assert.EqualValues(t, 1, reader.NumRowGroups()) 244 245 rgr := reader.RowGroup(0) 246 247 for i := 0; i < int(expected.NumCols()); i++ { 248 var ( 249 total int64 250 read int 251 defLevelsOut = make([]int16, int(expected.NumRows())) 252 arr = expected.Column(i).Data().Chunk(0) 253 ) 254 switch expected.Schema().Field(i).Type.(arrow.FixedWidthDataType).BitWidth() { 255 case 32: 256 col, err := rgr.Column(i) 257 assert.NoError(t, err) 258 colReader := col.(*file.Int32ColumnChunkReader) 259 vals := make([]int32, int(expected.NumRows())) 260 total, read, err = colReader.ReadBatch(expected.NumRows(), vals, defLevelsOut, nil) 261 require.NoError(t, err) 262 263 nulls := 0 264 for j := 0; j < arr.Len(); j++ { 265 if arr.IsNull(j) { 266 nulls++ 267 continue 268 } 269 270 switch v := arr.(type) { 271 case *array.Date32: 272 assert.EqualValues(t, v.Value(j), vals[j-nulls]) 273 case *array.Time32: 274 assert.EqualValues(t, v.Value(j), vals[j-nulls]) 275 } 276 } 277 case 64: 278 col, err := rgr.Column(i) 279 assert.NoError(t, err) 280 colReader := col.(*file.Int64ColumnChunkReader) 281 vals := make([]int64, int(expected.NumRows())) 282 total, read, err = colReader.ReadBatch(expected.NumRows(), vals, defLevelsOut, nil) 283 require.NoError(t, err) 284 285 nulls := 0 286 for j := 0; j < arr.Len(); j++ { 287 if arr.IsNull(j) { 288 nulls++ 289 continue 290 } 291 292 switch v := arr.(type) { 293 case *array.Date64: 294 assert.EqualValues(t, v.Value(j), vals[j-nulls]) 295 case *array.Time64: 296 
assert.EqualValues(t, v.Value(j), vals[j-nulls]) 297 case *array.Timestamp: 298 assert.EqualValues(t, v.Value(j), vals[j-nulls]) 299 } 300 } 301 } 302 assert.EqualValues(t, expected.NumRows(), total) 303 assert.EqualValues(t, expected.NumRows()-1, read) 304 assert.Equal(t, []int16{1, 1, 1, 0, 1, 1}, defLevelsOut) 305 } 306 } 307 308 func TestWriteArrowInt96(t *testing.T) { 309 mem := memory.NewCheckedAllocator(memory.DefaultAllocator) 310 defer mem.AssertSize(t, 0) 311 312 tbl := makeDateTimeTypesTable(mem, false, false) 313 defer tbl.Release() 314 315 sink := encoding.NewBufferWriter(0, mem) 316 defer sink.Release() 317 318 fileWriter, err := pqarrow.NewFileWriter( 319 tbl.Schema(), 320 sink, 321 parquet.NewWriterProperties(parquet.WithAllocator(mem)), 322 pqarrow.NewArrowWriterProperties(pqarrow.WithDeprecatedInt96Timestamps(true), pqarrow.WithAllocator(mem)), 323 ) 324 require.NoError(t, err) 325 326 fileWriter.NewRowGroup() 327 for i := int64(0); i < tbl.NumCols(); i++ { 328 colChunk := tbl.Column(int(i)).Data() 329 err := fileWriter.WriteColumnChunked(colChunk, 0, int64(colChunk.Len())) 330 require.NoError(t, err) 331 } 332 require.NoError(t, fileWriter.Close()) 333 334 expected := makeDateTimeTypesTable(mem, false, false) 335 defer expected.Release() 336 337 reader, err := file.NewParquetReader(bytes.NewReader(sink.Bytes())) 338 require.NoError(t, err) 339 340 assert.EqualValues(t, expected.NumCols(), reader.MetaData().Schema.NumColumns()) 341 assert.EqualValues(t, expected.NumRows(), reader.NumRows()) 342 assert.EqualValues(t, 1, reader.NumRowGroups()) 343 344 rgr := reader.RowGroup(0) 345 tsRdr, err := rgr.Column(3) 346 assert.NoError(t, err) 347 assert.Equal(t, parquet.Types.Int96, tsRdr.Type()) 348 349 rdr := tsRdr.(*file.Int96ColumnChunkReader) 350 vals := make([]parquet.Int96, expected.NumRows()) 351 defLevels := make([]int16, int(expected.NumRows())) 352 353 total, read, _ := rdr.ReadBatch(expected.NumRows(), vals, defLevels, nil) 354 
assert.EqualValues(t, expected.NumRows(), total) 355 assert.EqualValues(t, expected.NumRows()-1, read) 356 assert.Equal(t, []int16{1, 1, 1, 0, 1, 1}, defLevels) 357 358 data := expected.Column(3).Data().Chunk(0).(*array.Timestamp) 359 assert.EqualValues(t, data.Value(0), vals[0].ToTime().UnixNano()) 360 assert.EqualValues(t, data.Value(1), vals[1].ToTime().UnixNano()) 361 assert.EqualValues(t, data.Value(2), vals[2].ToTime().UnixNano()) 362 assert.EqualValues(t, data.Value(4), vals[3].ToTime().UnixNano()) 363 assert.EqualValues(t, data.Value(5), vals[4].ToTime().UnixNano()) 364 } 365 366 func writeTableToBuffer(t *testing.T, mem memory.Allocator, tbl arrow.Table, rowGroupSize int64, props pqarrow.ArrowWriterProperties) *memory.Buffer { 367 sink := encoding.NewBufferWriter(0, mem) 368 defer sink.Release() 369 370 fileWriter, err := pqarrow.NewFileWriter( 371 tbl.Schema(), 372 sink, 373 parquet.NewWriterProperties(parquet.WithVersion(parquet.V1_0)), 374 props, 375 ) 376 require.NoError(t, err) 377 378 offset := int64(0) 379 for offset < tbl.NumRows() { 380 sz := utils.Min(rowGroupSize, tbl.NumRows()-offset) 381 fileWriter.NewRowGroup() 382 for i := 0; i < int(tbl.NumCols()); i++ { 383 colChunk := tbl.Column(i).Data() 384 err := fileWriter.WriteColumnChunked(colChunk, 0, int64(colChunk.Len())) 385 require.NoError(t, err) 386 } 387 offset += sz 388 } 389 390 require.NoError(t, fileWriter.Close()) 391 return sink.Finish() 392 } 393 394 func simpleRoundTrip(t *testing.T, tbl arrow.Table, rowGroupSize int64) { 395 t.Helper() 396 mem := memory.NewCheckedAllocator(memory.DefaultAllocator) 397 defer mem.AssertSize(t, 0) 398 399 buf := writeTableToBuffer(t, mem, tbl, rowGroupSize, pqarrow.NewArrowWriterProperties(pqarrow.WithAllocator(mem))) 400 defer buf.Release() 401 402 rdr, err := file.NewParquetReader(bytes.NewReader(buf.Bytes())) 403 require.NoError(t, err) 404 405 ardr, err := pqarrow.NewFileReader(rdr, pqarrow.ArrowReadProperties{}, mem) 406 require.NoError(t, err) 
407 408 for i := 0; i < int(tbl.NumCols()); i++ { 409 crdr, err := ardr.GetColumn(context.TODO(), i) 410 require.NoError(t, err) 411 412 chunked, err := crdr.NextBatch(tbl.NumRows()) 413 require.NoError(t, err) 414 defer chunked.Release() 415 416 require.EqualValues(t, tbl.NumRows(), chunked.Len()) 417 418 chunkList := tbl.Column(i).Data().Chunks() 419 offset := int64(0) 420 for _, chnk := range chunkList { 421 slc := array.NewChunkedSlice(chunked, offset, offset+int64(chnk.Len())) 422 defer slc.Release() 423 424 assert.EqualValues(t, chnk.Len(), slc.Len()) 425 if len(slc.Chunks()) == 1 { 426 offset += int64(chnk.Len()) 427 assert.True(t, array.Equal(chnk, slc.Chunk(0))) 428 } 429 } 430 crdr.Release() 431 } 432 } 433 434 func TestWriteKeyValueMetadata(t *testing.T) { 435 kv := map[string]string{ 436 "key1": "value1", 437 "key2": "value2", 438 "key3": "value3", 439 } 440 441 sc := arrow.NewSchema([]arrow.Field{ 442 {Name: "int32", Type: arrow.PrimitiveTypes.Int32, Nullable: true}, 443 }, nil) 444 bldr := array.NewRecordBuilder(memory.DefaultAllocator, sc) 445 defer bldr.Release() 446 for _, b := range bldr.Fields() { 447 b.AppendNull() 448 } 449 450 rec := bldr.NewRecord() 451 defer rec.Release() 452 453 props := parquet.NewWriterProperties( 454 parquet.WithVersion(parquet.V1_0), 455 ) 456 var buf bytes.Buffer 457 fw, err := pqarrow.NewFileWriter(sc, &buf, props, pqarrow.DefaultWriterProps()) 458 require.NoError(t, err) 459 err = fw.Write(rec) 460 require.NoError(t, err) 461 462 for key, value := range kv { 463 require.NoError(t, fw.AppendKeyValueMetadata(key, value)) 464 } 465 466 err = fw.Close() 467 require.NoError(t, err) 468 469 reader, err := file.NewParquetReader(bytes.NewReader(buf.Bytes())) 470 require.NoError(t, err) 471 472 for key, value := range kv { 473 got := reader.MetaData().KeyValueMetadata().FindValue(key) 474 require.NotNil(t, got) 475 assert.Equal(t, value, *got) 476 } 477 } 478 479 func TestWriteEmptyLists(t *testing.T) { 480 sc := 
arrow.NewSchema([]arrow.Field{ 481 {Name: "f1", Type: arrow.ListOf(arrow.FixedWidthTypes.Date32)}, 482 {Name: "f2", Type: arrow.ListOf(arrow.FixedWidthTypes.Date64)}, 483 {Name: "f3", Type: arrow.ListOf(arrow.FixedWidthTypes.Timestamp_us)}, 484 {Name: "f4", Type: arrow.ListOf(arrow.FixedWidthTypes.Timestamp_ms)}, 485 {Name: "f5", Type: arrow.ListOf(arrow.FixedWidthTypes.Time32ms)}, 486 {Name: "f6", Type: arrow.ListOf(arrow.FixedWidthTypes.Time64ns)}, 487 {Name: "f7", Type: arrow.ListOf(arrow.FixedWidthTypes.Time64us)}, 488 }, nil) 489 bldr := array.NewRecordBuilder(memory.DefaultAllocator, sc) 490 defer bldr.Release() 491 for _, b := range bldr.Fields() { 492 b.AppendNull() 493 } 494 495 rec := bldr.NewRecord() 496 defer rec.Release() 497 498 props := parquet.NewWriterProperties( 499 parquet.WithVersion(parquet.V1_0), 500 ) 501 arrprops := pqarrow.DefaultWriterProps() 502 var buf bytes.Buffer 503 fw, err := pqarrow.NewFileWriter(sc, &buf, props, arrprops) 504 require.NoError(t, err) 505 err = fw.Write(rec) 506 require.NoError(t, err) 507 err = fw.Close() 508 require.NoError(t, err) 509 } 510 511 func TestWriteAllNullsWithDeltaEncoding(t *testing.T) { 512 sc := arrow.NewSchema([]arrow.Field{ 513 {Name: "f1", Type: arrow.PrimitiveTypes.Int64, Nullable: true}, 514 {Name: "f2", Type: arrow.ListOf(arrow.FixedWidthTypes.Date32)}, 515 {Name: "f3", Type: arrow.BinaryTypes.String, Nullable: true}, 516 {Name: "f4", Type: arrow.ListOf(arrow.BinaryTypes.String)}, 517 {Name: "f5", Type: arrow.BinaryTypes.LargeString, Nullable: true}, 518 {Name: "f6", Type: arrow.ListOf(arrow.BinaryTypes.LargeString)}, 519 {Name: "f7", Type: arrow.PrimitiveTypes.Float64, Nullable: true}, 520 {Name: "f8", Type: arrow.ListOf(arrow.FixedWidthTypes.Date64)}, 521 {Name: "f9", Type: arrow.BinaryTypes.String, Nullable: true}, 522 {Name: "f10", Type: arrow.ListOf(arrow.BinaryTypes.LargeString)}, 523 {Name: "f11", Type: arrow.FixedWidthTypes.Boolean, Nullable: true}, 524 {Name: "f12", Type: 
arrow.ListOf(arrow.FixedWidthTypes.Boolean)}, 525 {Name: "f13", Type: arrow.PrimitiveTypes.Int32, Nullable: true}, 526 {Name: "f14", Type: arrow.ListOf(arrow.PrimitiveTypes.Float32)}, 527 }, nil) 528 bldr := array.NewRecordBuilder(memory.DefaultAllocator, sc) 529 defer bldr.Release() 530 for _, b := range bldr.Fields() { 531 b.AppendNull() 532 } 533 534 rec := bldr.NewRecord() 535 defer rec.Release() 536 537 props := parquet.NewWriterProperties( 538 parquet.WithVersion(parquet.V1_0), 539 parquet.WithDictionaryDefault(false), 540 parquet.WithDictionaryFor("f9", true), 541 parquet.WithDictionaryFor("f10", true), 542 parquet.WithDictionaryFor("f13", true), 543 parquet.WithDictionaryFor("f14", true), 544 parquet.WithEncodingFor("f1", parquet.Encodings.DeltaBinaryPacked), 545 parquet.WithEncodingFor("f2", parquet.Encodings.DeltaBinaryPacked), 546 parquet.WithEncodingFor("f3", parquet.Encodings.DeltaByteArray), 547 parquet.WithEncodingFor("f4", parquet.Encodings.DeltaByteArray), 548 parquet.WithEncodingFor("f5", parquet.Encodings.DeltaLengthByteArray), 549 parquet.WithEncodingFor("f6", parquet.Encodings.DeltaLengthByteArray), 550 parquet.WithEncodingFor("f7", parquet.Encodings.Plain), 551 parquet.WithEncodingFor("f8", parquet.Encodings.Plain), 552 parquet.WithEncodingFor("f9", parquet.Encodings.Plain), 553 parquet.WithEncodingFor("f10", parquet.Encodings.Plain), 554 parquet.WithEncodingFor("f11", parquet.Encodings.RLE), 555 parquet.WithEncodingFor("f12", parquet.Encodings.RLE), 556 parquet.WithEncodingFor("f13", parquet.Encodings.RLE), 557 parquet.WithEncodingFor("f14", parquet.Encodings.RLE), 558 ) 559 arrprops := pqarrow.DefaultWriterProps() 560 var buf bytes.Buffer 561 fw, err := pqarrow.NewFileWriter(sc, &buf, props, arrprops) 562 require.NoError(t, err) 563 err = fw.Write(rec) 564 require.NoError(t, err) 565 err = fw.Close() 566 require.NoError(t, err) 567 } 568 569 func TestArrowReadWriteTableChunkedCols(t *testing.T) { 570 chunkSizes := []int{2, 4, 10, 2} 571 
const totalLen = int64(18) 572 573 rng := testutils.NewRandomArrayGenerator(0) 574 575 arr := rng.Int32(totalLen, 0, math.MaxInt32/2, 0.9) 576 defer arr.Release() 577 578 offset := int64(0) 579 chunks := make([]arrow.Array, 0) 580 for _, chnksize := range chunkSizes { 581 chk := array.NewSlice(arr, offset, offset+int64(chnksize)) 582 defer chk.Release() 583 defer chk.Release() // for NewChunked below 584 chunks = append(chunks, chk) 585 } 586 587 sc := arrow.NewSchema([]arrow.Field{{Name: "field", Type: arr.DataType(), Nullable: true}}, nil) 588 589 chk := arrow.NewChunked(arr.DataType(), chunks) 590 defer chk.Release() 591 592 tbl := array.NewTable(sc, []arrow.Column{*arrow.NewColumn(sc.Field(0), chk)}, -1) 593 defer tbl.Release() 594 595 simpleRoundTrip(t, tbl, 2) 596 simpleRoundTrip(t, tbl, 10) 597 } 598 599 // set this up for checking our expected results so we can test the functions 600 // that generate them which we export 601 func getLogicalType(typ arrow.DataType) schema.LogicalType { 602 switch typ.ID() { 603 case arrow.DICTIONARY: 604 return getLogicalType(typ.(*arrow.DictionaryType).ValueType) 605 case arrow.INT8: 606 return schema.NewIntLogicalType(8, true) 607 case arrow.UINT8: 608 return schema.NewIntLogicalType(8, false) 609 case arrow.INT16: 610 return schema.NewIntLogicalType(16, true) 611 case arrow.UINT16: 612 return schema.NewIntLogicalType(16, false) 613 case arrow.INT32: 614 return schema.NewIntLogicalType(32, true) 615 case arrow.UINT32: 616 return schema.NewIntLogicalType(32, false) 617 case arrow.INT64: 618 return schema.NewIntLogicalType(64, true) 619 case arrow.UINT64: 620 return schema.NewIntLogicalType(64, false) 621 case arrow.STRING, arrow.LARGE_STRING: 622 return schema.StringLogicalType{} 623 case arrow.DATE32: 624 return schema.DateLogicalType{} 625 case arrow.DATE64: 626 return schema.DateLogicalType{} 627 case arrow.FLOAT16: 628 return schema.Float16LogicalType{} 629 case arrow.TIMESTAMP: 630 ts := typ.(*arrow.TimestampType) 631 
adjustedUTC := len(ts.TimeZone) == 0 632 switch ts.Unit { 633 case arrow.Microsecond: 634 return schema.NewTimestampLogicalType(adjustedUTC, schema.TimeUnitMicros) 635 case arrow.Millisecond: 636 return schema.NewTimestampLogicalType(adjustedUTC, schema.TimeUnitMillis) 637 case arrow.Nanosecond: 638 return schema.NewTimestampLogicalType(adjustedUTC, schema.TimeUnitNanos) 639 default: 640 panic("only milli, micro and nano units supported for arrow timestamp") 641 } 642 case arrow.TIME32: 643 return schema.NewTimeLogicalType(false, schema.TimeUnitMillis) 644 case arrow.TIME64: 645 ts := typ.(*arrow.Time64Type) 646 switch ts.Unit { 647 case arrow.Microsecond: 648 return schema.NewTimeLogicalType(false, schema.TimeUnitMicros) 649 case arrow.Nanosecond: 650 return schema.NewTimeLogicalType(false, schema.TimeUnitNanos) 651 default: 652 panic("only micro and nano seconds are supported for arrow TIME64") 653 } 654 case arrow.DECIMAL, arrow.DECIMAL256: 655 dec := typ.(arrow.DecimalType) 656 return schema.NewDecimalLogicalType(dec.GetPrecision(), dec.GetScale()) 657 } 658 return schema.NoLogicalType{} 659 } 660 661 func getPhysicalType(typ arrow.DataType) parquet.Type { 662 switch typ.ID() { 663 case arrow.DICTIONARY: 664 return getPhysicalType(typ.(*arrow.DictionaryType).ValueType) 665 case arrow.BOOL: 666 return parquet.Types.Boolean 667 case arrow.UINT8, arrow.INT8, arrow.UINT16, arrow.INT16, arrow.UINT32, arrow.INT32: 668 return parquet.Types.Int32 669 case arrow.INT64, arrow.UINT64: 670 return parquet.Types.Int64 671 case arrow.FLOAT32: 672 return parquet.Types.Float 673 case arrow.FLOAT64: 674 return parquet.Types.Double 675 case arrow.FLOAT16: 676 return parquet.Types.FixedLenByteArray 677 case arrow.BINARY, arrow.LARGE_BINARY, arrow.STRING, arrow.LARGE_STRING: 678 return parquet.Types.ByteArray 679 case arrow.FIXED_SIZE_BINARY, arrow.DECIMAL: 680 return parquet.Types.FixedLenByteArray 681 case arrow.DATE32: 682 return parquet.Types.Int32 683 case arrow.DATE64: 684 // 
convert to date32 internally 685 return parquet.Types.Int32 686 case arrow.TIME32: 687 return parquet.Types.Int32 688 case arrow.TIME64, arrow.TIMESTAMP: 689 return parquet.Types.Int64 690 default: 691 return parquet.Types.Int32 692 } 693 } 694 695 const ( 696 boolTestValue = true 697 uint8TestVal = uint8(64) 698 int8TestVal = int8(-64) 699 uint16TestVal = uint16(1024) 700 int16TestVal = int16(-1024) 701 uint32TestVal = uint32(1024) 702 int32TestVal = int32(-1024) 703 uint64TestVal = uint64(1024) 704 int64TestVal = int64(-1024) 705 tsTestValue = arrow.Timestamp(14695634030000) 706 date32TestVal = arrow.Date32(170000) 707 floatTestVal = float32(2.1) 708 doubleTestVal = float64(4.2) 709 strTestVal = "Test" 710 711 smallSize = 100 712 ) 713 714 type ParquetIOTestSuite struct { 715 suite.Suite 716 } 717 718 func (ps *ParquetIOTestSuite) SetupTest() { 719 ps.NoError(arrow.RegisterExtensionType(types.NewUUIDType())) 720 } 721 722 func (ps *ParquetIOTestSuite) TearDownTest() { 723 if arrow.GetExtensionType("uuid") != nil { 724 ps.NoError(arrow.UnregisterExtensionType("uuid")) 725 } 726 } 727 728 func (ps *ParquetIOTestSuite) makeSimpleSchema(typ arrow.DataType, rep parquet.Repetition) *schema.GroupNode { 729 byteWidth := int32(-1) 730 731 switch typ := typ.(type) { 732 case *arrow.FixedSizeBinaryType: 733 byteWidth = int32(typ.ByteWidth) 734 case arrow.DecimalType: 735 byteWidth = pqarrow.DecimalSize(typ.GetPrecision()) 736 case *arrow.Float16Type: 737 byteWidth = int32(typ.Bytes()) 738 case *arrow.DictionaryType: 739 valuesType := typ.ValueType 740 switch dt := valuesType.(type) { 741 case *arrow.FixedSizeBinaryType: 742 byteWidth = int32(dt.ByteWidth) 743 case arrow.DecimalType: 744 byteWidth = pqarrow.DecimalSize(dt.GetPrecision()) 745 case *arrow.Float16Type: 746 byteWidth = int32(typ.Bytes()) 747 } 748 } 749 750 pnode, _ := schema.NewPrimitiveNodeLogical("column1", rep, getLogicalType(typ), getPhysicalType(typ), int(byteWidth), -1) 751 return 
schema.MustGroup(schema.NewGroupNode("schema", parquet.Repetitions.Required, schema.FieldList{pnode}, -1)) 752 } 753 754 func (ps *ParquetIOTestSuite) makePrimitiveTestCol(mem memory.Allocator, size int, typ arrow.DataType) arrow.Array { 755 switch typ.ID() { 756 case arrow.BOOL: 757 bldr := array.NewBooleanBuilder(mem) 758 defer bldr.Release() 759 for i := 0; i < size; i++ { 760 bldr.Append(boolTestValue) 761 } 762 return bldr.NewArray() 763 case arrow.INT8: 764 bldr := array.NewInt8Builder(mem) 765 defer bldr.Release() 766 for i := 0; i < size; i++ { 767 bldr.Append(int8TestVal) 768 } 769 return bldr.NewArray() 770 case arrow.UINT8: 771 bldr := array.NewUint8Builder(mem) 772 defer bldr.Release() 773 for i := 0; i < size; i++ { 774 bldr.Append(uint8TestVal) 775 } 776 return bldr.NewArray() 777 case arrow.INT16: 778 bldr := array.NewInt16Builder(mem) 779 defer bldr.Release() 780 for i := 0; i < size; i++ { 781 bldr.Append(int16TestVal) 782 } 783 return bldr.NewArray() 784 case arrow.UINT16: 785 bldr := array.NewUint16Builder(mem) 786 defer bldr.Release() 787 for i := 0; i < size; i++ { 788 bldr.Append(uint16TestVal) 789 } 790 return bldr.NewArray() 791 case arrow.INT32: 792 bldr := array.NewInt32Builder(mem) 793 defer bldr.Release() 794 for i := 0; i < size; i++ { 795 bldr.Append(int32TestVal) 796 } 797 return bldr.NewArray() 798 case arrow.UINT32: 799 bldr := array.NewUint32Builder(mem) 800 defer bldr.Release() 801 for i := 0; i < size; i++ { 802 bldr.Append(uint32TestVal) 803 } 804 return bldr.NewArray() 805 case arrow.INT64: 806 bldr := array.NewInt64Builder(mem) 807 defer bldr.Release() 808 for i := 0; i < size; i++ { 809 bldr.Append(int64TestVal) 810 } 811 return bldr.NewArray() 812 case arrow.UINT64: 813 bldr := array.NewUint64Builder(mem) 814 defer bldr.Release() 815 for i := 0; i < size; i++ { 816 bldr.Append(uint64TestVal) 817 } 818 return bldr.NewArray() 819 case arrow.FLOAT32: 820 bldr := array.NewFloat32Builder(mem) 821 defer bldr.Release() 822 for i := 
0; i < size; i++ { 823 bldr.Append(floatTestVal) 824 } 825 return bldr.NewArray() 826 case arrow.FLOAT64: 827 bldr := array.NewFloat64Builder(mem) 828 defer bldr.Release() 829 for i := 0; i < size; i++ { 830 bldr.Append(doubleTestVal) 831 } 832 return bldr.NewArray() 833 } 834 return nil 835 } 836 837 func (ps *ParquetIOTestSuite) makeTestFile(mem memory.Allocator, typ arrow.DataType, arr arrow.Array, numChunks int) []byte { 838 sc := ps.makeSimpleSchema(typ, parquet.Repetitions.Required) 839 sink := encoding.NewBufferWriter(0, mem) 840 defer sink.Release() 841 writer := file.NewParquetWriter(sink, sc, file.WithWriterProps(parquet.NewWriterProperties(parquet.WithAllocator(mem)))) 842 843 props := pqarrow.NewArrowWriterProperties(pqarrow.WithAllocator(mem)) 844 ctx := pqarrow.NewArrowWriteContext(context.TODO(), &props) 845 rowGroupSize := arr.Len() / numChunks 846 847 for i := 0; i < numChunks; i++ { 848 rgw := writer.AppendRowGroup() 849 cw, err := rgw.NextColumn() 850 ps.NoError(err) 851 852 start := i * rowGroupSize 853 slc := array.NewSlice(arr, int64(start), int64(start+rowGroupSize)) 854 defer slc.Release() 855 ps.NoError(pqarrow.WriteArrowToColumn(ctx, cw, slc, nil, nil, false)) 856 ps.NoError(cw.Close()) 857 ps.NoError(rgw.Close()) 858 } 859 ps.NoError(writer.Close()) 860 buf := sink.Finish() 861 defer buf.Release() 862 return buf.Bytes() 863 } 864 865 func (ps *ParquetIOTestSuite) createReader(mem memory.Allocator, data []byte) *pqarrow.FileReader { 866 rdr, err := file.NewParquetReader(bytes.NewReader(data), file.WithReadProps(parquet.NewReaderProperties(mem))) 867 ps.NoError(err) 868 869 reader, err := pqarrow.NewFileReader(rdr, pqarrow.ArrowReadProperties{}, mem) 870 ps.NoError(err) 871 return reader 872 } 873 874 func (ps *ParquetIOTestSuite) readTable(rdr *pqarrow.FileReader) arrow.Table { 875 tbl, err := rdr.ReadTable(context.TODO()) 876 ps.NoError(err) 877 ps.NotNil(tbl) 878 return tbl 879 } 880 881 func (ps *ParquetIOTestSuite) 
checkSingleColumnRequiredTableRead(mem memory.Allocator, typ arrow.DataType, numChunks int) { 882 values := ps.makePrimitiveTestCol(mem, smallSize, typ) 883 defer values.Release() 884 885 data := ps.makeTestFile(mem, typ, values, numChunks) 886 reader := ps.createReader(mem, data) 887 888 tbl := ps.readTable(reader) 889 defer tbl.Release() 890 891 ps.EqualValues(1, tbl.NumCols()) 892 ps.EqualValues(smallSize, tbl.NumRows()) 893 894 chunked := tbl.Column(0).Data() 895 ps.Len(chunked.Chunks(), 1) 896 ps.True(array.Equal(values, chunked.Chunk(0))) 897 } 898 899 func (ps *ParquetIOTestSuite) checkSingleColumnRead(mem memory.Allocator, typ arrow.DataType, numChunks int) { 900 values := ps.makePrimitiveTestCol(mem, smallSize, typ) 901 defer values.Release() 902 903 data := ps.makeTestFile(mem, typ, values, numChunks) 904 reader := ps.createReader(mem, data) 905 906 cr, err := reader.GetColumn(context.TODO(), 0) 907 ps.NoError(err) 908 defer cr.Release() 909 910 chunked, err := cr.NextBatch(smallSize) 911 ps.NoError(err) 912 defer chunked.Release() 913 914 ps.Len(chunked.Chunks(), 1) 915 ps.True(array.Equal(values, chunked.Chunk(0))) 916 } 917 918 func (ps *ParquetIOTestSuite) TestDateTimeTypesReadWriteTable() { 919 mem := memory.NewCheckedAllocator(memory.DefaultAllocator) 920 defer mem.AssertSize(ps.T(), 0) 921 922 toWrite := makeDateTimeTypesTable(mem, false, true) 923 defer toWrite.Release() 924 buf := writeTableToBuffer(ps.T(), mem, toWrite, toWrite.NumRows(), pqarrow.NewArrowWriterProperties(pqarrow.WithAllocator(mem))) 925 defer buf.Release() 926 927 reader := ps.createReader(mem, buf.Bytes()) 928 tbl := ps.readTable(reader) 929 defer tbl.Release() 930 931 expected := makeDateTimeTypesTable(mem, true, true) 932 defer expected.Release() 933 934 ps.Equal(expected.NumCols(), tbl.NumCols()) 935 ps.Equal(expected.NumRows(), tbl.NumRows()) 936 ps.Truef(expected.Schema().Equal(tbl.Schema()), "expected schema: %s\ngot schema: %s", expected.Schema(), tbl.Schema()) 937 938 
for i := 0; i < int(expected.NumCols()); i++ {
		// Compare column-by-column for a clearer failure message than a
		// whole-table comparison.
		exChunk := expected.Column(i).Data()
		tblChunk := tbl.Column(i).Data()

		ps.Equal(len(exChunk.Chunks()), len(tblChunk.Chunks()))
		ps.Truef(array.Equal(exChunk.Chunk(0), tblChunk.Chunk(0)), "expected %s\ngot %s", exChunk.Chunk(0), tblChunk.Chunk(0))
	}
}

// TestDateTimeTypesWithInt96ReadWriteTable round-trips the same date/time
// table using deprecated int96 timestamps; here the original table is the
// expected result (int96 preserves nanosecond timestamps unchanged).
func (ps *ParquetIOTestSuite) TestDateTimeTypesWithInt96ReadWriteTable() {
	mem := memory.NewCheckedAllocator(memory.DefaultAllocator)
	defer mem.AssertSize(ps.T(), 0)

	expected := makeDateTimeTypesTable(mem, false, true)
	defer expected.Release()
	buf := writeTableToBuffer(ps.T(), mem, expected, expected.NumRows(), pqarrow.NewArrowWriterProperties(pqarrow.WithDeprecatedInt96Timestamps(true)))
	defer buf.Release()

	reader := ps.createReader(mem, buf.Bytes())
	tbl := ps.readTable(reader)
	defer tbl.Release()

	ps.Equal(expected.NumCols(), tbl.NumCols())
	ps.Equal(expected.NumRows(), tbl.NumRows())
	ps.Truef(expected.Schema().Equal(tbl.Schema()), "expected schema: %s\ngot schema: %s", expected.Schema(), tbl.Schema())

	for i := 0; i < int(expected.NumCols()); i++ {
		exChunk := expected.Column(i).Data()
		tblChunk := tbl.Column(i).Data()

		ps.Equal(len(exChunk.Chunks()), len(tblChunk.Chunks()))
		ps.Truef(array.Equal(exChunk.Chunk(0), tblChunk.Chunk(0)), "expected %s\ngot %s", exChunk.Chunk(0), tblChunk.Chunk(0))
	}
}

// TestDate64ReadWriteTable writes date64 data and expects it to come back
// as date32 (makeDateTypeTable's expected=true variant).
func (ps *ParquetIOTestSuite) TestDate64ReadWriteTable() {
	mem := memory.NewCheckedAllocator(memory.DefaultAllocator)
	defer mem.AssertSize(ps.T(), 0)

	date64InputTable := makeDateTypeTable(mem, false, false)
	defer date64InputTable.Release()
	buf := writeTableToBuffer(ps.T(), mem, date64InputTable, date64InputTable.NumRows(), pqarrow.NewArrowWriterProperties(pqarrow.WithAllocator(mem)))
	defer buf.Release()

	reader := ps.createReader(mem, buf.Bytes())
	roundTripOutputTable := ps.readTable(reader)
	defer
roundTripOutputTable.Release()

	date32ExpectedOutputTable := makeDateTypeTable(mem, true, false)
	defer date32ExpectedOutputTable.Release()

	ps.Truef(array.TableEqual(date32ExpectedOutputTable, roundTripOutputTable), "expected table: %s\ngot table: %s", date32ExpectedOutputTable, roundTripOutputTable)
}

// TestTimestampTZReadWriteTable round-trips timezone-aware timestamps; the
// expected table comes from makeTimestampTypeTable's expected=true variant.
func (ps *ParquetIOTestSuite) TestTimestampTZReadWriteTable() {
	mem := memory.NewCheckedAllocator(memory.DefaultAllocator)
	defer mem.AssertSize(ps.T(), 0)

	inputTable := makeTimestampTypeTable(mem, false)
	defer inputTable.Release()
	buf := writeTableToBuffer(ps.T(), mem, inputTable, inputTable.NumRows(), pqarrow.NewArrowWriterProperties(pqarrow.WithAllocator(mem)))
	defer buf.Release()

	reader := ps.createReader(mem, buf.Bytes())
	roundTripOutputTable := ps.readTable(reader)
	defer roundTripOutputTable.Release()

	expectedOutputTable := makeTimestampTypeTable(mem, true)
	defer expectedOutputTable.Release()

	ps.Truef(array.TableEqual(expectedOutputTable, roundTripOutputTable), "expected table: %s\ngot table: %s", expectedOutputTable, roundTripOutputTable)
}

// TestDate64ReadWriteTableWithPartialDays is the same date64->date32
// round trip but with input values not aligned to day boundaries.
func (ps *ParquetIOTestSuite) TestDate64ReadWriteTableWithPartialDays() {
	mem := memory.NewCheckedAllocator(memory.DefaultAllocator)
	defer mem.AssertSize(ps.T(), 0)

	date64InputTableNotAlignedToDateBoundary := makeDateTypeTable(mem, false, true)
	defer date64InputTableNotAlignedToDateBoundary.Release()
	buf := writeTableToBuffer(ps.T(), mem, date64InputTableNotAlignedToDateBoundary, date64InputTableNotAlignedToDateBoundary.NumRows(), pqarrow.NewArrowWriterProperties(pqarrow.WithAllocator(mem)))
	defer buf.Release()

	reader := ps.createReader(mem, buf.Bytes())
	roundTripOutputTable := ps.readTable(reader)
	defer roundTripOutputTable.Release()

	date32ExpectedOutputTable := makeDateTypeTable(mem, true, true)
	defer date32ExpectedOutputTable.Release()

	ps.Truef(array.TableEqual(date32ExpectedOutputTable, roundTripOutputTable), "expected table: %s\ngot table: %s", date32ExpectedOutputTable, roundTripOutputTable)
}

// TestTimestampTZStoreSchemaReadWriteTable verifies that writing with
// WithStoreSchema() lets the reader reproduce the input table exactly,
// so the input itself is the expected output.
func (ps *ParquetIOTestSuite) TestTimestampTZStoreSchemaReadWriteTable() {
	mem := memory.NewCheckedAllocator(memory.DefaultAllocator)
	defer mem.AssertSize(ps.T(), 0)

	inputTable := makeTimestampTypeTable(mem, false)
	defer inputTable.Release()
	buf := writeTableToBuffer(ps.T(), mem, inputTable, inputTable.NumRows(), pqarrow.NewArrowWriterProperties(pqarrow.WithAllocator(mem), pqarrow.WithStoreSchema()))
	defer buf.Release()

	reader := ps.createReader(mem, buf.Bytes())
	roundTripOutputTable := ps.readTable(reader)
	defer roundTripOutputTable.Release()

	ps.Truef(array.TableEqual(inputTable, roundTripOutputTable), "expected table: %s\ngot table: %s", inputTable, roundTripOutputTable)
}

// TestLargeBinaryReadWriteTable round-trips LargeString/LargeBinary columns
// with the stored schema enabled so the large types survive the round trip.
func (ps *ParquetIOTestSuite) TestLargeBinaryReadWriteTable() {
	mem := memory.NewCheckedAllocator(memory.DefaultAllocator)
	defer mem.AssertSize(ps.T(), 0)

	// While we may write using LargeString, when we read, we get an array.String back out.
// So we're building a normal array.String to use with array.Equal
	// NOTE(review): the comment above looks stale — the expected table below
	// is built from LargeString/LargeBinary arrays, and the round trip uses
	// storeSchema=true; confirm against the reader behavior.
	lsBldr := array.NewLargeStringBuilder(mem)
	defer lsBldr.Release()
	lbBldr := array.NewBinaryBuilder(mem, arrow.BinaryTypes.LargeBinary)
	defer lbBldr.Release()

	for i := 0; i < smallSize; i++ {
		s := strconv.FormatInt(int64(i), 10)
		lsBldr.Append(s)
		lbBldr.Append([]byte(s))
	}

	lsValues := lsBldr.NewArray()
	defer lsValues.Release()
	lbValues := lbBldr.NewArray()
	defer lbValues.Release()

	lsField := arrow.Field{Name: "large_string", Type: arrow.BinaryTypes.LargeString, Nullable: true}
	lbField := arrow.Field{Name: "large_binary", Type: arrow.BinaryTypes.LargeBinary, Nullable: true}
	expected := array.NewTable(
		arrow.NewSchema([]arrow.Field{lsField, lbField}, nil),
		[]arrow.Column{
			*arrow.NewColumn(lsField, arrow.NewChunked(lsField.Type, []arrow.Array{lsValues})),
			*arrow.NewColumn(lbField, arrow.NewChunked(lbField.Type, []arrow.Array{lbValues})),
		},
		-1,
	)
	// Second Release on each value array compensates for the reference
	// taken by arrow.NewChunked above.
	defer lsValues.Release() // NewChunked
	defer lbValues.Release() // NewChunked
	defer expected.Release()
	ps.roundTripTable(mem, expected, true)
}

// TestReadSingleColumnFile exercises the column-reader path for each
// primitive type, with the data split into 1 and 4 row groups.
func (ps *ParquetIOTestSuite) TestReadSingleColumnFile() {
	types := []arrow.DataType{
		arrow.FixedWidthTypes.Boolean,
		arrow.PrimitiveTypes.Uint8,
		arrow.PrimitiveTypes.Int8,
		arrow.PrimitiveTypes.Uint16,
		arrow.PrimitiveTypes.Int16,
		arrow.PrimitiveTypes.Uint32,
		arrow.PrimitiveTypes.Int32,
		arrow.PrimitiveTypes.Uint64,
		arrow.PrimitiveTypes.Int64,
		arrow.PrimitiveTypes.Float32,
		arrow.PrimitiveTypes.Float64,
	}

	nchunks := []int{1, 4}

	for _, n := range nchunks {
		for _, dt := range types {
			ps.Run(fmt.Sprintf("%s %d chunks", dt.Name(), n), func() {
				mem := memory.NewCheckedAllocator(memory.DefaultAllocator)
				defer mem.AssertSize(ps.T(), 0)

ps.checkSingleColumnRead(mem, dt, n)
			})
		}
	}
}

// TestSingleColumnRequiredRead is the ReadTable counterpart of
// TestReadSingleColumnFile, over the same type/chunk matrix.
func (ps *ParquetIOTestSuite) TestSingleColumnRequiredRead() {
	types := []arrow.DataType{
		arrow.FixedWidthTypes.Boolean,
		arrow.PrimitiveTypes.Uint8,
		arrow.PrimitiveTypes.Int8,
		arrow.PrimitiveTypes.Uint16,
		arrow.PrimitiveTypes.Int16,
		arrow.PrimitiveTypes.Uint32,
		arrow.PrimitiveTypes.Int32,
		arrow.PrimitiveTypes.Uint64,
		arrow.PrimitiveTypes.Int64,
		arrow.PrimitiveTypes.Float32,
		arrow.PrimitiveTypes.Float64,
	}

	nchunks := []int{1, 4}

	for _, n := range nchunks {
		for _, dt := range types {
			ps.Run(fmt.Sprintf("%s %d chunks", dt.Name(), n), func() {
				mem := memory.NewCheckedAllocator(memory.DefaultAllocator)
				defer mem.AssertSize(ps.T(), 0)

				ps.checkSingleColumnRequiredTableRead(mem, dt, n)
			})
		}
	}
}

// TestReadDecimals writes big-endian two's-complement byte arrays with a
// decimal(6,3) logical type and expects them to decode to decimal128.
func (ps *ParquetIOTestSuite) TestReadDecimals() {
	mem := memory.NewCheckedAllocator(memory.DefaultAllocator)
	defer mem.AssertSize(ps.T(), 0)

	bigEndian := []parquet.ByteArray{
		// 123456
		[]byte{1, 226, 64},
		// 987654
		[]byte{15, 18, 6},
		// -123456
		[]byte{255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 254, 29, 192},
	}

	bldr := array.NewDecimal128Builder(mem, &arrow.Decimal128Type{Precision: 6, Scale: 3})
	defer bldr.Release()

	bldr.Append(decimal128.FromU64(123456))
	bldr.Append(decimal128.FromU64(987654))
	bldr.Append(decimal128.FromI64(-123456))

	expected := bldr.NewDecimal128Array()
	defer expected.Release()

	sc := schema.MustGroup(schema.NewGroupNode("schema", parquet.Repetitions.Required, schema.FieldList{
		schema.Must(schema.NewPrimitiveNodeLogical("decimals", parquet.Repetitions.Required, schema.NewDecimalLogicalType(6, 3), parquet.Types.ByteArray, -1, -1)),
	}, -1))

	sink := encoding.NewBufferWriter(0, mem)
	defer
sink.Release()
	writer := file.NewParquetWriter(sink, sc)

	// Write the raw big-endian bytes through the low-level column writer.
	// NOTE(review): the Close errors below are deliberately unchecked in this
	// low-level setup path; failures would surface in the read assertions.
	rgw := writer.AppendRowGroup()
	cw, _ := rgw.NextColumn()
	cw.(*file.ByteArrayColumnChunkWriter).WriteBatch(bigEndian, nil, nil)
	cw.Close()
	rgw.Close()
	writer.Close()

	rdr := ps.createReader(mem, sink.Bytes())
	cr, err := rdr.GetColumn(context.TODO(), 0)
	ps.NoError(err)

	chunked, err := cr.NextBatch(smallSize)
	ps.NoError(err)
	defer chunked.Release()

	ps.Len(chunked.Chunks(), 1)
	ps.True(array.Equal(expected, chunked.Chunk(0)))
}

// TestReadDecimal256 is the decimal256 variant: precision 40 exceeds the
// decimal128 range, so the same byte arrays must decode to decimal256.
func (ps *ParquetIOTestSuite) TestReadDecimal256() {
	mem := memory.NewCheckedAllocator(memory.DefaultAllocator)
	defer mem.AssertSize(ps.T(), 0)

	bigEndian := []parquet.ByteArray{
		// 123456
		[]byte{1, 226, 64},
		// 987654
		[]byte{15, 18, 6},
		// -123456
		[]byte{255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 254, 29, 192},
	}

	bldr := array.NewDecimal256Builder(mem, &arrow.Decimal256Type{Precision: 40, Scale: 3})
	defer bldr.Release()

	bldr.Append(decimal256.FromU64(123456))
	bldr.Append(decimal256.FromU64(987654))
	bldr.Append(decimal256.FromI64(-123456))

	expected := bldr.NewDecimal256Array()
	defer expected.Release()

	sc := schema.MustGroup(schema.NewGroupNode("schema", parquet.Repetitions.Required, schema.FieldList{
		schema.Must(schema.NewPrimitiveNodeLogical("decimals", parquet.Repetitions.Required, schema.NewDecimalLogicalType(40, 3), parquet.Types.ByteArray, -1, -1)),
	}, -1))

	sink := encoding.NewBufferWriter(0, mem)
	defer sink.Release()
	writer := file.NewParquetWriter(sink, sc)

	rgw := writer.AppendRowGroup()
	cw, _ := rgw.NextColumn()
	cw.(*file.ByteArrayColumnChunkWriter).WriteBatch(bigEndian, nil, nil)
	cw.Close()
	rgw.Close()
	writer.Close()

	rdr := ps.createReader(mem, sink.Bytes())
	cr,
err := rdr.GetColumn(context.TODO(), 0)
	ps.NoError(err)

	chunked, err := cr.NextBatch(smallSize)
	ps.NoError(err)
	defer chunked.Release()

	ps.Len(chunked.Chunks(), 1)
	ps.Truef(array.Equal(expected, chunked.Chunk(0)), "expected: %s\ngot: %s", expected, chunked.Chunk(0))
}

// TestReadNestedStruct round-trips a struct-of-struct column containing one
// fully populated row and one null row.
func (ps *ParquetIOTestSuite) TestReadNestedStruct() {
	mem := memory.NewCheckedAllocator(memory.DefaultAllocator)
	defer mem.AssertSize(ps.T(), 0)

	dt := arrow.StructOf(arrow.Field{
		Name: "nested",
		Type: arrow.StructOf(
			arrow.Field{Name: "bool", Type: arrow.FixedWidthTypes.Boolean},
			arrow.Field{Name: "int32", Type: arrow.PrimitiveTypes.Int32},
			arrow.Field{Name: "int64", Type: arrow.PrimitiveTypes.Int64},
		),
	})
	field := arrow.Field{Name: "struct", Type: dt, Nullable: true}

	builder := array.NewStructBuilder(mem, dt)
	defer builder.Release()
	nested := builder.FieldBuilder(0).(*array.StructBuilder)

	// Row 0: fully populated; row 1: null at the outer level.
	builder.Append(true)
	nested.Append(true)
	nested.FieldBuilder(0).(*array.BooleanBuilder).Append(true)
	nested.FieldBuilder(1).(*array.Int32Builder).Append(int32(-1))
	nested.FieldBuilder(2).(*array.Int64Builder).Append(int64(-2))
	builder.AppendNull()

	arr := builder.NewStructArray()
	defer arr.Release()

	expected := array.NewTable(
		arrow.NewSchema([]arrow.Field{field}, nil),
		[]arrow.Column{*arrow.NewColumn(field, arrow.NewChunked(dt, []arrow.Array{arr}))},
		-1,
	)
	// Second Release compensates for the reference taken by NewChunked.
	defer arr.Release() // NewChunked
	defer expected.Release()
	ps.roundTripTable(mem, expected, true)
}

// writeColumn serializes a single array into a one-column parquet file using
// the schema derived from sc, and returns the file bytes.
func (ps *ParquetIOTestSuite) writeColumn(mem memory.Allocator, sc *schema.GroupNode, values arrow.Array) []byte {
	var buf bytes.Buffer
	arrsc, err := pqarrow.FromParquet(schema.NewSchema(sc), nil, nil)
	ps.NoError(err)

	writer, err := pqarrow.NewFileWriter(arrsc, &buf,
parquet.NewWriterProperties(parquet.WithDictionaryDefault(false)), pqarrow.NewArrowWriterProperties(pqarrow.WithAllocator(mem)))
	ps.NoError(err)

	writer.NewRowGroup()
	ps.NoError(writer.WriteColumnData(values))
	//defer values.Release()
	// NOTE(review): Close is called twice; presumably this checks that Close
	// is safe to call repeatedly — confirm it is intentional, otherwise the
	// second call is redundant.
	ps.NoError(writer.Close())
	ps.NoError(writer.Close())

	return buf.Bytes()
}

// readAndCheckSingleColumnFile reads column 0 of the serialized file and
// asserts it equals the values that were written.
func (ps *ParquetIOTestSuite) readAndCheckSingleColumnFile(mem memory.Allocator, data []byte, values arrow.Array) {
	reader := ps.createReader(mem, data)
	cr, err := reader.GetColumn(context.TODO(), 0)
	ps.NoError(err)
	ps.NotNil(cr)
	defer cr.Release()

	chunked, err := cr.NextBatch(smallSize)
	ps.NoError(err)
	defer chunked.Release()

	ps.Len(chunked.Chunks(), 1)
	ps.NotNil(chunked.Chunk(0))

	ps.True(array.Equal(values, chunked.Chunk(0)))
}

// fullTypeList is the set of data types exercised by the single-column
// write/read tests, including decimals at several precision/scale extremes.
var fullTypeList = []arrow.DataType{
	arrow.FixedWidthTypes.Boolean,
	arrow.PrimitiveTypes.Uint8,
	arrow.PrimitiveTypes.Int8,
	arrow.PrimitiveTypes.Uint16,
	arrow.PrimitiveTypes.Int16,
	arrow.PrimitiveTypes.Uint32,
	arrow.PrimitiveTypes.Int32,
	arrow.PrimitiveTypes.Uint64,
	arrow.PrimitiveTypes.Int64,
	arrow.FixedWidthTypes.Date32,
	arrow.PrimitiveTypes.Float32,
	arrow.PrimitiveTypes.Float64,
	arrow.FixedWidthTypes.Float16,
	arrow.BinaryTypes.String,
	arrow.BinaryTypes.Binary,
	&arrow.FixedSizeBinaryType{ByteWidth: 10},
	&arrow.Decimal128Type{Precision: 1, Scale: 0},
	&arrow.Decimal128Type{Precision: 5, Scale: 4},
	&arrow.Decimal128Type{Precision: 10, Scale: 9},
	&arrow.Decimal128Type{Precision: 19, Scale: 18},
	&arrow.Decimal128Type{Precision: 23, Scale: 22},
	&arrow.Decimal128Type{Precision: 27, Scale: 26},
	&arrow.Decimal128Type{Precision: 38, Scale: 37},
}

// TestSingleColumnRequiredWrite writes random non-null data for every type
// in fullTypeList as a required column and reads it back.
func (ps *ParquetIOTestSuite) TestSingleColumnRequiredWrite() {
	for _, dt := range fullTypeList {
		ps.Run(dt.Name(),
func() {
			mem := memory.NewCheckedAllocator(memory.DefaultAllocator)
			defer mem.AssertSize(ps.T(), 0)

			values := testutils.RandomNonNull(mem, dt, smallSize)
			defer values.Release()
			sc := ps.makeSimpleSchema(dt, parquet.Repetitions.Required)
			data := ps.writeColumn(mem, sc, values)
			ps.readAndCheckSingleColumnFile(mem, data, values)
		})
	}
}

// roundTripTable writes expected to an in-memory parquet file (optionally
// storing the arrow schema) and asserts the table read back matches
// column 0 approximately (ApproxEqual tolerates float rounding).
func (ps *ParquetIOTestSuite) roundTripTable(mem memory.Allocator, expected arrow.Table, storeSchema bool) {
	var buf bytes.Buffer
	var props pqarrow.ArrowWriterProperties
	if storeSchema {
		props = pqarrow.NewArrowWriterProperties(pqarrow.WithStoreSchema(), pqarrow.WithAllocator(mem))
	} else {
		props = pqarrow.NewArrowWriterProperties(pqarrow.WithAllocator(mem))
	}

	writeProps := parquet.NewWriterProperties(parquet.WithAllocator(mem))
	ps.Require().NoError(pqarrow.WriteTable(expected, &buf, expected.NumRows(), writeProps, props))

	reader := ps.createReader(mem, buf.Bytes())
	defer reader.ParquetReader().Close()

	tbl := ps.readTable(reader)
	defer tbl.Release()

	ps.Equal(expected.NumCols(), tbl.NumCols())
	ps.Equal(expected.NumRows(), tbl.NumRows())

	// Only column 0 is compared; callers construct single-column tables.
	exChunk := expected.Column(0).Data()
	tblChunk := tbl.Column(0).Data()

	ps.Equal(len(exChunk.Chunks()), len(tblChunk.Chunks()))
	exc := exChunk.Chunk(0)
	tbc := tblChunk.Chunk(0)
	ps.Truef(array.ApproxEqual(exc, tbc), "expected: %T %s\ngot: %T %s", exc, exc, tbc, tbc)
}

// makeEmptyListsArray builds a list<float32> array of the given length in
// which every list is empty (all offsets zero, no child values).
func makeEmptyListsArray(size int) arrow.Array {
	// allocate an offsets buffer with only zeros
	offsetsNbytes := arrow.Int32Traits.BytesRequired(size + 1)
	offsetsBuffer := make([]byte, offsetsNbytes)

	childBuffers := []*memory.Buffer{nil, nil}
	childData := array.NewData(arrow.PrimitiveTypes.Float32, 0, childBuffers, nil, 0, 0)
	defer childData.Release()
	buffers := []*memory.Buffer{nil,
memory.NewBufferBytes(offsetsBuffer)}
	arrayData := array.NewData(arrow.ListOf(childData.DataType()), size, buffers, []arrow.ArrayData{childData}, 0, 0)
	defer arrayData.Release()
	return array.MakeFromData(arrayData)
}

// makeListArray wraps values into a list array of the given size with
// nullcount null entries. Even-indexed entries up to nullcount are null,
// entry 1 is always an empty list, and the remaining entries split the
// values evenly.
func makeListArray(values arrow.Array, size, nullcount int) arrow.Array {
	// -1 accounts for the always-empty list at index 1.
	nonNullEntries := size - nullcount - 1
	lengthPerEntry := values.Len() / nonNullEntries

	offsets := make([]byte, arrow.Int32Traits.BytesRequired(size+1))
	offsetsArr := arrow.Int32Traits.CastFromBytes(offsets)

	nullBitmap := make([]byte, int(bitutil.BytesForBits(int64(size))))

	curOffset := 0
	for i := 0; i < size; i++ {
		offsetsArr[i] = int32(curOffset)
		if !(((i % 2) == 0) && ((i / 2) < nullcount)) {
			// non-null list (list with index 1 is always empty)
			bitutil.SetBit(nullBitmap, i)
			if i != 1 {
				curOffset += lengthPerEntry
			}
		}
	}
	// Final offset points at the end of the child values.
	offsetsArr[size] = int32(values.Len())

	listData := array.NewData(arrow.ListOf(values.DataType()), size,
		[]*memory.Buffer{memory.NewBufferBytes(nullBitmap), memory.NewBufferBytes(offsets)},
		[]arrow.ArrayData{values.Data()}, nullcount, 0)
	defer listData.Release()
	return array.NewListData(listData)
}

// prepareEmptyListsTable wraps an all-empty list column in a single-column
// table named "col".
func prepareEmptyListsTable(size int) arrow.Table {
	lists := makeEmptyListsArray(size)
	defer lists.Release()
	chunked := arrow.NewChunked(lists.DataType(), []arrow.Array{lists})
	defer chunked.Release()
	return makeSimpleTable(chunked, true)
}

// prepareListTable builds a single-column table of list<dt> with the given
// nullability at the list and element levels.
func prepareListTable(dt arrow.DataType, size int, nullableLists bool, nullableElems bool, nullCount int) arrow.Table {
	nc := nullCount
	if !nullableElems {
		nc = 0
	}
	values := testutils.RandomNullable(dt, size*size, nc)
	defer values.Release()
	// also test that slice offsets are respected
	values = array.NewSlice(values, 5, int64(values.Len()))
	defer values.Release()

	if
!nullableLists {
		nullCount = 0
	}
	lists := makeListArray(values, size, nullCount)
	defer lists.Release()

	chunked := arrow.NewChunked(lists.DataType(), []arrow.Array{lists})
	defer chunked.Release()

	// Slicing the chunked column also exercises non-zero chunk offsets.
	return makeSimpleTable(array.NewChunkedSlice(chunked, 3, int64(size)), nullableLists)
}

// prepareListOfListTable builds a single-column table of
// list<list<dt>> with independently controllable nullability at the outer
// list, inner list, and element levels.
func prepareListOfListTable(dt arrow.DataType, size, nullCount int, nullableParentLists, nullableLists, nullableElems bool) arrow.Table {
	nc := nullCount
	if !nullableElems {
		nc = 0
	}

	values := testutils.RandomNullable(dt, size*6, nc)
	defer values.Release()

	if nullableLists {
		nc = nullCount
	} else {
		nc = 0
	}

	lists := makeListArray(values, size*3, nc)
	defer lists.Release()

	if !nullableParentLists {
		nullCount = 0
	}

	parentLists := makeListArray(lists, size, nullCount)
	defer parentLists.Release()

	chunked := arrow.NewChunked(parentLists.DataType(), []arrow.Array{parentLists})
	defer chunked.Release()

	return makeSimpleTable(chunked, nullableParentLists)
}

// TestSingleEmptyListsColumnReadWrite round-trips a column where every
// list entry is empty.
func (ps *ParquetIOTestSuite) TestSingleEmptyListsColumnReadWrite() {
	mem := memory.NewCheckedAllocator(memory.DefaultAllocator)
	defer mem.AssertSize(ps.T(), 0)

	expected := prepareEmptyListsTable(smallSize)
	defer expected.Release()
	buf := writeTableToBuffer(ps.T(), mem, expected, smallSize, pqarrow.NewArrowWriterProperties(pqarrow.WithAllocator(mem)))
	defer buf.Release()

	reader := ps.createReader(mem, buf.Bytes())
	tbl := ps.readTable(reader)
	defer tbl.Release()

	ps.EqualValues(expected.NumCols(), tbl.NumCols())
	ps.EqualValues(expected.NumRows(), tbl.NumRows())

	exChunk := expected.Column(0).Data()
	tblChunk := tbl.Column(0).Data()

	ps.Equal(len(exChunk.Chunks()), len(tblChunk.Chunks()))
	ps.True(array.Equal(exChunk.Chunk(0),
tblChunk.Chunk(0)))
}

// TestSingleColumnOptionalReadWrite writes random nullable data for every
// type in fullTypeList as an optional column and reads it back.
func (ps *ParquetIOTestSuite) TestSingleColumnOptionalReadWrite() {
	for _, dt := range fullTypeList {
		ps.Run(dt.Name(), func() {
			mem := memory.NewCheckedAllocator(memory.DefaultAllocator)
			defer mem.AssertSize(ps.T(), 0)

			values := testutils.RandomNullable(dt, smallSize, 10)
			defer values.Release()
			sc := ps.makeSimpleSchema(dt, parquet.Repetitions.Optional)
			data := ps.writeColumn(mem, sc, values)
			ps.readAndCheckSingleColumnFile(mem, data, values)
		})
	}
}

// The four tests below cover the list/element nullability matrix via
// prepareListTable + roundTripTable.

func (ps *ParquetIOTestSuite) TestSingleNullableListNullableColumnReadWrite() {
	for _, dt := range fullTypeList {
		ps.Run(dt.Name(), func() {
			expected := prepareListTable(dt, smallSize, true, true, 10)
			defer expected.Release()
			ps.roundTripTable(memory.DefaultAllocator, expected, false)
		})
	}
}

func (ps *ParquetIOTestSuite) TestSingleRequiredListNullableColumnReadWrite() {
	for _, dt := range fullTypeList {
		ps.Run(dt.Name(), func() {
			expected := prepareListTable(dt, smallSize, false, true, 10)
			defer expected.Release()
			ps.roundTripTable(memory.DefaultAllocator, expected, false)
		})
	}
}

func (ps *ParquetIOTestSuite) TestSingleNullableListRequiredColumnReadWrite() {
	for _, dt := range fullTypeList {
		ps.Run(dt.Name(), func() {
			expected := prepareListTable(dt, smallSize, true, false, 10)
			defer expected.Release()
			ps.roundTripTable(memory.DefaultAllocator, expected, false)
		})
	}
}

func (ps *ParquetIOTestSuite) TestSingleRequiredListRequiredColumnReadWrite() {
	for _, dt := range fullTypeList {
		ps.Run(dt.Name(), func() {
			expected := prepareListTable(dt, smallSize, false, false, 0)
			defer expected.Release()
			ps.roundTripTable(memory.DefaultAllocator, expected, false)
		})
	}
}

func (ps *ParquetIOTestSuite)
TestSingleNullableListRequiredListRequiredColumnReadWrite() {
	// Two levels of nesting: nullable outer list over required inner lists
	// of required elements.
	for _, dt := range fullTypeList {
		ps.Run(dt.Name(), func() {
			expected := prepareListOfListTable(dt, smallSize, 2, true, false, false)
			defer expected.Release()
			ps.roundTripTable(memory.DefaultAllocator, expected, false)
		})
	}
}

// TestSimpleStruct round-trips a two-field struct column with a null in
// one of the child fields.
func (ps *ParquetIOTestSuite) TestSimpleStruct() {
	mem := memory.NewCheckedAllocator(memory.DefaultAllocator)
	defer mem.AssertSize(ps.T(), 0)

	links := arrow.StructOf(arrow.Field{Name: "Backward", Type: arrow.PrimitiveTypes.Int64, Nullable: true},
		arrow.Field{Name: "Forward", Type: arrow.PrimitiveTypes.Int64, Nullable: true})

	bldr := array.NewStructBuilder(mem, links)
	defer bldr.Release()

	backBldr := bldr.FieldBuilder(0).(*array.Int64Builder)
	forwardBldr := bldr.FieldBuilder(1).(*array.Int64Builder)

	bldr.Append(true)
	backBldr.AppendNull()
	forwardBldr.Append(20)

	bldr.Append(true)
	backBldr.Append(10)
	forwardBldr.Append(40)

	data := bldr.NewArray()
	defer data.Release()

	tbl := array.NewTable(arrow.NewSchema([]arrow.Field{{Name: "links", Type: links}}, nil),
		[]arrow.Column{*arrow.NewColumn(arrow.Field{Name: "links", Type: links}, arrow.NewChunked(links, []arrow.Array{data}))}, -1)
	// Second Release compensates for the reference taken by NewChunked.
	defer data.Release() // NewChunked
	defer tbl.Release()

	ps.roundTripTable(mem, tbl, false)
}

// TestSingleColumnNullableStruct round-trips a nullable struct column whose
// first row is null at the struct level.
func (ps *ParquetIOTestSuite) TestSingleColumnNullableStruct() {
	mem := memory.NewCheckedAllocator(memory.DefaultAllocator)
	defer mem.AssertSize(ps.T(), 0)

	links := arrow.StructOf(arrow.Field{Name: "Backward", Type: arrow.PrimitiveTypes.Int64, Nullable: true})
	bldr := array.NewStructBuilder(mem, links)
	defer bldr.Release()

	backBldr := bldr.FieldBuilder(0).(*array.Int64Builder)

	bldr.AppendNull()
	bldr.Append(true)
	backBldr.Append(10)

	data :=
bldr.NewArray()
	defer data.Release()

	tbl := array.NewTable(arrow.NewSchema([]arrow.Field{{Name: "links", Type: links, Nullable: true}}, nil),
		[]arrow.Column{*arrow.NewColumn(arrow.Field{Name: "links", Type: links, Nullable: true}, arrow.NewChunked(links, []arrow.Array{data}))}, -1)
	// Second Release compensates for the reference taken by NewChunked.
	defer data.Release() // NewChunked
	defer tbl.Release()

	ps.roundTripTable(mem, tbl, false)
}

// TestNestedRequiredFieldStruct round-trips a nullable struct (validity
// 0xCC: rows 2,3,6,7 valid) wrapping a required int32 child.
func (ps *ParquetIOTestSuite) TestNestedRequiredFieldStruct() {
	mem := memory.NewCheckedAllocator(memory.DefaultAllocator)
	defer mem.AssertSize(ps.T(), 0)

	intField := arrow.Field{Name: "int_array", Type: arrow.PrimitiveTypes.Int32}
	intBldr := array.NewInt32Builder(mem)
	defer intBldr.Release()
	intBldr.AppendValues([]int32{0, 1, 2, 3, 4, 5, 7, 8}, nil)

	intArr := intBldr.NewArray()
	defer intArr.Release()

	validity := memory.NewBufferBytes([]byte{0xCC})
	defer validity.Release()

	structField := arrow.Field{Name: "root", Type: arrow.StructOf(intField), Nullable: true}
	// 8 rows, 4 of them null per the validity bitmap above.
	structData := array.NewData(structField.Type, 8, []*memory.Buffer{validity}, []arrow.ArrayData{intArr.Data()}, 4, 0)
	defer structData.Release()
	stData := array.NewStructData(structData)
	defer stData.Release()

	tbl := array.NewTable(arrow.NewSchema([]arrow.Field{structField}, nil),
		[]arrow.Column{*arrow.NewColumn(structField,
			arrow.NewChunked(structField.Type, []arrow.Array{stData}))}, -1)
	// Second Release compensates for the reference taken by NewChunked.
	defer stData.Release() // NewChunked
	defer tbl.Release()

	ps.roundTripTable(mem, tbl, false)
}

// TestNestedNullableField is the same shape as TestNestedRequiredFieldStruct
// but with a nullable int32 child carrying its own null entries.
func (ps *ParquetIOTestSuite) TestNestedNullableField() {
	mem := memory.NewCheckedAllocator(memory.DefaultAllocator)
	defer mem.AssertSize(ps.T(), 0)

	intField := arrow.Field{Name: "int_array", Type: arrow.PrimitiveTypes.Int32, Nullable: true}
	intBldr := array.NewInt32Builder(mem)
	defer intBldr.Release()
intBldr.AppendValues([]int32{0, 1, 2, 3, 4, 5, 7, 8}, []bool{true, false, true, false, true, true, false, true})

	intArr := intBldr.NewArray()
	defer intArr.Release()

	validity := memory.NewBufferBytes([]byte{0xCC})
	defer validity.Release()

	structField := arrow.Field{Name: "root", Type: arrow.StructOf(intField), Nullable: true}
	data := array.NewData(structField.Type, 8, []*memory.Buffer{validity}, []arrow.ArrayData{intArr.Data()}, 4, 0)
	defer data.Release()
	stData := array.NewStructData(data)
	defer stData.Release()

	tbl := array.NewTable(arrow.NewSchema([]arrow.Field{structField}, nil),
		[]arrow.Column{*arrow.NewColumn(structField,
			arrow.NewChunked(structField.Type, []arrow.Array{stData}))}, -1)
	// Second Release compensates for the reference taken by NewChunked.
	defer stData.Release() // NewChunked
	defer tbl.Release()

	ps.roundTripTable(mem, tbl, false)
}

// TestNestedEmptyList round-trips a deeply nested struct/list/struct column
// in which one of every pair of inner lists is empty.
func (ps *ParquetIOTestSuite) TestNestedEmptyList() {
	mem := memory.NewCheckedAllocator(memory.DefaultAllocator)
	defer mem.AssertSize(ps.T(), 0)

	bldr := array.NewStructBuilder(mem, arrow.StructOf(
		arrow.Field{
			Name: "root",
			Type: arrow.StructOf(
				arrow.Field{
					Name: "child1",
					Type: arrow.ListOf(arrow.StructOf(
						arrow.Field{
							Name: "child2",
							Type: arrow.ListOf(arrow.StructOf(
								arrow.Field{
									Name: "name",
									Type: arrow.BinaryTypes.String,
								},
							)),
						},
					)),
				},
			),
		},
	))
	defer bldr.Release()

	rootBldr := bldr.FieldBuilder(0).(*array.StructBuilder)
	child1Bldr := rootBldr.FieldBuilder(0).(*array.ListBuilder)
	child1ElBldr := child1Bldr.ValueBuilder().(*array.StructBuilder)
	child2Bldr := child1ElBldr.FieldBuilder(0).(*array.ListBuilder)
	leafBldr := child2Bldr.ValueBuilder().(*array.StructBuilder)
	nameBldr := leafBldr.FieldBuilder(0).(*array.StringBuilder)

	// target structure 8 times
	// {
	//   "root": {
	//     "child1":
// [
	//       { "child2": [{ "name": "foo" }] },
	//       { "child2": [] }
	//     ]
	//   }
	// }

	for i := 0; i < 8; i++ {
		bldr.Append(true)
		rootBldr.Append(true)
		child1Bldr.Append(true)

		// First child1 element: child2 holds one struct.
		child1ElBldr.Append(true)
		child2Bldr.Append(true)
		leafBldr.Append(true)
		nameBldr.Append("foo")

		// Second child1 element: child2 is an empty list.
		child1ElBldr.Append(true)
		child2Bldr.Append(true)
	}

	arr := bldr.NewArray()
	defer arr.Release()

	field := arrow.Field{Name: "x", Type: arr.DataType(), Nullable: true}
	expected := array.NewTableFromSlice(arrow.NewSchema([]arrow.Field{field}, nil), [][]arrow.Array{{arr}})
	defer expected.Release()

	ps.roundTripTable(mem, expected, false)
}

// TestCanonicalNestedRoundTrip round-trips the canonical "Document" schema
// from the Dremel paper (DocID, Links, Name) built from JSON literals.
func (ps *ParquetIOTestSuite) TestCanonicalNestedRoundTrip() {
	mem := memory.NewCheckedAllocator(memory.DefaultAllocator)
	defer mem.AssertSize(ps.T(), 0)

	docIdField := arrow.Field{Name: "DocID", Type: arrow.PrimitiveTypes.Int64}
	linksField := arrow.Field{Name: "Links", Type: arrow.StructOf(
		arrow.Field{Name: "Backward", Type: arrow.ListOf(arrow.PrimitiveTypes.Int64)},
		arrow.Field{Name: "Forward", Type: arrow.ListOf(arrow.PrimitiveTypes.Int64)},
	), Nullable: true}

	nameStruct := arrow.StructOf(
		arrow.Field{Name: "Language", Nullable: true, Type: arrow.ListOf(
			arrow.StructOf(arrow.Field{Name: "Code", Type: arrow.BinaryTypes.String},
				arrow.Field{Name: "Country", Type: arrow.BinaryTypes.String, Nullable: true}))},
		arrow.Field{Name: "Url", Type: arrow.BinaryTypes.String, Nullable: true})

	nameField := arrow.Field{Name: "Name", Type: arrow.ListOf(nameStruct)}
	sc := arrow.NewSchema([]arrow.Field{docIdField, linksField, nameField}, nil)

	docIDArr, _, err := array.FromJSON(mem, docIdField.Type, strings.NewReader("[10, 20]"))
	ps.Require().NoError(err)
	defer docIDArr.Release()

	linksIDArr, _, err := array.FromJSON(mem,
linksField.Type, strings.NewReader(`[{"Backward":[], "Forward":[20, 40, 60]}, {"Backward":[10, 30], "Forward": [80]}]`))
	ps.Require().NoError(err)
	defer linksIDArr.Release()

	nameArr, _, err := array.FromJSON(mem, nameField.Type, strings.NewReader(`
	[[{"Language": [{"Code": "en_us", "Country": "us"},
	                {"Code": "en_us", "Country": null}],
	   "Url": "http://A"},
	  {"Url": "http://B", "Language": null},
	  {"Language": [{"Code": "en-gb", "Country": "gb"}], "Url": null}],
	 [{"Url": "http://C", "Language": null}]]`))
	ps.Require().NoError(err)
	defer nameArr.Release()

	expected := array.NewTable(sc, []arrow.Column{
		*arrow.NewColumn(docIdField, arrow.NewChunked(docIdField.Type, []arrow.Array{docIDArr})),
		*arrow.NewColumn(linksField, arrow.NewChunked(linksField.Type, []arrow.Array{linksIDArr})),
		*arrow.NewColumn(nameField, arrow.NewChunked(nameField.Type, []arrow.Array{nameArr})),
	}, 2)
	// Second Releases compensate for the references taken by NewChunked.
	defer docIDArr.Release()   // NewChunked
	defer linksIDArr.Release() // NewChunked
	defer nameArr.Release()    // NewChunked
	defer expected.Release()

	ps.roundTripTable(mem, expected, false)
}

// TestFixedSizeList round-trips a fixed_size_list<int16>[3] column of three
// fully valid lists, with the stored schema enabled.
func (ps *ParquetIOTestSuite) TestFixedSizeList() {
	mem := memory.NewCheckedAllocator(memory.DefaultAllocator)
	defer mem.AssertSize(ps.T(), 0)

	bldr := array.NewFixedSizeListBuilder(mem, 3, arrow.PrimitiveTypes.Int16)
	defer bldr.Release()

	vb := bldr.ValueBuilder().(*array.Int16Builder)

	bldr.AppendValues([]bool{true, true, true})
	vb.AppendValues([]int16{1, 2, 3, 4, 5, 6, 7, 8, 9}, nil)

	data := bldr.NewArray()
	defer data.Release() // NewArray

	field := arrow.Field{Name: "root", Type: data.DataType(), Nullable: true}
	cnk := arrow.NewChunked(field.Type, []arrow.Array{data})
	defer data.Release() // NewChunked

	tbl := array.NewTable(arrow.NewSchema([]arrow.Field{field}, nil),
		[]arrow.Column{*arrow.NewColumn(field, cnk)}, -1)
	defer cnk.Release() // NewColumn
	defer tbl.Release()

	ps.roundTripTable(mem, tbl, true)
}

// TestNull round-trips a column of the Arrow null type (three null slots).
func (ps *ParquetIOTestSuite) TestNull() {
	mem := memory.NewCheckedAllocator(memory.DefaultAllocator)
	defer mem.AssertSize(ps.T(), 0)

	bldr := array.NewNullBuilder(mem)
	defer bldr.Release()

	bldr.AppendNull()
	bldr.AppendNull()
	bldr.AppendNull()

	data := bldr.NewArray()
	defer data.Release()

	field := arrow.Field{Name: "x", Type: data.DataType(), Nullable: true}
	// NOTE(review): unlike the sibling tests, the extra reference taken by
	// NewChunked and the table itself are never released here; presumably
	// this does not trip the checked allocator because null arrays own no
	// buffers — confirm whether the releases should be added for consistency.
	expected := array.NewTable(
		arrow.NewSchema([]arrow.Field{field}, nil),
		[]arrow.Column{*arrow.NewColumn(field, arrow.NewChunked(field.Type, []arrow.Array{data}))},
		-1,
	)

	ps.roundTripTable(mem, expected, true)
}

// ARROW-17169
// TestNullableListOfStruct round-trips a nullable list<struct<a:int32,
// b:utf8>> column where every fifth list slot is null and the rest hold
// four struct entries each.
func (ps *ParquetIOTestSuite) TestNullableListOfStruct() {
	mem := memory.NewCheckedAllocator(memory.DefaultAllocator)
	defer mem.AssertSize(ps.T(), 0)

	bldr := array.NewListBuilder(mem, arrow.StructOf(
		arrow.Field{Name: "a", Type: arrow.PrimitiveTypes.Int32},
		arrow.Field{Name: "b", Type: arrow.BinaryTypes.String},
	))
	defer bldr.Release()

	stBldr := bldr.ValueBuilder().(*array.StructBuilder)
	aBldr := stBldr.FieldBuilder(0).(*array.Int32Builder)
	bBldr := stBldr.FieldBuilder(1).(*array.StringBuilder)

	for i := 0; i < 320; i++ {
		if i%5 == 0 {
			bldr.AppendNull()
			continue
		}
		bldr.Append(true)
		for j := 0; j < 4; j++ {
			stBldr.Append(true)
			aBldr.Append(int32(i + j))
			bBldr.Append(strconv.Itoa(i + j))
		}
	}

	arr := bldr.NewArray()
	defer arr.Release()

	field := arrow.Field{Name: "x", Type: arr.DataType(), Nullable: true}
	expected := array.NewTable(arrow.NewSchema([]arrow.Field{field}, nil),
		[]arrow.Column{*arrow.NewColumn(field, arrow.NewChunked(field.Type,
			[]arrow.Array{arr}))}, -1)
	defer arr.Release() // NewChunked
	defer expected.Release()

	ps.roundTripTable(mem, expected, false)
}

// TestStructWithListOfNestedStructs round-trips a struct containing a
// nullable list of structs which themselves contain a nullable struct with
// a single string field (struct<l: list<struct<a: struct<b: utf8>>>>).
func (ps *ParquetIOTestSuite) TestStructWithListOfNestedStructs() {
	mem := memory.NewCheckedAllocator(memory.DefaultAllocator)
	defer mem.AssertSize(ps.T(), 0)

	bldr := array.NewStructBuilder(mem, arrow.StructOf(
		arrow.Field{
			Nullable: true,
			Name:     "l",
			Type: arrow.ListOf(arrow.StructOf(
				arrow.Field{
					Nullable: true,
					Name:     "a",
					Type: arrow.StructOf(
						arrow.Field{
							Nullable: true,
							Name:     "b",
							Type:     arrow.BinaryTypes.String,
						},
					),
				},
			)),
		},
	))
	defer bldr.Release()

	lBldr := bldr.FieldBuilder(0).(*array.ListBuilder)
	stBldr := lBldr.ValueBuilder().(*array.StructBuilder)
	aBldr := stBldr.FieldBuilder(0).(*array.StructBuilder)
	bBldr := aBldr.FieldBuilder(0).(*array.StringBuilder)

	// Row 0 is a null struct; row 1 carries a list of 8 nested entries.
	bldr.AppendNull()
	bldr.Append(true)
	lBldr.Append(true)
	for i := 0; i < 8; i++ {
		stBldr.Append(true)
		aBldr.Append(true)
		bBldr.Append(strconv.Itoa(i))
	}

	arr := bldr.NewArray()
	defer arr.Release()

	field := arrow.Field{Name: "x", Type: arr.DataType(), Nullable: true}
	expected := array.NewTable(arrow.NewSchema([]arrow.Field{field}, nil),
		[]arrow.Column{*arrow.NewColumn(field, arrow.NewChunked(field.Type, []arrow.Array{arr}))}, -1)
	defer arr.Release() // NewChunked
	defer expected.Release()

	ps.roundTripTable(mem, expected, false)
}

// TestParquetArrowIO runs the whole ParquetIOTestSuite.
func TestParquetArrowIO(t *testing.T) {
	suite.Run(t, new(ParquetIOTestSuite))
}

// TestBufferedRecWrite writes one record in two buffered halves and checks
// that both land in a single row group which reads back with the full row
// count.
func TestBufferedRecWrite(t *testing.T) {
	mem := memory.NewCheckedAllocator(memory.DefaultAllocator)
	defer mem.AssertSize(t, 0)

	sc := arrow.NewSchema([]arrow.Field{
		{Name: "f32", Type: arrow.PrimitiveTypes.Float32, Nullable: true},
		{Name: "i32",
Type: arrow.PrimitiveTypes.Int32, Nullable: true}, 1964 {Name: "struct_i64_f64", Type: arrow.StructOf( 1965 arrow.Field{Name: "i64", Type: arrow.PrimitiveTypes.Int64, Nullable: true}, 1966 arrow.Field{Name: "f64", Type: arrow.PrimitiveTypes.Float64, Nullable: true})}, 1967 }, nil) 1968 1969 structData := array.NewData(sc.Field(2).Type, SIZELEN, 1970 []*memory.Buffer{nil, nil}, 1971 []arrow.ArrayData{testutils.RandomNullable(arrow.PrimitiveTypes.Int64, SIZELEN, 0).Data(), testutils.RandomNullable(arrow.PrimitiveTypes.Float64, SIZELEN, 0).Data()}, 0, 0) 1972 defer structData.Release() 1973 cols := []arrow.Array{ 1974 testutils.RandomNullable(sc.Field(0).Type, SIZELEN, SIZELEN/5), 1975 testutils.RandomNullable(sc.Field(1).Type, SIZELEN, SIZELEN/5), 1976 array.NewStructData(structData), 1977 } 1978 1979 rec := array.NewRecord(sc, cols, SIZELEN) 1980 defer rec.Release() 1981 1982 var ( 1983 buf bytes.Buffer 1984 ) 1985 1986 wr, err := pqarrow.NewFileWriter(sc, &buf, 1987 parquet.NewWriterProperties(parquet.WithCompression(compress.Codecs.Snappy), parquet.WithDictionaryDefault(false), parquet.WithDataPageSize(100*1024)), 1988 pqarrow.NewArrowWriterProperties(pqarrow.WithAllocator(mem))) 1989 require.NoError(t, err) 1990 1991 p1 := rec.NewSlice(0, SIZELEN/2) 1992 defer p1.Release() 1993 require.NoError(t, wr.WriteBuffered(p1)) 1994 1995 p2 := rec.NewSlice(SIZELEN/2, SIZELEN) 1996 defer p2.Release() 1997 require.NoError(t, wr.WriteBuffered(p2)) 1998 1999 wr.Close() 2000 2001 rdr, err := file.NewParquetReader(bytes.NewReader(buf.Bytes())) 2002 assert.NoError(t, err) 2003 2004 assert.EqualValues(t, 1, rdr.NumRowGroups()) 2005 assert.EqualValues(t, SIZELEN, rdr.NumRows()) 2006 rdr.Close() 2007 2008 tbl, err := pqarrow.ReadTable(context.Background(), bytes.NewReader(buf.Bytes()), nil, pqarrow.ArrowReadProperties{}, nil) 2009 assert.NoError(t, err) 2010 defer tbl.Release() 2011 2012 assert.EqualValues(t, SIZELEN, tbl.NumRows()) 2013 } 2014 2015 func (ps *ParquetIOTestSuite) 
TestArrowMapTypeRoundTrip() { 2016 mem := memory.NewCheckedAllocator(memory.DefaultAllocator) 2017 defer mem.AssertSize(ps.T(), 0) 2018 2019 bldr := array.NewMapBuilder(mem, arrow.BinaryTypes.String, arrow.PrimitiveTypes.Int32, false) 2020 defer bldr.Release() 2021 2022 kb := bldr.KeyBuilder().(*array.StringBuilder) 2023 ib := bldr.ItemBuilder().(*array.Int32Builder) 2024 2025 bldr.Append(true) 2026 kb.AppendValues([]string{"Fee", "Fi", "Fo", "Fum"}, nil) 2027 ib.AppendValues([]int32{1, 2, 3, 4}, nil) 2028 2029 bldr.Append(true) 2030 kb.AppendValues([]string{"Fee", "Fi", "Fo"}, nil) 2031 ib.AppendValues([]int32{5, 4, 3}, nil) 2032 2033 bldr.AppendNull() 2034 2035 bldr.Append(true) 2036 kb.AppendValues([]string{"Fo", "Fi", "Fee"}, nil) 2037 ib.AppendValues([]int32{-1, 2, 3}, []bool{false, true, true}) 2038 2039 arr := bldr.NewArray() 2040 defer arr.Release() 2041 2042 fld := arrow.Field{Name: "mapped", Type: arr.DataType(), Nullable: true} 2043 cnk := arrow.NewChunked(arr.DataType(), []arrow.Array{arr}) 2044 defer arr.Release() // NewChunked 2045 tbl := array.NewTable(arrow.NewSchema([]arrow.Field{fld}, nil), []arrow.Column{*arrow.NewColumn(fld, cnk)}, -1) 2046 defer cnk.Release() // NewColumn 2047 defer tbl.Release() 2048 2049 ps.roundTripTable(mem, tbl, true) 2050 } 2051 2052 func (ps *ParquetIOTestSuite) TestArrowExtensionTypeRoundTrip() { 2053 mem := memory.NewCheckedAllocator(memory.DefaultAllocator) 2054 defer mem.AssertSize(ps.T(), 0) 2055 2056 extBuilder := array.NewExtensionBuilder(mem, types.NewUUIDType()) 2057 defer extBuilder.Release() 2058 builder := types.NewUUIDBuilder(extBuilder) 2059 builder.Append(uuid.New()) 2060 arr := builder.NewArray() 2061 defer arr.Release() 2062 2063 fld := arrow.Field{Name: "uuid", Type: arr.DataType(), Nullable: true} 2064 cnk := arrow.NewChunked(arr.DataType(), []arrow.Array{arr}) 2065 defer arr.Release() // NewChunked 2066 tbl := array.NewTable(arrow.NewSchema([]arrow.Field{fld}, nil), 
		[]arrow.Column{*arrow.NewColumn(fld, cnk)}, -1)
	defer cnk.Release() // NewColumn
	defer tbl.Release()

	ps.roundTripTable(mem, tbl, true)
}

// TestArrowUnknownExtensionTypeRoundTrip checks that when the "uuid"
// extension type is NOT registered at read time, the column comes back as
// its fixed-size-binary storage type while the extension name and
// serialized metadata are preserved in the field metadata.
func (ps *ParquetIOTestSuite) TestArrowUnknownExtensionTypeRoundTrip() {
	mem := memory.NewCheckedAllocator(memory.DefaultAllocator)
	defer mem.AssertSize(ps.T(), 0)

	var written, expected arrow.Table

	{
		// Prepare `written` table with the extension type registered.
		extType := types.NewUUIDType()
		bldr := array.NewExtensionBuilder(mem, extType)
		defer bldr.Release()

		bldr.Builder.(*array.FixedSizeBinaryBuilder).AppendValues(
			[][]byte{nil, []byte("abcdefghijklmno0"), []byte("abcdefghijklmno1"), []byte("abcdefghijklmno2")},
			[]bool{false, true, true, true})

		arr := bldr.NewArray()
		defer arr.Release()

		// Unregister "uuid" so the read path sees an unknown extension.
		if arrow.GetExtensionType("uuid") != nil {
			ps.NoError(arrow.UnregisterExtensionType("uuid"))
		}

		fld := arrow.Field{Name: "uuid", Type: arr.DataType(), Nullable: true}
		cnk := arrow.NewChunked(arr.DataType(), []arrow.Array{arr})
		defer arr.Release() // NewChunked
		written = array.NewTable(arrow.NewSchema([]arrow.Field{fld}, nil), []arrow.Column{*arrow.NewColumn(fld, cnk)}, -1)
		defer cnk.Release() // NewColumn
		defer written.Release()
	}

	{
		// Prepare `expected` table with the extension type unregistered in the underlying type.
		bldr := array.NewFixedSizeBinaryBuilder(mem, &arrow.FixedSizeBinaryType{ByteWidth: 16})
		defer bldr.Release()
		bldr.AppendValues(
			[][]byte{nil, []byte("abcdefghijklmno0"), []byte("abcdefghijklmno1"), []byte("abcdefghijklmno2")},
			[]bool{false, true, true, true})

		arr := bldr.NewArray()
		defer arr.Release()

		fld := arrow.Field{Name: "uuid", Type: arr.DataType(), Nullable: true}
		cnk := arrow.NewChunked(arr.DataType(), []arrow.Array{arr})
		defer arr.Release() // NewChunked
		expected = array.NewTable(arrow.NewSchema([]arrow.Field{fld}, nil), []arrow.Column{*arrow.NewColumn(fld, cnk)}, -1)
		defer cnk.Release() // NewColumn
		defer expected.Release()
	}

	// sanity check before going deeper
	ps.Equal(expected.NumCols(), written.NumCols())
	ps.Equal(expected.NumRows(), written.NumRows())

	// just like roundTripTable() but different written vs. expected tables
	var buf bytes.Buffer
	props := pqarrow.NewArrowWriterProperties(pqarrow.WithStoreSchema(), pqarrow.WithAllocator(mem))

	writeProps := parquet.NewWriterProperties(parquet.WithAllocator(mem))
	ps.Require().NoError(pqarrow.WriteTable(written, &buf, written.NumRows(), writeProps, props))

	reader := ps.createReader(mem, buf.Bytes())
	defer reader.ParquetReader().Close()

	tbl := ps.readTable(reader)
	defer tbl.Release()

	ps.Equal(expected.NumCols(), tbl.NumCols())
	ps.Equal(expected.NumRows(), tbl.NumRows())

	// Values must match the raw storage-typed table...
	exChunk := expected.Column(0).Data()
	tblChunk := tbl.Column(0).Data()

	ps.Equal(len(exChunk.Chunks()), len(tblChunk.Chunks()))
	exc := exChunk.Chunk(0)
	tbc := tblChunk.Chunk(0)
	ps.Truef(array.Equal(exc, tbc), "expected: %T %s\ngot: %T %s", exc, exc, tbc, tbc)

	// ...while the extension identity survives in the field metadata.
	expectedMd := arrow.MetadataFrom(map[string]string{
		ipc.ExtensionTypeKeyName:     "uuid",
		ipc.ExtensionMetadataKeyName: "uuid-serialized",
"PARQUET:field_id": "-1", 2155 }) 2156 ps.Truef(expectedMd.Equal(tbl.Column(0).Field().Metadata), "expected: %v\ngot: %v", expectedMd, tbl.Column(0).Field().Metadata) 2157 } 2158 2159 func TestWriteTableMemoryAllocation(t *testing.T) { 2160 mem := memory.NewCheckedAllocator(memory.DefaultAllocator) 2161 sc := arrow.NewSchema([]arrow.Field{ 2162 {Name: "f32", Type: arrow.PrimitiveTypes.Float32, Nullable: true}, 2163 {Name: "i32", Type: arrow.PrimitiveTypes.Int32, Nullable: true}, 2164 {Name: "struct_i64_f64", Type: arrow.StructOf( 2165 arrow.Field{Name: "i64", Type: arrow.PrimitiveTypes.Int64, Nullable: true}, 2166 arrow.Field{Name: "f64", Type: arrow.PrimitiveTypes.Float64, Nullable: true})}, 2167 {Name: "arr_i64", Type: arrow.ListOf(arrow.PrimitiveTypes.Int64)}, 2168 {Name: "uuid", Type: types.NewUUIDType(), Nullable: true}, 2169 }, nil) 2170 2171 bld := array.NewRecordBuilder(mem, sc) 2172 bld.Field(0).(*array.Float32Builder).Append(1.0) 2173 bld.Field(1).(*array.Int32Builder).Append(1) 2174 sbld := bld.Field(2).(*array.StructBuilder) 2175 sbld.Append(true) 2176 sbld.FieldBuilder(0).(*array.Int64Builder).Append(1) 2177 sbld.FieldBuilder(1).(*array.Float64Builder).Append(1.0) 2178 abld := bld.Field(3).(*array.ListBuilder) 2179 abld.Append(true) 2180 abld.ValueBuilder().(*array.Int64Builder).Append(2) 2181 bld.Field(4).(*types.UUIDBuilder).Append(uuid.MustParse("00000000-0000-0000-0000-000000000001")) 2182 2183 rec := bld.NewRecord() 2184 bld.Release() 2185 2186 var buf bytes.Buffer 2187 wr, err := pqarrow.NewFileWriter(sc, &buf, 2188 parquet.NewWriterProperties(parquet.WithCompression(compress.Codecs.Snappy)), 2189 pqarrow.NewArrowWriterProperties(pqarrow.WithAllocator(mem))) 2190 require.NoError(t, err) 2191 2192 require.NoError(t, wr.Write(rec)) 2193 rec.Release() 2194 wr.Close() 2195 2196 require.Zero(t, mem.CurrentAlloc()) 2197 } 2198 2199 func TestEmptyListDeltaBinaryPacked(t *testing.T) { 2200 schema := arrow.NewSchema([]arrow.Field{ 2201 {Name: "ts", Type: 
arrow.ListOf(arrow.PrimitiveTypes.Uint64), 2202 Metadata: arrow.NewMetadata([]string{"PARQUET:field_id"}, []string{"-1"})}}, nil) 2203 builder := array.NewRecordBuilder(memory.DefaultAllocator, schema) 2204 defer builder.Release() 2205 2206 listBuilder := builder.Field(0).(*array.ListBuilder) 2207 listBuilder.Append(true) 2208 arrowRec := builder.NewRecord() 2209 defer arrowRec.Release() 2210 2211 var buf bytes.Buffer 2212 wr, err := pqarrow.NewFileWriter(schema, &buf, 2213 parquet.NewWriterProperties( 2214 parquet.WithDictionaryFor("ts.list.element", false), 2215 parquet.WithEncodingFor("ts.list.element", parquet.Encodings.DeltaBinaryPacked)), 2216 pqarrow.DefaultWriterProps()) 2217 require.NoError(t, err) 2218 2219 require.NoError(t, wr.WriteBuffered(arrowRec)) 2220 require.NoError(t, wr.Close()) 2221 2222 rdr, err := file.NewParquetReader(bytes.NewReader(buf.Bytes())) 2223 require.NoError(t, err) 2224 reader, err := pqarrow.NewFileReader(rdr, pqarrow.ArrowReadProperties{}, memory.DefaultAllocator) 2225 require.NoError(t, err) 2226 defer rdr.Close() 2227 2228 tbl, err := reader.ReadTable(context.Background()) 2229 require.NoError(t, err) 2230 defer tbl.Release() 2231 2232 assert.True(t, schema.Equal(tbl.Schema())) 2233 assert.EqualValues(t, 1, tbl.NumRows()) 2234 }