github.com/parquet-go/parquet-go@v0.21.1-0.20240501160520-b3c3a0c3ed6f/page_test.go (about) 1 package parquet_test 2 3 import ( 4 "bytes" 5 "io" 6 "reflect" 7 "testing" 8 9 "github.com/parquet-go/parquet-go" 10 "github.com/parquet-go/parquet-go/deprecated" 11 "github.com/parquet-go/parquet-go/encoding/plain" 12 "github.com/parquet-go/parquet-go/internal/unsafecast" 13 ) 14 15 func TestPage(t *testing.T) { 16 t.Run("BOOLEAN", testPageBoolean) 17 t.Run("INT32", testPageInt32) 18 t.Run("INT64", testPageInt64) 19 t.Run("INT96", testPageInt96) 20 t.Run("FLOAT", testPageFloat) 21 t.Run("DOUBLE", testPageDouble) 22 t.Run("BYTE_ARRAY", testPageByteArray) 23 t.Run("FIXED_LEN_BYTE_ARRAY", testPageFixedLenByteArray) 24 } 25 26 func testPageBoolean(t *testing.T) { 27 schema := parquet.SchemaOf(struct{ Value bool }{}) 28 29 t.Run("parquet", func(t *testing.T) { 30 testPage(t, schema, pageTest{ 31 write: func(w parquet.ValueWriter) (interface{}, error) { 32 values := make([]bool, 50_000) 33 for i := range values { 34 values[i] = i%2 == 0 35 } 36 n, err := w.(parquet.BooleanWriter).WriteBooleans(values) 37 return values[:n], err 38 }, 39 40 read: func(r parquet.ValueReader) (interface{}, error) { 41 values := make([]bool, 50_000) 42 n, err := r.(parquet.BooleanReader).ReadBooleans(values) 43 return values[:n], err 44 }, 45 }) 46 }) 47 } 48 49 func testPageInt32(t *testing.T) { 50 schema := parquet.SchemaOf(struct{ Value int32 }{}) 51 52 t.Run("io", func(t *testing.T) { 53 testBufferPage(t, schema, pageTest{ 54 write: func(w parquet.ValueWriter) (interface{}, error) { 55 values := []int32{0, 1, 2, 3, 4, 5, 6, 7, 8, 9} 56 n, err := w.(io.Writer).Write(unsafecast.Int32ToBytes(values)) 57 return values[:n/4], err 58 }, 59 60 read: func(r parquet.ValueReader) (interface{}, error) { 61 values := make([]int32, 10) 62 n, err := r.(io.Reader).Read(unsafecast.Int32ToBytes(values)) 63 return values[:n/4], err 64 }, 65 }) 66 }) 67 68 t.Run("parquet", func(t *testing.T) { 69 testPage(t, schema, pageTest{ 70 write: func(w parquet.ValueWriter) (interface{}, error) { 71 values := []int32{0, 1, 2, 3, 4, 5, 6, 7, 8, 9} 72 n, err := w.(parquet.Int32Writer).WriteInt32s(values) 73 return values[:n], err 74 }, 75 76 read: func(r parquet.ValueReader) (interface{}, error) { 77 values := make([]int32, 10) 78 n, err := r.(parquet.Int32Reader).ReadInt32s(values) 79 return values[:n], err 80 }, 81 }) 82 }) 83 } 84 85 func testPageInt64(t *testing.T) { 86 schema := parquet.SchemaOf(struct{ Value int64 }{}) 87 88 t.Run("io", func(t *testing.T) { 89 testBufferPage(t, schema, pageTest{ 90 write: func(w parquet.ValueWriter) (interface{}, error) { 91 values := []int64{0, 1, 2, 3, 4, 5, 6, 7, 8, 9} 92 n, err := w.(io.Writer).Write(unsafecast.Int64ToBytes(values)) 93 return values[:n/8], err 94 }, 95 96 read: func(r parquet.ValueReader) (interface{}, error) { 97 values := make([]int64, 10) 98 n, err := r.(io.Reader).Read(unsafecast.Int64ToBytes(values)) 99 return values[:n/8], err 100 }, 101 }) 102 }) 103 104 t.Run("parquet", func(t *testing.T) { 105 testPage(t, schema, pageTest{ 106 write: func(w parquet.ValueWriter) (interface{}, error) { 107 values := []int64{0, 1, 2, 3, 4, 5, 6, 7, 8, 9} 108 n, err := w.(parquet.Int64Writer).WriteInt64s(values) 109 return values[:n], err 110 }, 111 112 read: func(r parquet.ValueReader) (interface{}, error) { 113 values := make([]int64, 10) 114 n, err := r.(parquet.Int64Reader).ReadInt64s(values) 115 return values[:n], err 116 }, 117 }) 118 }) 119 } 120 121 func testPageInt96(t *testing.T) { 122 schema := parquet.SchemaOf(struct{ Value deprecated.Int96 }{}) 123 124 t.Run("io", func(t *testing.T) { 125 testBufferPage(t, schema, pageTest{ 126 write: func(w parquet.ValueWriter) (interface{}, error) { 127 values := []deprecated.Int96{{0: 0}, {0: 1}, {0: 2}} 128 n, err := w.(io.Writer).Write(deprecated.Int96ToBytes(values)) 129 return values[:n/12], err 130 }, 131 132 read: func(r parquet.ValueReader) (interface{}, error) { 133 values := make([]deprecated.Int96, 3) 134 n, err := r.(io.Reader).Read(deprecated.Int96ToBytes(values)) 135 return values[:n/12], err 136 }, 137 }) 138 }) 139 140 t.Run("parquet", func(t *testing.T) { 141 testPage(t, schema, pageTest{ 142 write: func(w parquet.ValueWriter) (interface{}, error) { 143 values := []deprecated.Int96{{0: 0}, {0: 1}, {0: 2}} 144 n, err := w.(parquet.Int96Writer).WriteInt96s(values) 145 return values[:n], err 146 }, 147 148 read: func(r parquet.ValueReader) (interface{}, error) { 149 values := make([]deprecated.Int96, 3) 150 n, err := r.(parquet.Int96Reader).ReadInt96s(values) 151 return values[:n], err 152 }, 153 }) 154 }) 155 } 156 157 func testPageFloat(t *testing.T) { 158 schema := parquet.SchemaOf(struct{ Value float32 }{}) 159 160 t.Run("io", func(t *testing.T) { 161 testBufferPage(t, schema, pageTest{ 162 write: func(w parquet.ValueWriter) (interface{}, error) { 163 values := []float32{0, 1, 2, 3, 4, 5, 6, 7, 8, 9} 164 n, err := w.(io.Writer).Write(unsafecast.Float32ToBytes(values)) 165 return values[:n/4], err 166 }, 167 168 read: func(r parquet.ValueReader) (interface{}, error) { 169 values := make([]float32, 10) 170 n, err := r.(io.Reader).Read(unsafecast.Float32ToBytes(values)) 171 return values[:n/4], err 172 }, 173 }) 174 }) 175 176 t.Run("parquet", func(t *testing.T) { 177 testPage(t, schema, pageTest{ 178 write: func(w parquet.ValueWriter) (interface{}, error) { 179 values := []float32{0, 1, 2, 3, 4, 5, 6, 7, 8, 9} 180 n, err := w.(parquet.FloatWriter).WriteFloats(values) 181 return values[:n], err 182 }, 183 184 read: func(r parquet.ValueReader) (interface{}, error) { 185 values := make([]float32, 10) 186 n, err := r.(parquet.FloatReader).ReadFloats(values) 187 return values[:n], err 188 }, 189 }) 190 }) 191 } 192 193 func testPageDouble(t *testing.T) { 194 schema := parquet.SchemaOf(struct{ Value float64 }{}) 195 196 t.Run("io", func(t *testing.T) { 197 testBufferPage(t, schema, pageTest{ 198 write: func(w parquet.ValueWriter) (interface{}, error) { 199 values := []float64{0, 1, 2, 3, 4, 5, 6, 7, 8, 9} 200 n, err := w.(io.Writer).Write(unsafecast.Float64ToBytes(values)) 201 return values[:n/8], err 202 }, 203 204 read: func(r parquet.ValueReader) (interface{}, error) { 205 values := make([]float64, 10) 206 n, err := r.(io.Reader).Read(unsafecast.Float64ToBytes(values)) 207 return values[:n/8], err 208 }, 209 }) 210 }) 211 212 t.Run("parquet", func(t *testing.T) { 213 testPage(t, schema, pageTest{ 214 write: func(w parquet.ValueWriter) (interface{}, error) { 215 values := []float64{0, 1, 2, 3, 4, 5, 6, 7, 8, 9} 216 n, err := w.(parquet.DoubleWriter).WriteDoubles(values) 217 return values[:n], err 218 }, 219 220 read: func(r parquet.ValueReader) (interface{}, error) { 221 values := make([]float64, 10) 222 n, err := r.(parquet.DoubleReader).ReadDoubles(values) 223 return values[:n], err 224 }, 225 }) 226 }) 227 } 228 229 func testPageByteArray(t *testing.T) { 230 schema := parquet.SchemaOf(struct{ Value []byte }{}) 231 232 t.Run("io", func(t *testing.T) { 233 testBufferPage(t, schema, pageTest{ 234 write: func(w parquet.ValueWriter) (interface{}, error) { 235 values := []byte{} 236 values = plain.AppendByteArray(values, []byte("A")) 237 values = plain.AppendByteArray(values, []byte("B")) 238 values = plain.AppendByteArray(values, []byte("C")) 239 n, err := w.(io.Writer).Write(values) 240 return values[:n], err 241 }, 242 243 read: func(r parquet.ValueReader) (interface{}, error) { 244 values := make([]byte, 3+3*plain.ByteArrayLengthSize) 245 n, err := r.(io.Reader).Read(values) 246 return values[:n], err 247 }, 248 }) 249 }) 250 251 t.Run("parquet", func(t *testing.T) { 252 testPage(t, schema, pageTest{ 253 write: func(w parquet.ValueWriter) (interface{}, error) { 254 values := []byte{} 255 values = plain.AppendByteArray(values, []byte("A")) 256 values = plain.AppendByteArray(values, []byte("B")) 257 values = plain.AppendByteArray(values, []byte("C")) 258 _, err := w.(parquet.ByteArrayWriter).WriteByteArrays(values) 259 return values, err 260 }, 261 262 read: func(r parquet.ValueReader) (interface{}, error) { 263 values := make([]byte, 3+3*plain.ByteArrayLengthSize) 264 n, err := r.(parquet.ByteArrayReader).ReadByteArrays(values) 265 return values[:n+n*plain.ByteArrayLengthSize], err 266 }, 267 }) 268 }) 269 } 270 271 func testPageFixedLenByteArray(t *testing.T) { 272 schema := parquet.SchemaOf(struct{ Value [3]byte }{}) 273 274 t.Run("io", func(t *testing.T) { 275 testBufferPage(t, schema, pageTest{ 276 write: func(w parquet.ValueWriter) (interface{}, error) { 277 values := []byte("123456789") 278 n, err := w.(io.Writer).Write(values) 279 return values[:n], err 280 }, 281 282 read: func(r parquet.ValueReader) (interface{}, error) { 283 values := make([]byte, 3*3) 284 n, err := r.(io.Reader).Read(values) 285 return values[:n], err 286 }, 287 }) 288 }) 289 290 t.Run("parquet", func(t *testing.T) { 291 testPage(t, schema, pageTest{ 292 write: func(w parquet.ValueWriter) (interface{}, error) { 293 values := []byte("123456789") 294 n, err := w.(parquet.FixedLenByteArrayWriter).WriteFixedLenByteArrays(values) 295 return values[:3*n], err 296 }, 297 298 read: func(r parquet.ValueReader) (interface{}, error) { 299 values := make([]byte, 3*3) 300 n, err := r.(parquet.FixedLenByteArrayReader).ReadFixedLenByteArrays(values) 301 return values[:3*n], err 302 }, 303 }) 304 }) 305 } 306 307 type pageTest struct { 308 write func(parquet.ValueWriter) (interface{}, error) 309 read func(parquet.ValueReader) (interface{}, error) 310 } 311 312 func testPage(t *testing.T, schema *parquet.Schema, test pageTest) { 313 t.Run("buffer", func(t *testing.T) { testBufferPage(t, schema, test) }) 314 t.Run("file", func(t *testing.T) { testFilePage(t, schema, test) }) 315 } 316 317 func testBufferPage(t *testing.T, schema *parquet.Schema, test pageTest) { 318 buffer := parquet.NewBuffer(schema) 319 column := buffer.ColumnBuffers()[0] 320 321 w, err := test.write(column) 322 if err != nil { 323 t.Fatal("writing page values:", err) 324 } 325 326 r, err := test.read(column.Page().Values()) 327 if err != io.EOF { 328 t.Errorf("expected io.EOF after reading all values but got %v", err) 329 } 330 if !reflect.DeepEqual(w, r) { 331 t.Errorf("wrong values read from the page: got=%+v want=%+v", r, w) 332 } 333 } 334 335 func testFilePage(t *testing.T, schema *parquet.Schema, test pageTest) { 336 buffer := parquet.NewBuffer(schema) 337 column := buffer.ColumnBuffers()[0] 338 339 w, err := test.write(column) 340 if err != nil { 341 t.Fatal("writing page values:", err) 342 } 343 344 output := new(bytes.Buffer) 345 writer := parquet.NewWriter(output) 346 n, err := writer.WriteRowGroup(buffer) 347 if err != nil { 348 t.Fatal("writing parquet file:", err) 349 } 350 if err := writer.Close(); err != nil { 351 t.Fatal("writing parquet file:", err) 352 } 353 if n != buffer.NumRows() { 354 t.Fatalf("number of rows written mismatch: got=%d want=%d", n, buffer.NumRows()) 355 } 356 357 reader := bytes.NewReader(output.Bytes()) 358 f, err := parquet.OpenFile(reader, reader.Size()) 359 if err != nil { 360 t.Fatal("opening parquet file:", err) 361 } 362 363 pages := f.RowGroups()[0].ColumnChunks()[0].Pages() 364 defer pages.Close() 365 366 p, err := pages.ReadPage() 367 if err != nil { 368 t.Fatal("reading parquet page:", err) 369 } 370 defer parquet.Release(p) 371 372 values := p.Values() 373 r, err := test.read(values) 374 if err != io.EOF && err != nil { 375 t.Errorf("expected io.EOF after reading all values but got %v", err) 376 } 377 if !reflect.DeepEqual(w, r) { 378 t.Errorf("wrong values read from the page: got=%+v want=%+v", r, w) 379 } 380 if r, err := test.read(values); reflect.ValueOf(r).Len() != 0 || err != io.EOF { 381 t.Errorf("expected no data and io.EOF after reading all values but got %d and %v", r, err) 382 } 383 } 384 385 type testStruct struct { 386 Value *string 387 } 388 389 func TestOptionalPageTrailingNulls(t *testing.T) { 390 schema := parquet.SchemaOf(&testStruct{}) 391 buffer := parquet.NewBuffer(schema) 392 393 str := "test" 394 rows := []testStruct{{ 395 Value: nil, 396 }, { 397 Value: &str, 398 }, { 399 Value: nil, 400 }} 401 402 for _, row := range rows { 403 _, err := buffer.WriteRows([]parquet.Row{schema.Deconstruct(nil, row)}) 404 if err != nil { 405 t.Fatal("writing row:", err) 406 } 407 } 408 409 resultRows := make([]parquet.Row, 0, len(rows)) 410 bufferRows := make([]parquet.Row, 10) 411 reader := buffer.Rows() 412 defer reader.Close() 413 for { 414 n, err := reader.ReadRows(bufferRows) 415 resultRows = append(resultRows, bufferRows[:n]...) 416 if err != nil { 417 if err == io.EOF { 418 break 419 } 420 t.Fatal("reading rows:", err) 421 } 422 } 423 424 if len(resultRows) != len(rows) { 425 t.Errorf("wrong number of rows read: got=%d want=%d", len(resultRows), len(rows)) 426 } 427 } 428 429 func TestOptionalPagePreserveIndex(t *testing.T) { 430 schema := parquet.SchemaOf(&testStruct{}) 431 buffer := parquet.NewBuffer(schema) 432 433 _, err := buffer.WriteRows([]parquet.Row{ 434 schema.Deconstruct(nil, &testStruct{Value: nil}), 435 }) 436 if err != nil { 437 t.Fatal("writing row:", err) 438 } 439 440 rows := buffer.Rows() 441 defer rows.Close() 442 443 rowbuf := make([]parquet.Row, 2) 444 445 n, err := rows.ReadRows(rowbuf) 446 if err != nil && err != io.EOF { 447 t.Fatal("reading rows:", err) 448 } 449 if n != 1 { 450 t.Fatal("wrong number of rows returned:", n) 451 } 452 if rowbuf[0][0].Column() != 0 { 453 t.Errorf("wrong index: got=%d want=%d", rowbuf[0][0].Column(), 0) 454 } 455 456 n, err = rows.ReadRows(rowbuf) 457 if err != io.EOF { 458 t.Fatal("reading EOF:", err) 459 } 460 if n != 0 { 461 t.Fatal("expected no more rows after EOF:", n) 462 } 463 } 464 465 func TestRepeatedPageTrailingNulls(t *testing.T) { 466 type testStruct struct { 467 A []string `parquet:"a"` 468 } 469 470 s := parquet.SchemaOf(&testStruct{}) 471 472 records := []*testStruct{ 473 {A: nil}, 474 {A: []string{"test"}}, 475 {A: nil}, 476 } 477 478 buf := parquet.NewBuffer(s) 479 for _, rec := range records { 480 row := s.Deconstruct(nil, rec) 481 _, err := buf.WriteRows([]parquet.Row{row}) 482 if err != nil { 483 t.Fatal(err) 484 } 485 } 486 487 rows := make([]parquet.Row, len(records)+1) 488 reader := buf.Rows() 489 defer reader.Close() 490 491 n, err := reader.ReadRows(rows) 492 if err != nil && err != io.EOF { 493 t.Fatal("reading rows:", err) 494 } 495 496 if n != len(records) { 497 t.Errorf("wrong number of rows read: got=%d want=%d", n, len(records)) 498 } 499 } 500 501 func TestReslicingBooleanPage(t *testing.T) { 502 type testStruct struct { 503 B bool `parquet:"b"` 504 } 505 506 numValues := 100 507 expected := []*testStruct{} 508 for i := 0; i < numValues; i++ { 509 expected = append(expected, &testStruct{B: i%2 == 0}) 510 } 511 512 buf := new(bytes.Buffer) 513 writer := parquet.NewGenericWriter[*testStruct](buf) 514 _, err := writer.Write(expected) 515 if err != nil { 516 t.Fatal(err) 517 } 518 err = writer.Close() 519 if err != nil { 520 t.Fatal(err) 521 } 522 523 reader := bytes.NewReader(buf.Bytes()) 524 pf, err := parquet.OpenFile(reader, reader.Size()) 525 if err != nil { 526 t.Fatal(err) 527 } 528 529 // grab the page we wrote above 530 rg := pf.RowGroups()[0] 531 cc := rg.ColumnChunks() 532 pgs := cc[0].Pages() 533 534 pg, err := pgs.ReadPage() 535 if err != nil { 536 t.Fatal(err) 537 } 538 539 // continue reslicing and reading the values 540 sliceEvery := 3 541 for i := 0; i < numValues-1; i += sliceEvery { 542 vs := make([]parquet.Value, numValues) 543 544 low := int64(sliceEvery) 545 high := int64(numValues - i) 546 547 if low >= high { 548 break 549 } 550 551 // slice the page 552 pg = pg.Slice(low, high) 553 v := pg.Values() 554 v.ReadValues(vs) 555 556 // and the expected values with the same low/high 557 expected = expected[low:high] 558 559 // confirm values match 560 for n, exp := range expected { 561 if exp.B != vs[n].Boolean() { 562 t.Fatalf("unexpected value: %v at pos: %d", vs[n], n) 563 } 564 n++ 565 } 566 } 567 }