github.com/vc42/parquet-go@v0.0.0-20240320194221-1a9adb5f23f5/reader_test.go (about) 1 package parquet_test 2 3 import ( 4 "bytes" 5 "fmt" 6 "io" 7 "math" 8 "math/rand" 9 "reflect" 10 "testing" 11 12 "github.com/vc42/parquet-go" 13 "github.com/vc42/parquet-go/internal/quick" 14 ) 15 16 func rowsOf(numRows int, model interface{}) rows { 17 prng := rand.New(rand.NewSource(0)) 18 return randomRowsOf(prng, numRows, model) 19 } 20 21 func randomRowsOf(prng *rand.Rand, numRows int, model interface{}) rows { 22 typ := reflect.TypeOf(model) 23 rows := make(rows, numRows) 24 makeValue := quick.MakeValueFuncOf(typ) 25 for i := range rows { 26 v := reflect.New(typ).Elem() 27 makeValue(v, prng) 28 rows[i] = v.Interface() 29 } 30 return rows 31 } 32 33 var readerTests = []struct { 34 scenario string 35 model interface{} 36 }{ 37 { 38 scenario: "BOOLEAN", 39 model: booleanColumn{}, 40 }, 41 42 { 43 scenario: "INT32", 44 model: int32Column{}, 45 }, 46 47 { 48 scenario: "INT64", 49 model: int64Column{}, 50 }, 51 52 { 53 scenario: "INT96", 54 model: int96Column{}, 55 }, 56 57 { 58 scenario: "FLOAT", 59 model: floatColumn{}, 60 }, 61 62 { 63 scenario: "DOUBLE", 64 model: doubleColumn{}, 65 }, 66 67 { 68 scenario: "BYTE_ARRAY", 69 model: byteArrayColumn{}, 70 }, 71 72 { 73 scenario: "FIXED_LEN_BYTE_ARRAY", 74 model: fixedLenByteArrayColumn{}, 75 }, 76 77 { 78 scenario: "STRING", 79 model: stringColumn{}, 80 }, 81 82 { 83 scenario: "STRING (dict)", 84 model: indexedStringColumn{}, 85 }, 86 87 { 88 scenario: "UUID", 89 model: uuidColumn{}, 90 }, 91 92 { 93 scenario: "DECIMAL", 94 model: decimalColumn{}, 95 }, 96 97 { 98 scenario: "AddressBook", 99 model: addressBook{}, 100 }, 101 102 { 103 scenario: "one optional level", 104 model: listColumn2{}, 105 }, 106 107 { 108 scenario: "one repeated level", 109 model: listColumn1{}, 110 }, 111 112 { 113 scenario: "two repeated levels", 114 model: listColumn0{}, 115 }, 116 117 { 118 scenario: "three repeated levels", 119 model: listColumn0{}, 120 }, 121 122 { 123 scenario: "nested lists", 124 model: nestedListColumn{}, 125 }, 126 127 { 128 scenario: "key-value pairs", 129 model: struct { 130 KeyValuePairs map[utf8string]utf8string 131 }{}, 132 }, 133 134 { 135 scenario: "multiple key-value pairs", 136 model: struct { 137 KeyValuePairs0 map[utf8string]utf8string 138 KeyValuePairs1 map[utf8string]utf8string 139 KeyValuePairs2 map[utf8string]utf8string 140 }{}, 141 }, 142 143 { 144 scenario: "repeated key-value pairs", 145 model: struct { 146 RepeatedKeyValuePairs []map[utf8string]utf8string 147 }{}, 148 }, 149 150 { 151 scenario: "map of repeated values", 152 model: struct { 153 MapOfRepeated map[utf8string][]utf8string 154 }{}, 155 }, 156 } 157 158 func TestReader(t *testing.T) { 159 buf := new(bytes.Buffer) 160 file := bytes.NewReader(nil) 161 162 for _, test := range readerTests { 163 t.Run(test.scenario, func(t *testing.T) { 164 const N = 42 165 166 rowType := reflect.TypeOf(test.model) 167 rowPtr := reflect.New(rowType) 168 rowZero := reflect.Zero(rowType) 169 rowValue := rowPtr.Elem() 170 171 for n := 1; n < N; n++ { 172 t.Run(fmt.Sprintf("N=%d", n), func(t *testing.T) { 173 defer buf.Reset() 174 rows := rowsOf(n, test.model) 175 176 if err := writeParquetFileWithBuffer(buf, rows); err != nil { 177 t.Fatal(err) 178 } 179 180 file.Reset(buf.Bytes()) 181 r := parquet.NewReader(file, parquet.SchemaOf(test.model)) 182 183 for i, v := range rows { 184 if err := r.Read(rowPtr.Interface()); err != nil { 185 t.Fatal(err) 186 } 187 if !reflect.DeepEqual(rowValue.Interface(), v) { 188 t.Errorf("row mismatch at index %d\nwant = %+v\ngot = %+v", i, v, rowValue.Interface()) 189 } 190 rowValue.Set(rowZero) 191 } 192 193 if err := r.Read(rowPtr.Interface()); err != io.EOF { 194 t.Errorf("expected EOF after reading all values but got: %v", err) 195 } 196 }) 197 } 198 }) 199 } 200 } 201 202 func BenchmarkReaderReadType(b *testing.B) { 203 buf := new(bytes.Buffer) 204 file := bytes.NewReader(nil) 205 206 for _, test := range readerTests { 207 b.Run(test.scenario, func(b *testing.B) { 208 defer buf.Reset() 209 rows := rowsOf(benchmarkNumRows, test.model) 210 211 if err := writeParquetFile(buf, rows); err != nil { 212 b.Fatal(err) 213 } 214 file.Reset(buf.Bytes()) 215 f, err := parquet.OpenFile(file, file.Size()) 216 if err != nil { 217 b.Fatal(err) 218 } 219 220 rowType := reflect.TypeOf(test.model) 221 rowPtr := reflect.New(rowType) 222 rowZero := reflect.Zero(rowType) 223 rowValue := rowPtr.Elem() 224 225 r := parquet.NewReader(f) 226 p := rowPtr.Interface() 227 228 benchmarkRowsPerSecond(b, func() (n int) { 229 for i := 0; i < benchmarkRowsPerStep; i++ { 230 if err := r.Read(p); err != nil { 231 if err == io.EOF { 232 r.Reset() 233 } else { 234 b.Fatal(err) 235 } 236 } 237 } 238 rowValue.Set(rowZero) 239 return benchmarkRowsPerStep 240 }) 241 242 b.SetBytes(int64(math.Ceil(benchmarkRowsPerStep * float64(file.Size()) / benchmarkNumRows))) 243 }) 244 } 245 } 246 247 func BenchmarkReaderReadRow(b *testing.B) { 248 buf := new(bytes.Buffer) 249 file := bytes.NewReader(nil) 250 251 for _, test := range readerTests { 252 b.Run(test.scenario, func(b *testing.B) { 253 defer buf.Reset() 254 rows := rowsOf(benchmarkNumRows, test.model) 255 256 if err := writeParquetFile(buf, rows); err != nil { 257 b.Fatal(err) 258 } 259 file.Reset(buf.Bytes()) 260 f, err := parquet.OpenFile(file, file.Size()) 261 if err != nil { 262 b.Fatal(err) 263 } 264 265 r := parquet.NewReader(f) 266 rowbuf := make([]parquet.Row, benchmarkRowsPerStep) 267 268 benchmarkRowsPerSecond(b, func() int { 269 n, err := r.ReadRows(rowbuf) 270 if err != nil { 271 if err == io.EOF { 272 r.Reset() 273 } else { 274 b.Fatal(err) 275 } 276 } 277 return n 278 }) 279 280 b.SetBytes(int64(math.Ceil(benchmarkRowsPerStep * float64(file.Size()) / benchmarkNumRows))) 281 }) 282 } 283 } 284 285 func TestReaderReadSubset(t *testing.T) { 286 // In this example we'll write 3 columns to the file - X, Y, and Z, but 287 // we'll only read out the X and Y columns. Returns true if all writes 288 // and reads were successful, and false otherwise. 289 type Point3D struct{ X, Y, Z int64 } 290 type Point2D struct{ X, Y int64 } 291 292 err := quickCheck(func(points3D []Point3D) bool { 293 if len(points3D) == 0 { 294 return true 295 } 296 buf := new(bytes.Buffer) 297 err := writeParquetFile(buf, makeRows(points3D)) 298 if err != nil { 299 t.Error(err) 300 return false 301 } 302 reader := parquet.NewReader(bytes.NewReader(buf.Bytes())) 303 for i := 0; ; i++ { 304 row := Point2D{} 305 err := reader.Read(&row) 306 if err != nil { 307 if err == io.EOF && i == len(points3D) { 308 break 309 } 310 t.Error(err) 311 return false 312 } 313 if row != (Point2D{X: points3D[i].X, Y: points3D[i].Y}) { 314 t.Errorf("points mismatch at row index %d: want=%v got=%v", i, points3D[i], row) 315 return false 316 } 317 } 318 return true 319 }) 320 if err != nil { 321 t.Error(err) 322 } 323 } 324 325 func TestReaderSeekToRow(t *testing.T) { 326 type rowType struct { 327 Name utf8string `parquet:",dict"` 328 } 329 330 rows := rowsOf(10, rowType{}) 331 buf := new(bytes.Buffer) 332 err := writeParquetFile(buf, rows) 333 if err != nil { 334 t.Fatal(err) 335 } 336 337 reader := parquet.NewReader(bytes.NewReader(buf.Bytes())) 338 for i := 0; i < 10; i++ { 339 if err := reader.SeekToRow(int64(i)); err != nil { 340 t.Fatalf("seek to row %d: %v", i, err) 341 } 342 343 row := new(rowType) 344 err := reader.Read(row) 345 if err != nil { 346 t.Fatalf("reading row %d: %v", i, err) 347 } 348 349 if *row != rows[i] { 350 t.Fatalf("row %d mismatch: got=%+v want=%+v", i, *row, rows[i]) 351 } 352 } 353 } 354 355 func TestSeekToRowNoDict(t *testing.T) { 356 type rowType struct { 357 Name utf8string `parquet:","` // no dictionary encoding 358 } 359 360 // write samples to in-memory buffer 361 buf := new(bytes.Buffer) 362 schema := parquet.SchemaOf(new(rowType)) 363 w := parquet.NewWriter(buf, schema) 364 sample := rowType{ 365 Name: "foo1", 366 } 367 // write two rows 368 w.Write(sample) 369 sample.Name = "foo2" 370 w.Write(sample) 371 w.Close() 372 373 // create reader 374 r := parquet.NewReader(bytes.NewReader(buf.Bytes())) 375 376 // read second row 377 r.SeekToRow(1) 378 row := new(rowType) 379 err := r.Read(row) 380 if err != nil { 381 t.Fatalf("reading row: %v", err) 382 } 383 // fmt.Println(&sample, row) 384 if *row != sample { 385 t.Fatalf("read != write") 386 } 387 } 388 389 func TestSeekToRowReadAll(t *testing.T) { 390 type rowType struct { 391 Name utf8string `parquet:",dict"` 392 } 393 394 // write samples to in-memory buffer 395 buf := new(bytes.Buffer) 396 schema := parquet.SchemaOf(new(rowType)) 397 w := parquet.NewWriter(buf, schema) 398 sample := rowType{ 399 Name: "foo1", 400 } 401 // write two rows 402 w.Write(sample) 403 sample.Name = "foo2" 404 w.Write(sample) 405 w.Close() 406 407 // create reader 408 r := parquet.NewReader(bytes.NewReader(buf.Bytes())) 409 410 // read first row 411 r.SeekToRow(0) 412 row := new(rowType) 413 err := r.Read(row) 414 if err != nil { 415 t.Fatalf("reading row: %v", err) 416 } 417 // read second row 418 r.SeekToRow(1) 419 row = new(rowType) 420 err = r.Read(row) 421 if err != nil { 422 t.Fatalf("reading row: %v", err) 423 } 424 // fmt.Println(&sample, row) 425 if *row != sample { 426 t.Fatalf("read != write") 427 } 428 } 429 430 func TestSeekToRowDictReadSecond(t *testing.T) { 431 type rowType struct { 432 Name utf8string `parquet:",dict"` 433 } 434 435 // write samples to in-memory buffer 436 buf := new(bytes.Buffer) 437 schema := parquet.SchemaOf(new(rowType)) 438 w := parquet.NewWriter(buf, schema) 439 sample := rowType{ 440 Name: "foo1", 441 } 442 // write two rows 443 w.Write(sample) 444 sample.Name = "foo2" 445 w.Write(sample) 446 w.Close() 447 448 // create reader 449 r := parquet.NewReader(bytes.NewReader(buf.Bytes())) 450 451 // read second row 452 r.SeekToRow(1) 453 row := new(rowType) 454 err := r.Read(row) 455 if err != nil { 456 t.Fatalf("reading row: %v", err) 457 } 458 // fmt.Println(&sample, row) 459 if *row != sample { 460 t.Fatalf("read != write") 461 } 462 } 463 464 func TestSeekToRowDictReadMultiplePages(t *testing.T) { 465 type rowType struct { 466 Name utf8string `parquet:",dict"` 467 } 468 469 // write samples to in-memory buffer 470 buf := new(bytes.Buffer) 471 schema := parquet.SchemaOf(new(rowType)) 472 w := parquet.NewWriter(buf, schema, &parquet.WriterConfig{ 473 PageBufferSize: 10, 474 }) 475 sample := rowType{ 476 Name: "foo1", 477 } 478 479 // write enough rows to spill over a single page 480 for i := 0; i < 10; i++ { 481 w.Write(sample) 482 } 483 sample.Name = "foo2" 484 w.Write(sample) 485 w.Close() 486 487 // create reader 488 r := parquet.NewReader(bytes.NewReader(buf.Bytes())) 489 490 // read 11th row 491 r.SeekToRow(10) 492 row := new(rowType) 493 err := r.Read(row) 494 if err != nil { 495 t.Fatalf("reading row: %v", err) 496 } 497 if *row != sample { 498 t.Fatalf("read != write") 499 } 500 }