github.com/segmentio/parquet-go@v0.0.0-20230712180008-5d42db8f0d47/reader_test.go (about) 1 package parquet_test 2 3 import ( 4 "bytes" 5 "fmt" 6 "io" 7 "math" 8 "math/rand" 9 "reflect" 10 "testing" 11 12 "github.com/segmentio/parquet-go" 13 "github.com/segmentio/parquet-go/internal/quick" 14 ) 15 16 func rowsOf(numRows int, model interface{}) rows { 17 prng := rand.New(rand.NewSource(0)) 18 return randomRowsOf(prng, numRows, model) 19 } 20 21 func randomRowsOf(prng *rand.Rand, numRows int, model interface{}) rows { 22 typ := reflect.TypeOf(model) 23 rows := make(rows, numRows) 24 makeValue := quick.MakeValueFuncOf(typ) 25 for i := range rows { 26 v := reflect.New(typ).Elem() 27 makeValue(v, prng) 28 rows[i] = v.Interface() 29 } 30 return rows 31 } 32 33 var readerTests = []struct { 34 scenario string 35 model interface{} 36 }{ 37 { 38 scenario: "BOOLEAN", 39 model: booleanColumn{}, 40 }, 41 42 { 43 scenario: "INT32", 44 model: int32Column{}, 45 }, 46 47 { 48 scenario: "INT64", 49 model: int64Column{}, 50 }, 51 52 { 53 scenario: "INT96", 54 model: int96Column{}, 55 }, 56 57 { 58 scenario: "FLOAT", 59 model: floatColumn{}, 60 }, 61 62 { 63 scenario: "DOUBLE", 64 model: doubleColumn{}, 65 }, 66 67 { 68 scenario: "BYTE_ARRAY", 69 model: byteArrayColumn{}, 70 }, 71 72 { 73 scenario: "FIXED_LEN_BYTE_ARRAY", 74 model: fixedLenByteArrayColumn{}, 75 }, 76 77 { 78 scenario: "STRING", 79 model: stringColumn{}, 80 }, 81 82 { 83 scenario: "STRING (dict)", 84 model: indexedStringColumn{}, 85 }, 86 87 { 88 scenario: "UUID", 89 model: uuidColumn{}, 90 }, 91 92 { 93 scenario: "time.Time", 94 model: timeColumn{}, 95 }, 96 97 { 98 scenario: "time.Time in ms", 99 model: timeInMillisColumn{}, 100 }, 101 102 { 103 scenario: "DECIMAL", 104 model: decimalColumn{}, 105 }, 106 107 { 108 scenario: "AddressBook", 109 model: addressBook{}, 110 }, 111 112 { 113 scenario: "one optional level", 114 model: listColumn2{}, 115 }, 116 117 { 118 scenario: "one repeated level", 119 model: listColumn1{}, 120 }, 121 122 { 123 scenario: "two repeated levels", 124 model: listColumn0{}, 125 }, 126 127 { 128 scenario: "three repeated levels", 129 model: listColumn0{}, 130 }, 131 132 { 133 scenario: "nested lists", 134 model: nestedListColumn{}, 135 }, 136 137 { 138 scenario: "key-value pairs", 139 model: struct { 140 KeyValuePairs map[utf8string]utf8string 141 }{}, 142 }, 143 144 { 145 scenario: "multiple key-value pairs", 146 model: struct { 147 KeyValuePairs0 map[utf8string]utf8string 148 KeyValuePairs1 map[utf8string]utf8string 149 KeyValuePairs2 map[utf8string]utf8string 150 }{}, 151 }, 152 153 { 154 scenario: "repeated key-value pairs", 155 model: struct { 156 RepeatedKeyValuePairs []map[utf8string]utf8string 157 }{}, 158 }, 159 160 { 161 scenario: "map of repeated values", 162 model: struct { 163 MapOfRepeated map[utf8string][]utf8string 164 }{}, 165 }, 166 } 167 168 func TestReader(t *testing.T) { 169 buf := new(bytes.Buffer) 170 file := bytes.NewReader(nil) 171 172 for _, test := range readerTests { 173 t.Run(test.scenario, func(t *testing.T) { 174 const N = 42 175 176 rowType := reflect.TypeOf(test.model) 177 rowPtr := reflect.New(rowType) 178 rowZero := reflect.Zero(rowType) 179 rowValue := rowPtr.Elem() 180 181 for n := 1; n < N; n++ { 182 t.Run(fmt.Sprintf("N=%d", n), func(t *testing.T) { 183 defer buf.Reset() 184 rows := rowsOf(n, test.model) 185 186 if err := writeParquetFileWithBuffer(buf, rows); err != nil { 187 t.Fatal(err) 188 } 189 190 file.Reset(buf.Bytes()) 191 r := parquet.NewReader(file, parquet.SchemaOf(test.model)) 192 193 for i, v := range rows { 194 if err := r.Read(rowPtr.Interface()); err != nil { 195 t.Fatal(err) 196 } 197 if !reflect.DeepEqual(rowValue.Interface(), v) { 198 t.Errorf("row mismatch at index %d\nwant = %+v\ngot = %+v", i, v, rowValue.Interface()) 199 } 200 rowValue.Set(rowZero) 201 } 202 203 if err := r.Read(rowPtr.Interface()); err != io.EOF { 204 t.Errorf("expected EOF after reading all values but got: %v", err) 205 } 206 }) 207 } 208 }) 209 } 210 } 211 212 func BenchmarkReaderReadType(b *testing.B) { 213 buf := new(bytes.Buffer) 214 file := bytes.NewReader(nil) 215 216 for _, test := range readerTests { 217 b.Run(test.scenario, func(b *testing.B) { 218 defer buf.Reset() 219 rows := rowsOf(benchmarkNumRows, test.model) 220 221 if err := writeParquetFile(buf, rows); err != nil { 222 b.Fatal(err) 223 } 224 file.Reset(buf.Bytes()) 225 f, err := parquet.OpenFile(file, file.Size()) 226 if err != nil { 227 b.Fatal(err) 228 } 229 230 rowType := reflect.TypeOf(test.model) 231 rowPtr := reflect.New(rowType) 232 rowZero := reflect.Zero(rowType) 233 rowValue := rowPtr.Elem() 234 235 r := parquet.NewReader(f) 236 p := rowPtr.Interface() 237 238 benchmarkRowsPerSecond(b, func() (n int) { 239 for i := 0; i < benchmarkRowsPerStep; i++ { 240 if err := r.Read(p); err != nil { 241 if err == io.EOF { 242 r.Reset() 243 } else { 244 b.Fatal(err) 245 } 246 } 247 } 248 rowValue.Set(rowZero) 249 return benchmarkRowsPerStep 250 }) 251 252 b.SetBytes(int64(math.Ceil(benchmarkRowsPerStep * float64(file.Size()) / benchmarkNumRows))) 253 }) 254 } 255 } 256 257 func BenchmarkReaderReadRow(b *testing.B) { 258 buf := new(bytes.Buffer) 259 file := bytes.NewReader(nil) 260 261 for _, test := range readerTests { 262 b.Run(test.scenario, func(b *testing.B) { 263 defer buf.Reset() 264 rows := rowsOf(benchmarkNumRows, test.model) 265 266 if err := writeParquetFile(buf, rows); err != nil { 267 b.Fatal(err) 268 } 269 file.Reset(buf.Bytes()) 270 f, err := parquet.OpenFile(file, file.Size()) 271 if err != nil { 272 b.Fatal(err) 273 } 274 275 r := parquet.NewReader(f) 276 rowbuf := make([]parquet.Row, benchmarkRowsPerStep) 277 278 benchmarkRowsPerSecond(b, func() int { 279 n, err := r.ReadRows(rowbuf) 280 if err != nil { 281 if err == io.EOF { 282 r.Reset() 283 } else { 284 b.Fatal(err) 285 } 286 } 287 return n 288 }) 289 290 b.SetBytes(int64(math.Ceil(benchmarkRowsPerStep * float64(file.Size()) / benchmarkNumRows))) 291 }) 292 } 293 } 294 295 func TestReaderReadSubset(t *testing.T) { 296 // In this example we'll write 3 columns to the file - X, Y, and Z, but 297 // we'll only read out the X and Y columns. Returns true if all writes 298 // and reads were successful, and false otherwise. 299 type Point3D struct{ X, Y, Z int64 } 300 type Point2D struct{ X, Y int64 } 301 302 err := quickCheck(func(points3D []Point3D) bool { 303 if len(points3D) == 0 { 304 return true 305 } 306 buf := new(bytes.Buffer) 307 err := writeParquetFile(buf, makeRows(points3D)) 308 if err != nil { 309 t.Error(err) 310 return false 311 } 312 reader := parquet.NewReader(bytes.NewReader(buf.Bytes())) 313 for i := 0; ; i++ { 314 row := Point2D{} 315 err := reader.Read(&row) 316 if err != nil { 317 if err == io.EOF && i == len(points3D) { 318 break 319 } 320 t.Error(err) 321 return false 322 } 323 if row != (Point2D{X: points3D[i].X, Y: points3D[i].Y}) { 324 t.Errorf("points mismatch at row index %d: want=%v got=%v", i, points3D[i], row) 325 return false 326 } 327 } 328 return true 329 }) 330 if err != nil { 331 t.Error(err) 332 } 333 } 334 335 func TestReaderSeekToRow(t *testing.T) { 336 type rowType struct { 337 Name utf8string `parquet:",dict"` 338 } 339 340 rows := rowsOf(10, rowType{}) 341 buf := new(bytes.Buffer) 342 err := writeParquetFile(buf, rows) 343 if err != nil { 344 t.Fatal(err) 345 } 346 347 reader := parquet.NewReader(bytes.NewReader(buf.Bytes())) 348 for i := 0; i < 10; i++ { 349 if err := reader.SeekToRow(int64(i)); err != nil { 350 t.Fatalf("seek to row %d: %v", i, err) 351 } 352 353 row := new(rowType) 354 err := reader.Read(row) 355 if err != nil { 356 t.Fatalf("reading row %d: %v", i, err) 357 } 358 359 if *row != rows[i] { 360 t.Fatalf("row %d mismatch: got=%+v want=%+v", i, *row, rows[i]) 361 } 362 } 363 } 364 365 func TestSeekToRowNoDict(t *testing.T) { 366 type rowType struct { 367 Name utf8string `parquet:","` // no dictionary encoding 368 } 369 370 // write samples to in-memory buffer 371 buf := new(bytes.Buffer) 372 schema := parquet.SchemaOf(new(rowType)) 373 w := parquet.NewWriter(buf, schema) 374 sample := rowType{ 375 Name: "foo1", 376 } 377 // write two rows 378 w.Write(sample) 379 sample.Name = "foo2" 380 w.Write(sample) 381 w.Close() 382 383 // create reader 384 r := parquet.NewReader(bytes.NewReader(buf.Bytes())) 385 386 // read second row 387 r.SeekToRow(1) 388 row := new(rowType) 389 err := r.Read(row) 390 if err != nil { 391 t.Fatalf("reading row: %v", err) 392 } 393 // fmt.Println(&sample, row) 394 if *row != sample { 395 t.Fatalf("read != write") 396 } 397 } 398 399 func TestSeekToRowReadAll(t *testing.T) { 400 type rowType struct { 401 Name utf8string `parquet:",dict"` 402 } 403 404 // write samples to in-memory buffer 405 buf := new(bytes.Buffer) 406 schema := parquet.SchemaOf(new(rowType)) 407 w := parquet.NewWriter(buf, schema) 408 sample := rowType{ 409 Name: "foo1", 410 } 411 // write two rows 412 w.Write(sample) 413 sample.Name = "foo2" 414 w.Write(sample) 415 w.Close() 416 417 // create reader 418 r := parquet.NewReader(bytes.NewReader(buf.Bytes())) 419 420 // read first row 421 r.SeekToRow(0) 422 row := new(rowType) 423 err := r.Read(row) 424 if err != nil { 425 t.Fatalf("reading row: %v", err) 426 } 427 // read second row 428 r.SeekToRow(1) 429 row = new(rowType) 430 err = r.Read(row) 431 if err != nil { 432 t.Fatalf("reading row: %v", err) 433 } 434 // fmt.Println(&sample, row) 435 if *row != sample { 436 t.Fatalf("read != write") 437 } 438 } 439 440 func TestSeekToRowDictReadSecond(t *testing.T) { 441 type rowType struct { 442 Name utf8string `parquet:",dict"` 443 } 444 445 // write samples to in-memory buffer 446 buf := new(bytes.Buffer) 447 schema := parquet.SchemaOf(new(rowType)) 448 w := parquet.NewWriter(buf, schema) 449 sample := rowType{ 450 Name: "foo1", 451 } 452 // write two rows 453 w.Write(sample) 454 sample.Name = "foo2" 455 w.Write(sample) 456 w.Close() 457 458 // create reader 459 r := parquet.NewReader(bytes.NewReader(buf.Bytes())) 460 461 // read second row 462 r.SeekToRow(1) 463 row := new(rowType) 464 err := r.Read(row) 465 if err != nil { 466 t.Fatalf("reading row: %v", err) 467 } 468 // fmt.Println(&sample, row) 469 if *row != sample { 470 t.Fatalf("read != write") 471 } 472 } 473 474 func TestSeekToRowDictReadMultiplePages(t *testing.T) { 475 type rowType struct { 476 Name utf8string `parquet:",dict"` 477 } 478 479 // write samples to in-memory buffer 480 buf := new(bytes.Buffer) 481 schema := parquet.SchemaOf(new(rowType)) 482 w := parquet.NewWriter(buf, schema, &parquet.WriterConfig{ 483 PageBufferSize: 10, 484 }) 485 sample := rowType{ 486 Name: "foo1", 487 } 488 489 // write enough rows to spill over a single page 490 for i := 0; i < 10; i++ { 491 w.Write(sample) 492 } 493 sample.Name = "foo2" 494 w.Write(sample) 495 w.Close() 496 497 // create reader 498 r := parquet.NewReader(bytes.NewReader(buf.Bytes())) 499 500 // read 11th row 501 r.SeekToRow(10) 502 row := new(rowType) 503 err := r.Read(row) 504 if err != nil { 505 t.Fatalf("reading row: %v", err) 506 } 507 if *row != sample { 508 t.Fatalf("read != write") 509 } 510 }