github.com/segmentio/parquet-go@v0.0.0-20230712180008-5d42db8f0d47/parquet_go18_test.go (about) 1 //go:build go1.18 2 3 package parquet_test 4 5 import ( 6 "bytes" 7 "fmt" 8 "io" 9 "log" 10 "os" 11 "reflect" 12 "testing" 13 14 "github.com/segmentio/parquet-go" 15 "google.golang.org/protobuf/types/known/structpb" 16 ) 17 18 func ExampleReadFile() { 19 type Row struct { 20 ID int64 `parquet:"id"` 21 Name string `parquet:"name,zstd"` 22 } 23 24 ExampleWriteFile() 25 26 rows, err := parquet.ReadFile[Row]("/tmp/file.parquet") 27 if err != nil { 28 log.Fatal(err) 29 } 30 31 for _, row := range rows { 32 fmt.Printf("%d: %q\n", row.ID, row.Name) 33 } 34 35 // Output: 36 // 0: "Bob" 37 // 1: "Alice" 38 // 2: "Franky" 39 } 40 41 func ExampleWriteFile() { 42 type Row struct { 43 ID int64 `parquet:"id"` 44 Name string `parquet:"name,zstd"` 45 } 46 47 if err := parquet.WriteFile("/tmp/file.parquet", []Row{ 48 {ID: 0, Name: "Bob"}, 49 {ID: 1, Name: "Alice"}, 50 {ID: 2, Name: "Franky"}, 51 }); err != nil { 52 log.Fatal(err) 53 } 54 55 // Output: 56 } 57 58 func ExampleRead_any() { 59 type Row struct{ FirstName, LastName string } 60 61 buf := new(bytes.Buffer) 62 err := parquet.Write(buf, []Row{ 63 {FirstName: "Luke", LastName: "Skywalker"}, 64 {FirstName: "Han", LastName: "Solo"}, 65 {FirstName: "R2", LastName: "D2"}, 66 }) 67 if err != nil { 68 log.Fatal(err) 69 } 70 71 file := bytes.NewReader(buf.Bytes()) 72 73 rows, err := parquet.Read[any](file, file.Size()) 74 if err != nil { 75 log.Fatal(err) 76 } 77 78 for _, row := range rows { 79 fmt.Printf("%q\n", row) 80 } 81 82 // Output: 83 // map["FirstName":"Luke" "LastName":"Skywalker"] 84 // map["FirstName":"Han" "LastName":"Solo"] 85 // map["FirstName":"R2" "LastName":"D2"] 86 } 87 88 func ExampleWrite_any() { 89 schema := parquet.SchemaOf(struct { 90 FirstName string 91 LastName string 92 }{}) 93 94 buf := new(bytes.Buffer) 95 err := parquet.Write[any]( 96 buf, 97 []any{ 98 map[string]string{"FirstName": "Luke", "LastName": "Skywalker"}, 99 map[string]string{"FirstName": "Han", "LastName": "Solo"}, 100 map[string]string{"FirstName": "R2", "LastName": "D2"}, 101 }, 102 schema, 103 ) 104 if err != nil { 105 log.Fatal(err) 106 } 107 108 file := bytes.NewReader(buf.Bytes()) 109 110 rows, err := parquet.Read[any](file, file.Size()) 111 if err != nil { 112 log.Fatal(err) 113 } 114 115 for _, row := range rows { 116 fmt.Printf("%q\n", row) 117 } 118 119 // Output: 120 // map["FirstName":"Luke" "LastName":"Skywalker"] 121 // map["FirstName":"Han" "LastName":"Solo"] 122 // map["FirstName":"R2" "LastName":"D2"] 123 } 124 125 func ExampleSearch() { 126 type Row struct{ FirstName, LastName string } 127 128 buf := new(bytes.Buffer) 129 // The column being searched should be sorted to avoid a full scan of the 130 // column. See the section of the readme on sorting for how to sort on 131 // insertion into the parquet file using parquet.SortingColumns 132 rows := []Row{ 133 {FirstName: "C", LastName: "3PO"}, 134 {FirstName: "Han", LastName: "Solo"}, 135 {FirstName: "Leia", LastName: "Organa"}, 136 {FirstName: "Luke", LastName: "Skywalker"}, 137 {FirstName: "R2", LastName: "D2"}, 138 } 139 // The tiny page buffer size ensures we get multiple pages out of the example above. 140 w := parquet.NewGenericWriter[Row](buf, parquet.PageBufferSize(12), parquet.WriteBufferSize(0)) 141 // Need to write 1 row at a time here as writing many at once disregards PageBufferSize option. 142 for _, row := range rows { 143 _, err := w.Write([]Row{row}) 144 if err != nil { 145 log.Fatal(err) 146 } 147 } 148 err := w.Close() 149 if err != nil { 150 log.Fatal(err) 151 } 152 153 reader := bytes.NewReader(buf.Bytes()) 154 file, err := parquet.OpenFile(reader, reader.Size()) 155 if err != nil { 156 log.Fatal(err) 157 } 158 159 // Search is scoped to a single RowGroup/ColumnChunk 160 rowGroup := file.RowGroups()[0] 161 firstNameColChunk := rowGroup.ColumnChunks()[0] 162 163 found := parquet.Search(firstNameColChunk.ColumnIndex(), parquet.ValueOf("Luke"), parquet.ByteArrayType) 164 offsetIndex := firstNameColChunk.OffsetIndex() 165 fmt.Printf("numPages: %d\n", offsetIndex.NumPages()) 166 fmt.Printf("result found in page: %d\n", found) 167 if found < offsetIndex.NumPages() { 168 r := parquet.NewGenericReader[Row](file) 169 defer r.Close() 170 // Seek to the first row in the page the result was found 171 r.SeekToRow(offsetIndex.FirstRowIndex(found)) 172 result := make([]Row, 2) 173 _, _ = r.Read(result) 174 // Leia is in index 0 for the page. 175 for _, row := range result { 176 if row.FirstName == "Luke" { 177 fmt.Printf("%q\n", row) 178 } 179 } 180 } 181 182 // Output: 183 // numPages: 3 184 // result found in page: 1 185 // {"Luke" "Skywalker"} 186 } 187 188 func TestIssue360(t *testing.T) { 189 type TestType struct { 190 Key []int 191 } 192 193 schema := parquet.SchemaOf(TestType{}) 194 buffer := parquet.NewGenericBuffer[any](schema) 195 196 data := make([]any, 1) 197 data[0] = TestType{Key: []int{1}} 198 _, err := buffer.Write(data) 199 if err != nil { 200 fmt.Println("Exiting with error: ", err) 201 return 202 } 203 204 var out bytes.Buffer 205 writer := parquet.NewGenericWriter[any](&out, schema) 206 207 _, err = parquet.CopyRows(writer, buffer.Rows()) 208 if err != nil { 209 fmt.Println("Exiting with error: ", err) 210 return 211 } 212 writer.Close() 213 214 br := bytes.NewReader(out.Bytes()) 215 rows, _ := parquet.Read[any](br, br.Size()) 216 217 expect := []any{ 218 map[string]any{ 219 "Key": []any{ 220 int64(1), 221 }, 222 }, 223 } 224 225 assertRowsEqual(t, expect, rows) 226 } 227 228 func TestIssue362ParquetReadFromGenericReaders(t *testing.T) { 229 path := "testdata/dms_test_table_LOAD00000001.parquet" 230 fp, err := os.Open(path) 231 if err != nil { 232 t.Fatal(err) 233 } 234 defer fp.Close() 235 236 r1 := parquet.NewGenericReader[any](fp) 237 rows1 := make([]any, r1.NumRows()) 238 _, err = r1.Read(rows1) 239 if err != nil && err != io.EOF { 240 t.Fatal(err) 241 } 242 243 r2 := parquet.NewGenericReader[any](fp) 244 rows2 := make([]any, r2.NumRows()) 245 _, err = r2.Read(rows2) 246 if err != nil && err != io.EOF { 247 t.Fatal(err) 248 } 249 } 250 251 func TestIssue362ParquetReadFile(t *testing.T) { 252 rows1, err := parquet.ReadFile[any]("testdata/dms_test_table_LOAD00000001.parquet") 253 if err != nil { 254 t.Fatal(err) 255 } 256 257 rows2, err := parquet.ReadFile[any]("testdata/dms_test_table_LOAD00000001.parquet") 258 if err != nil { 259 t.Fatal(err) 260 } 261 262 assertRowsEqual(t, rows1, rows2) 263 } 264 265 func TestIssue368(t *testing.T) { 266 f, err := os.Open("testdata/issue368.parquet") 267 if err != nil { 268 t.Fatal(err) 269 } 270 defer f.Close() 271 272 info, err := f.Stat() 273 if err != nil { 274 t.Fatal(err) 275 } 276 277 pf, err := parquet.OpenFile(f, info.Size()) 278 if err != nil { 279 t.Fatal(err) 280 } 281 282 reader := parquet.NewGenericReader[any](pf) 283 defer reader.Close() 284 285 trs := make([]any, 1) 286 for { 287 _, err := reader.Read(trs) 288 if err != nil { 289 break 290 } 291 } 292 } 293 294 func TestIssue377(t *testing.T) { 295 type People struct { 296 Name string 297 Age int 298 } 299 300 type Nested struct { 301 P []People 302 F string 303 GF string 304 } 305 row1 := Nested{P: []People{ 306 { 307 Name: "Bob", 308 Age: 10, 309 }}} 310 ods := []Nested{ 311 row1, 312 } 313 buf := new(bytes.Buffer) 314 w := parquet.NewGenericWriter[Nested](buf) 315 _, err := w.Write(ods) 316 if err != nil { 317 t.Fatal("write error: ", err) 318 } 319 w.Close() 320 321 file := bytes.NewReader(buf.Bytes()) 322 rows, err := parquet.Read[Nested](file, file.Size()) 323 if err != nil { 324 t.Fatal("read error: ", err) 325 } 326 327 assertRowsEqual(t, rows, ods) 328 } 329 330 func TestIssue423(t *testing.T) { 331 type Inner struct { 332 Value string `parquet:","` 333 } 334 type Outer struct { 335 Label string `parquet:","` 336 Inner Inner `parquet:",json"` 337 Slice []Inner `parquet:",json"` 338 // This is the only tricky situation. Because we're delegating to json Marshaler/Unmarshaler 339 // We use the json tags for optionality. 340 Ptr *Inner `json:",omitempty" parquet:",json"` 341 342 // This tests BC behavior that slices of bytes and json strings still get written/read in a BC way. 343 String string `parquet:",json"` 344 Bytes []byte `parquet:",json"` 345 MapOfStructPb map[string]*structpb.Value `parquet:",json"` 346 StructPB *structpb.Value `parquet:",json"` 347 } 348 349 writeRows := []Outer{ 350 { 351 Label: "welp", 352 Inner: Inner{ 353 Value: "this is a string", 354 }, 355 Slice: []Inner{ 356 { 357 Value: "in a slice", 358 }, 359 }, 360 Ptr: nil, 361 String: `{"hello":"world"}`, 362 Bytes: []byte(`{"goodbye":"world"}`), 363 MapOfStructPb: map[string]*structpb.Value{ 364 "answer": structpb.NewNumberValue(42.00), 365 }, 366 StructPB: structpb.NewBoolValue(true), 367 }, 368 { 369 Label: "foxes", 370 Inner: Inner{ 371 Value: "the quick brown fox jumped over the yellow lazy dog.", 372 }, 373 Slice: []Inner{ 374 { 375 Value: "in a slice", 376 }, 377 }, 378 Ptr: &Inner{ 379 Value: "not nil", 380 }, 381 String: `{"hello":"world"}`, 382 Bytes: []byte(`{"goodbye":"world"}`), 383 MapOfStructPb: map[string]*structpb.Value{ 384 "doubleAnswer": structpb.NewNumberValue(84.00), 385 }, 386 StructPB: structpb.NewBoolValue(false), 387 }, 388 } 389 390 schema := parquet.SchemaOf(new(Outer)) 391 fmt.Println(schema.String()) 392 buf := new(bytes.Buffer) 393 w := parquet.NewGenericWriter[Outer](buf, schema) 394 _, err := w.Write(writeRows) 395 if err != nil { 396 t.Fatal("write error: ", err) 397 } 398 w.Close() 399 400 file := bytes.NewReader(buf.Bytes()) 401 readRows, err := parquet.Read[Outer](file, file.Size()) 402 if err != nil { 403 t.Fatal("read error: ", err) 404 } 405 406 assertRowsEqual(t, writeRows, readRows) 407 } 408 409 func TestReadFileGenericMultipleRowGroupsMultiplePages(t *testing.T) { 410 type MyRow struct { 411 ID [16]byte `parquet:"id,delta,uuid"` 412 File string `parquet:"file,dict,zstd"` 413 Index int64 `parquet:"index,delta,zstd"` 414 } 415 416 numRows := 20_000 417 maxPageBytes := 5000 418 419 tmp, err := os.CreateTemp("/tmp", "*.parquet") 420 if err != nil { 421 t.Fatal("os.CreateTemp: ", err) 422 } 423 path := tmp.Name() 424 defer os.Remove(path) 425 t.Log("file:", path) 426 427 // The page buffer size ensures we get multiple pages out of this example. 428 w := parquet.NewGenericWriter[MyRow](tmp, parquet.PageBufferSize(maxPageBytes)) 429 // Need to write 1 row at a time here as writing many at once disregards PageBufferSize option. 430 for i := 0; i < numRows; i++ { 431 row := MyRow{ 432 ID: [16]byte{15: byte(i)}, 433 File: "hi" + fmt.Sprint(i), 434 Index: int64(i), 435 } 436 _, err := w.Write([]MyRow{row}) 437 if err != nil { 438 t.Fatal("w.Write: ", err) 439 } 440 // Flush writes rows as row group. 4 total (20k/5k) in this file. 441 if (i+1)%maxPageBytes == 0 { 442 err = w.Flush() 443 if err != nil { 444 t.Fatal("w.Flush: ", err) 445 } 446 } 447 } 448 err = w.Close() 449 if err != nil { 450 t.Fatal("w.Close: ", err) 451 } 452 err = tmp.Close() 453 if err != nil { 454 t.Fatal("tmp.Close: ", err) 455 } 456 457 rows, err := parquet.ReadFile[MyRow](path) 458 if err != nil { 459 t.Fatal("parquet.ReadFile: ", err) 460 } 461 462 if len(rows) != numRows { 463 t.Fatalf("not enough values were read: want=%d got=%d", len(rows), numRows) 464 } 465 for i, row := range rows { 466 id := [16]byte{15: byte(i)} 467 file := "hi" + fmt.Sprint(i) 468 index := int64(i) 469 470 if row.ID != id || row.File != file || row.Index != index { 471 t.Fatalf("rows mismatch at index: %d got: %+v", i, row) 472 } 473 } 474 } 475 476 func assertRowsEqual[T any](t *testing.T, rows1, rows2 []T) { 477 if !reflect.DeepEqual(rows1, rows2) { 478 t.Error("rows mismatch") 479 480 t.Log("want:") 481 logRows(t, rows1) 482 483 t.Log("got:") 484 logRows(t, rows2) 485 } 486 } 487 488 func logRows[T any](t *testing.T, rows []T) { 489 for _, row := range rows { 490 t.Logf(". %#v\n", row) 491 } 492 }