github.com/vc42/parquet-go@v0.0.0-20240320194221-1a9adb5f23f5/writer_test.go (about) 1 package parquet_test 2 3 import ( 4 "bytes" 5 "fmt" 6 "os" 7 "os/exec" 8 "strings" 9 "testing" 10 11 "github.com/google/uuid" 12 "github.com/hexops/gotextdiff" 13 "github.com/hexops/gotextdiff/myers" 14 "github.com/hexops/gotextdiff/span" 15 "github.com/vc42/parquet-go" 16 "github.com/vc42/parquet-go/compress" 17 ) 18 19 const ( 20 v1 = 1 21 v2 = 2 22 ) 23 24 func scanParquetFile(f *os.File) error { 25 s, err := f.Stat() 26 if err != nil { 27 return err 28 } 29 30 p, err := parquet.OpenFile(f, s.Size()) 31 if err != nil { 32 return err 33 } 34 35 return scanParquetValues(p.Root()) 36 } 37 38 func scanParquetValues(col *parquet.Column) error { 39 return forEachColumnValue(col, func(leaf *parquet.Column, value parquet.Value) error { 40 fmt.Printf("%s > %+v\n", strings.Join(leaf.Path(), "."), value) 41 return nil 42 }) 43 } 44 45 func generateParquetFile(rows rows, options ...parquet.WriterOption) ([]byte, error) { 46 tmp, err := os.CreateTemp("/tmp", "*.parquet") 47 if err != nil { 48 return nil, err 49 } 50 defer tmp.Close() 51 path := tmp.Name() 52 defer os.Remove(path) 53 //fmt.Println(path) 54 55 writerOptions := []parquet.WriterOption{parquet.PageBufferSize(20)} 56 writerOptions = append(writerOptions, options...) 57 58 if err := writeParquetFile(tmp, rows, writerOptions...); err != nil { 59 return nil, err 60 } 61 62 if err := scanParquetFile(tmp); err != nil { 63 return nil, err 64 } 65 66 return parquetTools("dump", path) 67 } 68 69 type firstAndLastName struct { 70 FirstName string `parquet:"first_name,dict,zstd"` 71 LastName string `parquet:"last_name,delta,zstd"` 72 } 73 74 type timeseries struct { 75 Name string `parquet:"name,dict"` 76 Timestamp int64 `parquet:"timestamp,delta"` 77 Value float64 `parquet:"value"` 78 } 79 80 type event struct { 81 Name string `parquet:"name,dict"` 82 Type string `parquet:"-"` 83 Value float64 `parquet:"value"` 84 Category string `parquet:"-"` 85 } 86 87 var writerTests = []struct { 88 scenario string 89 version int 90 codec compress.Codec 91 rows []interface{} 92 dump string 93 }{ 94 { 95 scenario: "page v1 with dictionary encoding", 96 version: v1, 97 rows: []interface{}{ 98 &firstAndLastName{FirstName: "Han", LastName: "Solo"}, 99 &firstAndLastName{FirstName: "Leia", LastName: "Skywalker"}, 100 &firstAndLastName{FirstName: "Luke", LastName: "Skywalker"}, 101 }, 102 dump: `row group 0 103 -------------------------------------------------------------------------------- 104 first_name: BINARY ZSTD DO:4 FPO:55 SZ:123/96/0.78 VC:3 ENC:PLAIN,RLE_DICTIONARY ST:[no stats for this column] 105 last_name: BINARY ZSTD DO:0 FPO:127 SZ:127/121/0.95 VC:3 ENC:DELTA_BYTE_ARRAY ST:[no stats for this column] 106 107 first_name TV=3 RL=0 DL=0 DS: 3 DE:PLAIN 108 ---------------------------------------------------------------------------- 109 page 0: DLE:RLE RLE:RLE VLE:RLE_DICTIONARY ST:[no stats for this column] CRC:[PAGE CORRUPT] SZ:5 VC:2 110 page 1: DLE:RLE RLE:RLE VLE:RLE_DICTIONARY ST:[no stats for this column] CRC:[PAGE CORRUPT] SZ:3 VC:1 111 112 last_name TV=3 RL=0 DL=0 113 ---------------------------------------------------------------------------- 114 page 0: DLE:RLE RLE:RLE VLE:DELTA_BYTE_ARRAY ST:[no stats for this column] CRC:[PAGE CORRUPT] SZ:56 VC:2 115 page 1: DLE:RLE RLE:RLE VLE:DELTA_BYTE_ARRAY ST:[no stats for this column] CRC:[PAGE CORRUPT] SZ:19 VC:1 116 117 BINARY first_name 118 -------------------------------------------------------------------------------- 119 *** row group 1 of 1, values 1 to 3 *** 120 value 1: R:0 D:0 V:Han 121 value 2: R:0 D:0 V:Leia 122 value 3: R:0 D:0 V:Luke 123 124 BINARY last_name 125 -------------------------------------------------------------------------------- 126 *** row group 1 of 1, values 1 to 3 *** 127 value 1: R:0 D:0 V:Solo 128 value 2: R:0 D:0 V:Skywalker 129 value 3: R:0 D:0 V:Skywalker 130 `, 131 }, 132 133 { // same as the previous test but uses page v2 where data pages aren't compressed 134 scenario: "page v2 with dictionary encoding", 135 version: v2, 136 rows: []interface{}{ 137 &firstAndLastName{FirstName: "Han", LastName: "Solo"}, 138 &firstAndLastName{FirstName: "Leia", LastName: "Skywalker"}, 139 &firstAndLastName{FirstName: "Luke", LastName: "Skywalker"}, 140 }, 141 dump: `row group 0 142 -------------------------------------------------------------------------------- 143 first_name: BINARY ZSTD DO:4 FPO:55 SZ:115/106/0.92 VC:3 ENC:RLE_DICTIONARY,PLAIN ST:[no stats for this column] 144 last_name: BINARY ZSTD DO:0 FPO:119 SZ:137/131/0.96 VC:3 ENC:DELTA_BYTE_ARRAY ST:[no stats for this column] 145 146 first_name TV=3 RL=0 DL=0 DS: 3 DE:PLAIN 147 ---------------------------------------------------------------------------- 148 page 0: DLE:RLE RLE:RLE VLE:RLE_DICTIONARY ST:[no stats for this column] SZ:5 VC:2 149 page 1: DLE:RLE RLE:RLE VLE:RLE_DICTIONARY ST:[no stats for this column] SZ:3 VC:1 150 151 last_name TV=3 RL=0 DL=0 152 ---------------------------------------------------------------------------- 153 page 0: DLE:RLE RLE:RLE VLE:DELTA_BYTE_ARRAY ST:[no stats for this column] SZ:56 VC:2 154 page 1: DLE:RLE RLE:RLE VLE:DELTA_BYTE_ARRAY ST:[no stats for this column] SZ:19 VC:1 155 156 BINARY first_name 157 -------------------------------------------------------------------------------- 158 *** row group 1 of 1, values 1 to 3 *** 159 value 1: R:0 D:0 V:Han 160 value 2: R:0 D:0 V:Leia 161 value 3: R:0 D:0 V:Luke 162 163 BINARY last_name 164 -------------------------------------------------------------------------------- 165 *** row group 1 of 1, values 1 to 3 *** 166 value 1: R:0 D:0 V:Solo 167 value 2: R:0 D:0 V:Skywalker 168 value 3: R:0 D:0 V:Skywalker 169 `, 170 }, 171 172 { 173 scenario: "timeseries with delta encoding", 174 version: v2, 175 codec: &parquet.Gzip, 176 rows: []interface{}{ 177 timeseries{Name: "http_request_total", Timestamp: 1639444033, Value: 100}, 178 timeseries{Name: "http_request_total", Timestamp: 1639444058, Value: 0}, 179 timeseries{Name: "http_request_total", Timestamp: 1639444085, Value: 42}, 180 timeseries{Name: "http_request_total", Timestamp: 1639444093, Value: 1}, 181 timeseries{Name: "http_request_total", Timestamp: 1639444101, Value: 2}, 182 timeseries{Name: "http_request_total", Timestamp: 1639444108, Value: 5}, 183 timeseries{Name: "http_request_total", Timestamp: 1639444133, Value: 4}, 184 timeseries{Name: "http_request_total", Timestamp: 1639444137, Value: 5}, 185 timeseries{Name: "http_request_total", Timestamp: 1639444141, Value: 6}, 186 timeseries{Name: "http_request_total", Timestamp: 1639444144, Value: 10}, 187 }, 188 dump: `row group 0 189 -------------------------------------------------------------------------------- 190 name: BINARY GZIP DO:4 FPO:70 SZ:216/191/0.88 VC:10 ENC:PLAIN,RLE_DICTIONARY ST:[no stats for this column] 191 timestamp: INT64 GZIP DO:0 FPO:220 SZ:299/550/1.84 VC:10 ENC:DELTA_BINARY_PACKED ST:[no stats for this column] 192 value: DOUBLE GZIP DO:0 FPO:519 SZ:292/192/0.66 VC:10 ENC:PLAIN ST:[no stats for this column] 193 194 name TV=10 RL=0 DL=0 DS: 1 DE:PLAIN 195 ---------------------------------------------------------------------------- 196 page 0: DLE:RLE RLE:RLE VLE:RLE_DICTIONARY ST:[no stats for this column] SZ:2 VC:2 197 page 1: DLE:RLE RLE:RLE VLE:RLE_DICTIONARY ST:[no stats for this column] SZ:2 VC:2 198 page 2: DLE:RLE RLE:RLE VLE:RLE_DICTIONARY ST:[no stats for this column] SZ:2 VC:2 199 page 3: DLE:RLE RLE:RLE VLE:RLE_DICTIONARY ST:[no stats for this column] SZ:2 VC:2 200 page 4: DLE:RLE RLE:RLE VLE:RLE_DICTIONARY ST:[no stats for this column] SZ:2 VC:2 201 202 timestamp TV=10 RL=0 DL=0 203 ---------------------------------------------------------------------------- 204 page 0: DLE:RLE RLE:RLE VLE:DELTA_BINARY_PACKED ST:[no stats for this column] SZ:142 VC:3 205 page 1: DLE:RLE RLE:RLE VLE:DELTA_BINARY_PACKED ST:[no stats for this column] SZ:142 VC:3 206 page 2: DLE:RLE RLE:RLE VLE:DELTA_BINARY_PACKED ST:[no stats for this column] SZ:142 VC:3 207 page 3: DLE:RLE RLE:RLE VLE:DELTA_BINARY_PACKED ST:[no stats for this column] SZ:9 VC:1 208 209 value TV=10 RL=0 DL=0 210 ---------------------------------------------------------------------------- 211 page 0: DLE:RLE RLE:RLE VLE:PLAIN ST:[no stats for this column] SZ:24 VC:3 212 page 1: DLE:RLE RLE:RLE VLE:PLAIN ST:[no stats for this column] SZ:24 VC:3 213 page 2: DLE:RLE RLE:RLE VLE:PLAIN ST:[no stats for this column] SZ:24 VC:3 214 page 3: DLE:RLE RLE:RLE VLE:PLAIN ST:[no stats for this column] SZ:8 VC:1 215 216 BINARY name 217 -------------------------------------------------------------------------------- 218 *** row group 1 of 1, values 1 to 10 *** 219 value 1: R:0 D:0 V:http_request_total 220 value 2: R:0 D:0 V:http_request_total 221 value 3: R:0 D:0 V:http_request_total 222 value 4: R:0 D:0 V:http_request_total 223 value 5: R:0 D:0 V:http_request_total 224 value 6: R:0 D:0 V:http_request_total 225 value 7: R:0 D:0 V:http_request_total 226 value 8: R:0 D:0 V:http_request_total 227 value 9: R:0 D:0 V:http_request_total 228 value 10: R:0 D:0 V:http_request_total 229 230 INT64 timestamp 231 -------------------------------------------------------------------------------- 232 *** row group 1 of 1, values 1 to 10 *** 233 value 1: R:0 D:0 V:1639444033 234 value 2: R:0 D:0 V:1639444058 235 value 3: R:0 D:0 V:1639444085 236 value 4: R:0 D:0 V:1639444093 237 value 5: R:0 D:0 V:1639444101 238 value 6: R:0 D:0 V:1639444108 239 value 7: R:0 D:0 V:1639444133 240 value 8: R:0 D:0 V:1639444137 241 value 9: R:0 D:0 V:1639444141 242 value 10: R:0 D:0 V:1639444144 243 244 DOUBLE value 245 -------------------------------------------------------------------------------- 246 *** row group 1 of 1, values 1 to 10 *** 247 value 1: R:0 D:0 V:100.0 248 value 2: R:0 D:0 V:0.0 249 value 3: R:0 D:0 V:42.0 250 value 4: R:0 D:0 V:1.0 251 value 5: R:0 D:0 V:2.0 252 value 6: R:0 D:0 V:5.0 253 value 7: R:0 D:0 V:4.0 254 value 8: R:0 D:0 V:5.0 255 value 9: R:0 D:0 V:6.0 256 value 10: R:0 D:0 V:10.0 257 `, 258 }, 259 260 { 261 scenario: "example from the twitter blog (v1)", 262 version: v1, 263 rows: []interface{}{ 264 AddressBook{ 265 Owner: "Julien Le Dem", 266 OwnerPhoneNumbers: []string{ 267 "555 123 4567", 268 "555 666 1337", 269 }, 270 Contacts: []Contact{ 271 { 272 Name: "Dmitriy Ryaboy", 273 PhoneNumber: "555 987 6543", 274 }, 275 { 276 Name: "Chris Aniszczyk", 277 }, 278 }, 279 }, 280 AddressBook{ 281 Owner: "A. Nonymous", 282 OwnerPhoneNumbers: nil, 283 }, 284 }, 285 286 dump: `row group 0 287 -------------------------------------------------------------------------------- 288 owner: BINARY ZSTD DO:0 FPO:4 SZ:81/73/0.90 VC:2 ENC:DELTA_LENGTH_BYTE_ARRAY ST:[no stats for this column] 289 ownerPhoneNumbers: BINARY GZIP DO:0 FPO:85 SZ:179/129/0.72 VC:3 ENC:DELTA_LENGTH_BYTE_ARRAY,RLE ST:[no stats for this column] 290 contacts: 291 .name: BINARY UNCOMPRESSED DO:0 FPO:264 SZ:138/138/1.00 VC:3 ENC:DELTA_LENGTH_BYTE_ARRAY,RLE ST:[no stats for this column] 292 .phoneNumber: BINARY ZSTD DO:0 FPO:402 SZ:113/95/0.84 VC:3 ENC:DELTA_LENGTH_BYTE_ARRAY,RLE ST:[no stats for this column] 293 294 owner TV=2 RL=0 DL=0 295 ---------------------------------------------------------------------------- 296 page 0: DLE:RLE RLE:RLE VLE:DELTA_LENGTH_BYTE_ARRAY ST:[no stats for this column] CRC:[PAGE CORRUPT] SZ:50 VC:2 297 298 ownerPhoneNumbers TV=3 RL=1 DL=1 299 ---------------------------------------------------------------------------- 300 page 0: DLE:RLE RLE:RLE VLE:DELTA_LENGTH_BYTE_ARRAY ST:[no stats for this column] CRC:[PAGE CORRUPT] SZ:64 VC:2 301 page 1: DLE:RLE RLE:RLE VLE:DELTA_LENGTH_BYTE_ARRAY ST:[no stats for this column] CRC:[PAGE CORRUPT] SZ:17 VC:1 302 303 contacts.name TV=3 RL=1 DL=1 304 ---------------------------------------------------------------------------- 305 page 0: DLE:RLE RLE:RLE VLE:DELTA_LENGTH_BYTE_ARRAY ST:[no stats for this column] CRC:[verified] SZ:73 VC:2 306 page 1: DLE:RLE RLE:RLE VLE:DELTA_LENGTH_BYTE_ARRAY ST:[no stats for this column] CRC:[verified] SZ:17 VC:1 307 308 contacts.phoneNumber TV=3 RL=1 DL=2 309 ---------------------------------------------------------------------------- 310 page 0: DLE:RLE RLE:RLE VLE:DELTA_LENGTH_BYTE_ARRAY ST:[no stats for this column] CRC:[PAGE CORRUPT] SZ:33 VC:2 311 page 1: DLE:RLE RLE:RLE VLE:DELTA_LENGTH_BYTE_ARRAY ST:[no stats for this column] CRC:[PAGE CORRUPT] SZ:17 VC:1 312 313 BINARY owner 314 -------------------------------------------------------------------------------- 315 *** row group 1 of 1, values 1 to 2 *** 316 value 1: R:0 D:0 V:Julien Le Dem 317 value 2: R:0 D:0 V:A. Nonymous 318 319 BINARY ownerPhoneNumbers 320 -------------------------------------------------------------------------------- 321 *** row group 1 of 1, values 1 to 3 *** 322 value 1: R:0 D:1 V:555 123 4567 323 value 2: R:1 D:1 V:555 666 1337 324 value 3: R:0 D:0 V:<null> 325 326 BINARY contacts.name 327 -------------------------------------------------------------------------------- 328 *** row group 1 of 1, values 1 to 3 *** 329 value 1: R:0 D:1 V:Dmitriy Ryaboy 330 value 2: R:1 D:1 V:Chris Aniszczyk 331 value 3: R:0 D:0 V:<null> 332 333 BINARY contacts.phoneNumber 334 -------------------------------------------------------------------------------- 335 *** row group 1 of 1, values 1 to 3 *** 336 value 1: R:0 D:2 V:555 987 6543 337 value 2: R:1 D:1 V:<null> 338 value 3: R:0 D:0 V:<null> 339 `, 340 }, 341 342 { 343 scenario: "example from the twitter blog (v2)", 344 version: v2, 345 rows: []interface{}{ 346 AddressBook{ 347 Owner: "Julien Le Dem", 348 OwnerPhoneNumbers: []string{ 349 "555 123 4567", 350 "555 666 1337", 351 }, 352 Contacts: []Contact{ 353 { 354 Name: "Dmitriy Ryaboy", 355 PhoneNumber: "555 987 6543", 356 }, 357 { 358 Name: "Chris Aniszczyk", 359 }, 360 }, 361 }, 362 AddressBook{ 363 Owner: "A. Nonymous", 364 OwnerPhoneNumbers: nil, 365 }, 366 }, 367 368 dump: `row group 0 369 -------------------------------------------------------------------------------- 370 owner: BINARY ZSTD DO:0 FPO:4 SZ:86/78/0.91 VC:2 ENC:DELTA_LENGTH_BYTE_ARRAY ST:[no stats for this column] 371 ownerPhoneNumbers: BINARY GZIP DO:0 FPO:90 SZ:172/122/0.71 VC:3 ENC:DELTA_LENGTH_BYTE_ARRAY,RLE ST:[no stats for this column] 372 contacts: 373 .name: BINARY UNCOMPRESSED DO:0 FPO:262 SZ:132/132/1.00 VC:3 ENC:DELTA_LENGTH_BYTE_ARRAY,RLE ST:[no stats for this column] 374 .phoneNumber: BINARY ZSTD DO:0 FPO:394 SZ:108/90/0.83 VC:3 ENC:DELTA_LENGTH_BYTE_ARRAY,RLE ST:[no stats for this column] 375 376 owner TV=2 RL=0 DL=0 377 ---------------------------------------------------------------------------- 378 page 0: DLE:RLE RLE:RLE VLE:DELTA_LENGTH_BYTE_ARRAY ST:[no stats for this column] SZ:50 VC:2 379 380 ownerPhoneNumbers TV=3 RL=1 DL=1 381 ---------------------------------------------------------------------------- 382 page 0: DLE:RLE RLE:RLE VLE:DELTA_LENGTH_BYTE_ARRAY ST:[no stats for this column] SZ:56 VC:2 383 page 1: DLE:RLE RLE:RLE VLE:DELTA_LENGTH_BYTE_ARRAY ST:[no stats for this column] SZ:9 VC:1 384 385 contacts.name TV=3 RL=1 DL=1 386 ---------------------------------------------------------------------------- 387 page 0: DLE:RLE RLE:RLE VLE:DELTA_LENGTH_BYTE_ARRAY ST:[no stats for this column] SZ:65 VC:2 388 page 1: DLE:RLE RLE:RLE VLE:DELTA_LENGTH_BYTE_ARRAY ST:[no stats for this column] SZ:9 VC:1 389 390 contacts.phoneNumber TV=3 RL=1 DL=2 391 ---------------------------------------------------------------------------- 392 page 0: DLE:RLE RLE:RLE VLE:DELTA_LENGTH_BYTE_ARRAY ST:[no stats for this column] SZ:25 VC:2 393 page 1: DLE:RLE RLE:RLE VLE:DELTA_LENGTH_BYTE_ARRAY ST:[no stats for this column] SZ:9 VC:1 394 395 BINARY owner 396 -------------------------------------------------------------------------------- 397 *** row group 1 of 1, values 1 to 2 *** 398 value 1: R:0 D:0 V:Julien Le Dem 399 value 2: R:0 D:0 V:A. Nonymous 400 401 BINARY ownerPhoneNumbers 402 -------------------------------------------------------------------------------- 403 *** row group 1 of 1, values 1 to 3 *** 404 value 1: R:0 D:1 V:555 123 4567 405 value 2: R:1 D:1 V:555 666 1337 406 value 3: R:0 D:0 V:<null> 407 408 BINARY contacts.name 409 -------------------------------------------------------------------------------- 410 *** row group 1 of 1, values 1 to 3 *** 411 value 1: R:0 D:1 V:Dmitriy Ryaboy 412 value 2: R:1 D:1 V:Chris Aniszczyk 413 value 3: R:0 D:0 V:<null> 414 415 BINARY contacts.phoneNumber 416 -------------------------------------------------------------------------------- 417 *** row group 1 of 1, values 1 to 3 *** 418 value 1: R:0 D:2 V:555 987 6543 419 value 2: R:1 D:1 V:<null> 420 value 3: R:0 D:0 V:<null> 421 `, 422 }, 423 424 { 425 scenario: "omit `-` fields", 426 version: v1, 427 rows: []interface{}{ 428 &event{Name: "customer1", Type: "request", Value: 42.0}, 429 &event{Name: "customer2", Type: "access", Value: 1.0}, 430 }, 431 dump: `row group 0 432 -------------------------------------------------------------------------------- 433 name: BINARY UNCOMPRESSED DO:4 FPO:49 SZ:73/73/1.00 VC:2 ENC:PLAIN,RLE_DICTIONARY ST:[no stats for this column] 434 value: DOUBLE UNCOMPRESSED DO:0 FPO:77 SZ:39/39/1.00 VC:2 ENC:PLAIN ST:[no stats for this column] 435 436 name TV=2 RL=0 DL=0 DS: 2 DE:PLAIN 437 ---------------------------------------------------------------------------- 438 page 0: DLE:RLE RLE:RLE VLE:RLE_DICTIONARY ST:[no stats for this column] CRC:[verified] SZ:5 VC:2 439 440 value TV=2 RL=0 DL=0 441 ---------------------------------------------------------------------------- 442 page 0: DLE:RLE RLE:RLE VLE:PLAIN ST:[no stats for this column] CRC:[verified] SZ:16 VC:2 443 444 BINARY name 445 -------------------------------------------------------------------------------- 446 *** row group 1 of 1, values 1 to 2 *** 447 value 1: R:0 D:0 V:customer1 448 value 2: R:0 D:0 V:customer2 449 450 DOUBLE value 451 -------------------------------------------------------------------------------- 452 *** row group 1 of 1, values 1 to 2 *** 453 value 1: R:0 D:0 V:42.0 454 value 2: R:0 D:0 V:1.0 455 `, 456 }, 457 } 458 459 func TestWriter(t *testing.T) { 460 if !hasParquetTools() { 461 t.Skip("parquet-tools are not installed") 462 } 463 464 for _, test := range writerTests { 465 dataPageVersion := test.version 466 codec := test.codec 467 rows := test.rows 468 dump := test.dump 469 470 t.Run(test.scenario, func(t *testing.T) { 471 t.Parallel() 472 473 b, err := generateParquetFile(makeRows(rows), 474 parquet.DataPageVersion(dataPageVersion), 475 parquet.Compression(codec), 476 ) 477 if err != nil { 478 t.Logf("\n%s", string(b)) 479 t.Fatal(err) 480 } 481 482 if string(b) != dump { 483 edits := myers.ComputeEdits(span.URIFromPath("want.txt"), dump, string(b)) 484 diff := fmt.Sprint(gotextdiff.ToUnified("want.txt", "got.txt", dump, edits)) 485 t.Errorf("\n%s", diff) 486 } 487 }) 488 } 489 } 490 491 func hasParquetTools() bool { 492 _, err := exec.LookPath("parquet-tools") 493 return err == nil 494 } 495 496 func parquetTools(cmd, path string) ([]byte, error) { 497 p := exec.Command("parquet-tools", cmd, "--debug", "--disable-crop", path) 498 499 output, err := p.CombinedOutput() 500 if err != nil { 501 return output, err 502 } 503 504 // parquet-tools has trailing spaces on some lines 505 lines := bytes.Split(output, []byte("\n")) 506 507 for i, line := range lines { 508 lines[i] = bytes.TrimRight(line, " ") 509 } 510 511 return bytes.Join(lines, []byte("\n")), nil 512 } 513 514 func TestWriterGenerateBloomFilters(t *testing.T) { 515 type Person struct { 516 FirstName utf8string `parquet:"first_name"` 517 LastName utf8string `parquet:"last_name"` 518 } 519 520 err := quickCheck(func(rows []Person) bool { 521 if len(rows) == 0 { // TODO: support writing files with no rows 522 return true 523 } 524 525 buffer := new(bytes.Buffer) 526 writer := parquet.NewWriter(buffer, 527 parquet.BloomFilters( 528 parquet.SplitBlockFilter("last_name"), 529 ), 530 ) 531 for i := range rows { 532 if err := writer.Write(&rows[i]); err != nil { 533 t.Error(err) 534 return false 535 } 536 } 537 if err := writer.Close(); err != nil { 538 t.Error(err) 539 return false 540 } 541 542 reader := bytes.NewReader(buffer.Bytes()) 543 f, err := parquet.OpenFile(reader, reader.Size()) 544 if err != nil { 545 t.Error(err) 546 return false 547 } 548 rowGroup := f.RowGroups()[0] 549 columns := rowGroup.ColumnChunks() 550 firstName := columns[0] 551 lastName := columns[1] 552 553 if firstName.BloomFilter() != nil { 554 t.Errorf(`"first_name" column has a bloom filter even though none were configured`) 555 return false 556 } 557 558 bloomFilter := lastName.BloomFilter() 559 if bloomFilter == nil { 560 t.Error(`"last_name" column has no bloom filter despite being configured to have one`) 561 return false 562 } 563 564 for i, row := range rows { 565 if ok, err := bloomFilter.Check(parquet.ValueOf(row.LastName)); err != nil { 566 t.Errorf("unexpected error checking bloom filter: %v", err) 567 return false 568 } else if !ok { 569 t.Errorf("bloom filter does not contain value %q of row %d", row.LastName, i) 570 return false 571 } 572 } 573 574 return true 575 }) 576 if err != nil { 577 t.Error(err) 578 } 579 } 580 581 func TestBloomFilterForDict(t *testing.T) { 582 type testStruct struct { 583 A string `parquet:"a,dict"` 584 } 585 586 schema := parquet.SchemaOf(&testStruct{}) 587 588 b := bytes.NewBuffer(nil) 589 w := parquet.NewWriter( 590 b, 591 schema, 592 parquet.BloomFilters(parquet.SplitBlockFilter("a")), 593 ) 594 595 err := w.Write(&testStruct{A: "test"}) 596 if err != nil { 597 t.Fatal(err) 598 } 599 600 err = w.Close() 601 if err != nil { 602 t.Fatal(err) 603 } 604 605 f, err := parquet.OpenFile(bytes.NewReader(b.Bytes()), int64(b.Len())) 606 if err != nil { 607 t.Fatal(err) 608 } 609 610 ok, err := f.RowGroups()[0].ColumnChunks()[0].BloomFilter().Check(parquet.ValueOf("test")) 611 if err != nil { 612 t.Fatal(err) 613 } 614 if !ok { 615 t.Error("bloom filter should have contained 'test'") 616 } 617 } 618 619 func TestWriterRepeatedUUIDDict(t *testing.T) { 620 inputID := uuid.MustParse("123456ab-0000-0000-0000-000000000000") 621 records := []struct { 622 List []uuid.UUID `parquet:"list,dict"` 623 }{{ 624 []uuid.UUID{inputID}, 625 }} 626 schema := parquet.SchemaOf(&records[0]) 627 b := bytes.NewBuffer(nil) 628 w := parquet.NewWriter(b, schema) 629 if err := w.Write(records[0]); err != nil { 630 t.Fatal(err) 631 } 632 if err := w.Close(); err != nil { 633 t.Fatal(err) 634 } 635 636 f, err := parquet.OpenFile(bytes.NewReader(b.Bytes()), int64(b.Len())) 637 if err != nil { 638 t.Fatal(err) 639 } 640 641 rowbuf := make([]parquet.Row, 1) 642 rows := f.RowGroups()[0].Rows() 643 defer rows.Close() 644 n, err := rows.ReadRows(rowbuf) 645 if n == 0 { 646 t.Fatalf("reading row from parquet file: %v", err) 647 } 648 if len(rowbuf[0]) != 1 { 649 t.Errorf("expected 1 value in row, got %d", len(rowbuf[0])) 650 } 651 if !bytes.Equal(inputID[:], rowbuf[0][0].Bytes()) { 652 t.Errorf("expected to get UUID %q back out, got %q", inputID, rowbuf[0][0].Bytes()) 653 } 654 }