github.com/segmentio/parquet-go@v0.0.0-20230712180008-5d42db8f0d47/writer_test.go (about) 1 package parquet_test 2 3 import ( 4 "bytes" 5 "fmt" 6 "os" 7 "os/exec" 8 "strings" 9 "testing" 10 11 "github.com/google/uuid" 12 "github.com/hexops/gotextdiff" 13 "github.com/hexops/gotextdiff/myers" 14 "github.com/hexops/gotextdiff/span" 15 16 "github.com/segmentio/parquet-go" 17 "github.com/segmentio/parquet-go/compress" 18 ) 19 20 const ( 21 v1 = 1 22 v2 = 2 23 ) 24 25 func scanParquetFile(f *os.File) error { 26 s, err := f.Stat() 27 if err != nil { 28 return err 29 } 30 31 p, err := parquet.OpenFile(f, s.Size()) 32 if err != nil { 33 return err 34 } 35 36 return scanParquetValues(p.Root()) 37 } 38 39 func scanParquetValues(col *parquet.Column) error { 40 return forEachColumnValue(col, func(leaf *parquet.Column, value parquet.Value) error { 41 fmt.Printf("%s > %+v\n", strings.Join(leaf.Path(), "."), value) 42 return nil 43 }) 44 } 45 46 func generateParquetFile(rows rows, options ...parquet.WriterOption) ([]byte, error) { 47 tmp, err := os.CreateTemp("/tmp", "*.parquet") 48 if err != nil { 49 return nil, err 50 } 51 defer tmp.Close() 52 path := tmp.Name() 53 defer os.Remove(path) 54 // fmt.Println(path) 55 56 writerOptions := []parquet.WriterOption{parquet.PageBufferSize(20)} 57 writerOptions = append(writerOptions, options...) 58 59 if err := writeParquetFile(tmp, rows, writerOptions...); err != nil { 60 return nil, err 61 } 62 63 if err := scanParquetFile(tmp); err != nil { 64 return nil, err 65 } 66 67 return parquetTools("dump", path) 68 } 69 70 type firstAndLastName struct { 71 FirstName string `parquet:"first_name,dict,zstd"` 72 LastName string `parquet:"last_name,delta,zstd"` 73 } 74 75 type timeseries struct { 76 Name string `parquet:"name,dict"` 77 Timestamp int64 `parquet:"timestamp,delta"` 78 Value float64 `parquet:"value"` 79 } 80 81 type event struct { 82 Name string `parquet:"name,dict"` 83 Type string `parquet:"-"` 84 Value float64 `parquet:"value"` 85 Category string `parquet:"-"` 86 } 87 88 var writerTests = []struct { 89 scenario string 90 version int 91 codec compress.Codec 92 rows []interface{} 93 dump string 94 }{ 95 { 96 scenario: "page v1 with dictionary encoding", 97 version: v1, 98 rows: []interface{}{ 99 &firstAndLastName{FirstName: "Han", LastName: "Solo"}, 100 &firstAndLastName{FirstName: "Leia", LastName: "Skywalker"}, 101 &firstAndLastName{FirstName: "Luke", LastName: "Skywalker"}, 102 }, 103 dump: `row group 0 104 -------------------------------------------------------------------------------- 105 first_name: BINARY ZSTD DO:4 FPO:55 SZ:90/72/0.80 VC:3 ENC:RLE_DICTIONARY,PLAIN ST:[min: Han, max: Luke, num_nulls not defined] 106 last_name: BINARY ZSTD DO:0 FPO:94 SZ:127/121/0.95 VC:3 ENC:DELTA_BYTE_ARRAY ST:[min: Skywalker, max: Solo, num_nulls not defined] 107 108 first_name TV=3 RL=0 DL=0 DS: 3 DE:PLAIN 109 ---------------------------------------------------------------------------- 110 page 0: DLE:RLE RLE:RLE VLE:RLE_DICTIONARY ST:[no stats for this column] CRC:[PAGE CORRUPT] SZ:7 VC:3 111 112 last_name TV=3 RL=0 DL=0 113 ---------------------------------------------------------------------------- 114 page 0: DLE:RLE RLE:RLE VLE:DELTA_BYTE_ARRAY ST:[no stats for this column] CRC:[PAGE CORRUPT] SZ:56 VC:2 115 page 1: DLE:RLE RLE:RLE VLE:DELTA_BYTE_ARRAY ST:[no stats for this column] CRC:[PAGE CORRUPT] SZ:19 VC:1 116 117 BINARY first_name 118 -------------------------------------------------------------------------------- 119 *** row group 1 of 1, values 1 to 3 *** 120 value 1: R:0 D:0 V:Han 121 value 2: R:0 D:0 V:Leia 122 value 3: R:0 D:0 V:Luke 123 124 BINARY last_name 125 -------------------------------------------------------------------------------- 126 *** row group 1 of 1, values 1 to 3 *** 127 value 1: R:0 D:0 V:Solo 128 value 2: R:0 D:0 V:Skywalker 129 value 3: R:0 D:0 V:Skywalker 130 `, 131 }, 132 133 { // same as the previous test but uses page v2 where data pages aren't compressed 134 scenario: "page v2 with dictionary encoding", 135 version: v2, 136 rows: []interface{}{ 137 &firstAndLastName{FirstName: "Han", LastName: "Solo"}, 138 &firstAndLastName{FirstName: "Leia", LastName: "Skywalker"}, 139 &firstAndLastName{FirstName: "Luke", LastName: "Skywalker"}, 140 }, 141 dump: `row group 0 142 -------------------------------------------------------------------------------- 143 first_name: BINARY ZSTD DO:4 FPO:55 SZ:86/77/0.90 VC:3 ENC:PLAIN,RLE_DICTIONARY ST:[min: Han, max: Luke, num_nulls not defined] 144 last_name: BINARY ZSTD DO:0 FPO:90 SZ:137/131/0.96 VC:3 ENC:DELTA_BYTE_ARRAY ST:[min: Skywalker, max: Solo, num_nulls not defined] 145 146 first_name TV=3 RL=0 DL=0 DS: 3 DE:PLAIN 147 ---------------------------------------------------------------------------- 148 page 0: DLE:RLE RLE:RLE VLE:RLE_DICTIONARY ST:[no stats for this column] SZ:7 VC:3 149 150 last_name TV=3 RL=0 DL=0 151 ---------------------------------------------------------------------------- 152 page 0: DLE:RLE RLE:RLE VLE:DELTA_BYTE_ARRAY ST:[no stats for this column] SZ:56 VC:2 153 page 1: DLE:RLE RLE:RLE VLE:DELTA_BYTE_ARRAY ST:[no stats for this column] SZ:19 VC:1 154 155 BINARY first_name 156 -------------------------------------------------------------------------------- 157 *** row group 1 of 1, values 1 to 3 *** 158 value 1: R:0 D:0 V:Han 159 value 2: R:0 D:0 V:Leia 160 value 3: R:0 D:0 V:Luke 161 162 BINARY last_name 163 -------------------------------------------------------------------------------- 164 *** row group 1 of 1, values 1 to 3 *** 165 value 1: R:0 D:0 V:Solo 166 value 2: R:0 D:0 V:Skywalker 167 value 3: R:0 D:0 V:Skywalker 168 `, 169 }, 170 171 { 172 scenario: "timeseries with delta encoding", 173 version: v2, 174 codec: &parquet.Gzip, 175 rows: []interface{}{ 176 timeseries{Name: "http_request_total", Timestamp: 1639444033, Value: 100}, 177 timeseries{Name: "http_request_total", Timestamp: 1639444058, Value: 0}, 178 timeseries{Name: "http_request_total", Timestamp: 1639444085, Value: 42}, 179 timeseries{Name: "http_request_total", Timestamp: 1639444093, Value: 1}, 180 timeseries{Name: "http_request_total", Timestamp: 1639444101, Value: 2}, 181 timeseries{Name: "http_request_total", Timestamp: 1639444108, Value: 5}, 182 timeseries{Name: "http_request_total", Timestamp: 1639444133, Value: 4}, 183 timeseries{Name: "http_request_total", Timestamp: 1639444137, Value: 5}, 184 timeseries{Name: "http_request_total", Timestamp: 1639444141, Value: 6}, 185 timeseries{Name: "http_request_total", Timestamp: 1639444144, Value: 10}, 186 }, 187 dump: `row group 0 188 -------------------------------------------------------------------------------- 189 name: BINARY GZIP DO:4 FPO:70 SZ:126/101/0.80 VC:10 ENC:PLAIN,RLE_DICTIONARY ST:[min: http_request_total, max: http_request_total, num_nulls not defined] 190 timestamp: INT64 GZIP DO:0 FPO:130 SZ:299/550/1.84 VC:10 ENC:DELTA_BINARY_PACKED ST:[min: 1639444033, max: 1639444144, num_nulls not defined] 191 value: DOUBLE GZIP DO:0 FPO:429 SZ:292/192/0.66 VC:10 ENC:PLAIN ST:[min: -0.0, max: 100.0, num_nulls not defined] 192 193 name TV=10 RL=0 DL=0 DS: 1 DE:PLAIN 194 ---------------------------------------------------------------------------- 195 page 0: DLE:RLE RLE:RLE VLE:RLE_DICTIONARY ST:[no stats for this column] SZ:2 VC:5 196 page 1: DLE:RLE RLE:RLE VLE:RLE_DICTIONARY ST:[no stats for this column] SZ:2 VC:5 197 198 timestamp TV=10 RL=0 DL=0 199 ---------------------------------------------------------------------------- 200 page 0: DLE:RLE RLE:RLE VLE:DELTA_BINARY_PACKED ST:[no stats for this column] SZ:142 VC:3 201 page 1: DLE:RLE RLE:RLE VLE:DELTA_BINARY_PACKED ST:[no stats for this column] SZ:142 VC:3 202 page 2: DLE:RLE RLE:RLE VLE:DELTA_BINARY_PACKED ST:[no stats for this column] SZ:142 VC:3 203 page 3: DLE:RLE RLE:RLE VLE:DELTA_BINARY_PACKED ST:[no stats for this column] SZ:9 VC:1 204 205 value TV=10 RL=0 DL=0 206 ---------------------------------------------------------------------------- 207 page 0: DLE:RLE RLE:RLE VLE:PLAIN ST:[no stats for this column] SZ:24 VC:3 208 page 1: DLE:RLE RLE:RLE VLE:PLAIN ST:[no stats for this column] SZ:24 VC:3 209 page 2: DLE:RLE RLE:RLE VLE:PLAIN ST:[no stats for this column] SZ:24 VC:3 210 page 3: DLE:RLE RLE:RLE VLE:PLAIN ST:[no stats for this column] SZ:8 VC:1 211 212 BINARY name 213 -------------------------------------------------------------------------------- 214 *** row group 1 of 1, values 1 to 10 *** 215 value 1: R:0 D:0 V:http_request_total 216 value 2: R:0 D:0 V:http_request_total 217 value 3: R:0 D:0 V:http_request_total 218 value 4: R:0 D:0 V:http_request_total 219 value 5: R:0 D:0 V:http_request_total 220 value 6: R:0 D:0 V:http_request_total 221 value 7: R:0 D:0 V:http_request_total 222 value 8: R:0 D:0 V:http_request_total 223 value 9: R:0 D:0 V:http_request_total 224 value 10: R:0 D:0 V:http_request_total 225 226 INT64 timestamp 227 -------------------------------------------------------------------------------- 228 *** row group 1 of 1, values 1 to 10 *** 229 value 1: R:0 D:0 V:1639444033 230 value 2: R:0 D:0 V:1639444058 231 value 3: R:0 D:0 V:1639444085 232 value 4: R:0 D:0 V:1639444093 233 value 5: R:0 D:0 V:1639444101 234 value 6: R:0 D:0 V:1639444108 235 value 7: R:0 D:0 V:1639444133 236 value 8: R:0 D:0 V:1639444137 237 value 9: R:0 D:0 V:1639444141 238 value 10: R:0 D:0 V:1639444144 239 240 DOUBLE value 241 -------------------------------------------------------------------------------- 242 *** row group 1 of 1, values 1 to 10 *** 243 value 1: R:0 D:0 V:100.0 244 value 2: R:0 D:0 V:0.0 245 value 3: R:0 D:0 V:42.0 246 value 4: R:0 D:0 V:1.0 247 value 5: R:0 D:0 V:2.0 248 value 6: R:0 D:0 V:5.0 249 value 7: R:0 D:0 V:4.0 250 value 8: R:0 D:0 V:5.0 251 value 9: R:0 D:0 V:6.0 252 value 10: R:0 D:0 V:10.0 253 `, 254 }, 255 256 { 257 scenario: "example from the twitter blog (v1)", 258 version: v1, 259 rows: []interface{}{ 260 AddressBook{ 261 Owner: "Julien Le Dem", 262 OwnerPhoneNumbers: []string{ 263 "555 123 4567", 264 "555 666 1337", 265 }, 266 Contacts: []Contact{ 267 { 268 Name: "Dmitriy Ryaboy", 269 PhoneNumber: "555 987 6543", 270 }, 271 { 272 Name: "Chris Aniszczyk", 273 }, 274 }, 275 }, 276 AddressBook{ 277 Owner: "A. Nonymous", 278 OwnerPhoneNumbers: nil, 279 }, 280 }, 281 282 dump: `row group 0 283 -------------------------------------------------------------------------------- 284 owner: BINARY ZSTD DO:0 FPO:4 SZ:81/73/0.90 VC:2 ENC:DELTA_LENGTH_BYTE_ARRAY ST:[min: A. Nonymous, max: Julien Le Dem, num_nulls not defined] 285 ownerPhoneNumbers: BINARY GZIP DO:0 FPO:85 SZ:179/129/0.72 VC:3 ENC:RLE,DELTA_LENGTH_BYTE_ARRAY ST:[min: 555 123 4567, max: 555 666 1337, num_nulls: 1] 286 contacts: 287 .name: BINARY UNCOMPRESSED DO:0 FPO:264 SZ:138/138/1.00 VC:3 ENC:RLE,DELTA_LENGTH_BYTE_ARRAY ST:[min: Chris Aniszczyk, max: Dmitriy Ryaboy, num_nulls: 1] 288 .phoneNumber: BINARY ZSTD DO:0 FPO:402 SZ:113/95/0.84 VC:3 ENC:RLE,DELTA_LENGTH_BYTE_ARRAY ST:[min: 555 987 6543, max: 555 987 6543, num_nulls: 2] 289 290 owner TV=2 RL=0 DL=0 291 ---------------------------------------------------------------------------- 292 page 0: DLE:RLE RLE:RLE VLE:DELTA_LENGTH_BYTE_ARRAY ST:[no stats for this column] CRC:[PAGE CORRUPT] SZ:50 VC:2 293 294 ownerPhoneNumbers TV=3 RL=1 DL=1 295 ---------------------------------------------------------------------------- 296 page 0: DLE:RLE RLE:RLE VLE:DELTA_LENGTH_BYTE_ARRAY ST:[no stats for this column] CRC:[PAGE CORRUPT] SZ:64 VC:2 297 page 1: DLE:RLE RLE:RLE VLE:DELTA_LENGTH_BYTE_ARRAY ST:[no stats for this column] CRC:[PAGE CORRUPT] SZ:17 VC:1 298 299 contacts.name TV=3 RL=1 DL=1 300 ---------------------------------------------------------------------------- 301 page 0: DLE:RLE RLE:RLE VLE:DELTA_LENGTH_BYTE_ARRAY ST:[no stats for this column] CRC:[verified] SZ:73 VC:2 302 page 1: DLE:RLE RLE:RLE VLE:DELTA_LENGTH_BYTE_ARRAY ST:[no stats for this column] CRC:[verified] SZ:17 VC:1 303 304 contacts.phoneNumber TV=3 RL=1 DL=2 305 ---------------------------------------------------------------------------- 306 page 0: DLE:RLE RLE:RLE VLE:DELTA_LENGTH_BYTE_ARRAY ST:[no stats for this column] CRC:[PAGE CORRUPT] SZ:33 VC:2 307 page 1: DLE:RLE RLE:RLE VLE:DELTA_LENGTH_BYTE_ARRAY ST:[no stats for this column] CRC:[PAGE CORRUPT] SZ:17 VC:1 308 309 BINARY owner 310 -------------------------------------------------------------------------------- 311 *** row group 1 of 1, values 1 to 2 *** 312 value 1: R:0 D:0 V:Julien Le Dem 313 value 2: R:0 D:0 V:A. Nonymous 314 315 BINARY ownerPhoneNumbers 316 -------------------------------------------------------------------------------- 317 *** row group 1 of 1, values 1 to 3 *** 318 value 1: R:0 D:1 V:555 123 4567 319 value 2: R:1 D:1 V:555 666 1337 320 value 3: R:0 D:0 V:<null> 321 322 BINARY contacts.name 323 -------------------------------------------------------------------------------- 324 *** row group 1 of 1, values 1 to 3 *** 325 value 1: R:0 D:1 V:Dmitriy Ryaboy 326 value 2: R:1 D:1 V:Chris Aniszczyk 327 value 3: R:0 D:0 V:<null> 328 329 BINARY contacts.phoneNumber 330 -------------------------------------------------------------------------------- 331 *** row group 1 of 1, values 1 to 3 *** 332 value 1: R:0 D:2 V:555 987 6543 333 value 2: R:1 D:1 V:<null> 334 value 3: R:0 D:0 V:<null> 335 `, 336 }, 337 338 { 339 scenario: "example from the twitter blog (v2)", 340 version: v2, 341 rows: []interface{}{ 342 AddressBook{ 343 Owner: "Julien Le Dem", 344 OwnerPhoneNumbers: []string{ 345 "555 123 4567", 346 "555 666 1337", 347 }, 348 Contacts: []Contact{ 349 { 350 Name: "Dmitriy Ryaboy", 351 PhoneNumber: "555 987 6543", 352 }, 353 { 354 Name: "Chris Aniszczyk", 355 }, 356 }, 357 }, 358 AddressBook{ 359 Owner: "A. Nonymous", 360 OwnerPhoneNumbers: nil, 361 }, 362 }, 363 364 dump: `row group 0 365 -------------------------------------------------------------------------------- 366 owner: BINARY ZSTD DO:0 FPO:4 SZ:86/78/0.91 VC:2 ENC:DELTA_LENGTH_BYTE_ARRAY ST:[min: A. Nonymous, max: Julien Le Dem, num_nulls not defined] 367 ownerPhoneNumbers: BINARY GZIP DO:0 FPO:90 SZ:172/122/0.71 VC:3 ENC:RLE,DELTA_LENGTH_BYTE_ARRAY ST:[min: 555 123 4567, max: 555 666 1337, num_nulls: 1] 368 contacts: 369 .name: BINARY UNCOMPRESSED DO:0 FPO:262 SZ:132/132/1.00 VC:3 ENC:RLE,DELTA_LENGTH_BYTE_ARRAY ST:[min: Chris Aniszczyk, max: Dmitriy Ryaboy, num_nulls: 1] 370 .phoneNumber: BINARY ZSTD DO:0 FPO:394 SZ:108/90/0.83 VC:3 ENC:RLE,DELTA_LENGTH_BYTE_ARRAY ST:[min: 555 987 6543, max: 555 987 6543, num_nulls: 2] 371 372 owner TV=2 RL=0 DL=0 373 ---------------------------------------------------------------------------- 374 page 0: DLE:RLE RLE:RLE VLE:DELTA_LENGTH_BYTE_ARRAY ST:[no stats for this column] SZ:50 VC:2 375 376 ownerPhoneNumbers TV=3 RL=1 DL=1 377 ---------------------------------------------------------------------------- 378 page 0: DLE:RLE RLE:RLE VLE:DELTA_LENGTH_BYTE_ARRAY ST:[no stats for this column] SZ:56 VC:2 379 page 1: DLE:RLE RLE:RLE VLE:DELTA_LENGTH_BYTE_ARRAY ST:[no stats for this column] SZ:9 VC:1 380 381 contacts.name TV=3 RL=1 DL=1 382 ---------------------------------------------------------------------------- 383 page 0: DLE:RLE RLE:RLE VLE:DELTA_LENGTH_BYTE_ARRAY ST:[no stats for this column] SZ:65 VC:2 384 page 1: DLE:RLE RLE:RLE VLE:DELTA_LENGTH_BYTE_ARRAY ST:[no stats for this column] SZ:9 VC:1 385 386 contacts.phoneNumber TV=3 RL=1 DL=2 387 ---------------------------------------------------------------------------- 388 page 0: DLE:RLE RLE:RLE VLE:DELTA_LENGTH_BYTE_ARRAY ST:[no stats for this column] SZ:25 VC:2 389 page 1: DLE:RLE RLE:RLE VLE:DELTA_LENGTH_BYTE_ARRAY ST:[no stats for this column] SZ:9 VC:1 390 391 BINARY owner 392 -------------------------------------------------------------------------------- 393 *** row group 1 of 1, values 1 to 2 *** 394 value 1: R:0 D:0 V:Julien Le Dem 395 value 2: R:0 D:0 V:A. Nonymous 396 397 BINARY ownerPhoneNumbers 398 -------------------------------------------------------------------------------- 399 *** row group 1 of 1, values 1 to 3 *** 400 value 1: R:0 D:1 V:555 123 4567 401 value 2: R:1 D:1 V:555 666 1337 402 value 3: R:0 D:0 V:<null> 403 404 BINARY contacts.name 405 -------------------------------------------------------------------------------- 406 *** row group 1 of 1, values 1 to 3 *** 407 value 1: R:0 D:1 V:Dmitriy Ryaboy 408 value 2: R:1 D:1 V:Chris Aniszczyk 409 value 3: R:0 D:0 V:<null> 410 411 BINARY contacts.phoneNumber 412 -------------------------------------------------------------------------------- 413 *** row group 1 of 1, values 1 to 3 *** 414 value 1: R:0 D:2 V:555 987 6543 415 value 2: R:1 D:1 V:<null> 416 value 3: R:0 D:0 V:<null> 417 `, 418 }, 419 420 { 421 scenario: "omit `-` fields", 422 version: v1, 423 rows: []interface{}{ 424 &event{Name: "customer1", Type: "request", Value: 42.0}, 425 &event{Name: "customer2", Type: "access", Value: 1.0}, 426 }, 427 dump: `row group 0 428 -------------------------------------------------------------------------------- 429 name: BINARY UNCOMPRESSED DO:4 FPO:49 SZ:73/73/1.00 VC:2 ENC:RLE_DICTIONARY,PLAIN ST:[min: customer1, max: customer2, num_nulls not defined] 430 value: DOUBLE UNCOMPRESSED DO:0 FPO:77 SZ:39/39/1.00 VC:2 ENC:PLAIN ST:[min: 1.0, max: 42.0, num_nulls not defined] 431 432 name TV=2 RL=0 DL=0 DS: 2 DE:PLAIN 433 ---------------------------------------------------------------------------- 434 page 0: DLE:RLE RLE:RLE VLE:RLE_DICTIONARY ST:[no stats for this column] CRC:[verified] SZ:5 VC:2 435 436 value TV=2 RL=0 DL=0 437 ---------------------------------------------------------------------------- 438 page 0: DLE:RLE RLE:RLE VLE:PLAIN ST:[no stats for this column] CRC:[verified] SZ:16 VC:2 439 440 BINARY name 441 -------------------------------------------------------------------------------- 442 *** row group 1 of 1, values 1 to 2 *** 443 value 1: R:0 D:0 V:customer1 444 value 2: R:0 D:0 V:customer2 445 446 DOUBLE value 447 -------------------------------------------------------------------------------- 448 *** row group 1 of 1, values 1 to 2 *** 449 value 1: R:0 D:0 V:42.0 450 value 2: R:0 D:0 V:1.0 451 `, 452 }, 453 } 454 455 func TestWriter(t *testing.T) { 456 if !hasParquetTools() { 457 t.Skip("Skipping TestWriter writerTests because parquet-tools are not installed in Github CI. FIXME.") // TODO 458 } 459 460 for _, test := range writerTests { 461 dataPageVersion := test.version 462 codec := test.codec 463 rows := test.rows 464 dump := test.dump 465 466 t.Run(test.scenario, func(t *testing.T) { 467 t.Parallel() 468 469 b, err := generateParquetFile(makeRows(rows), 470 parquet.DataPageVersion(dataPageVersion), 471 parquet.Compression(codec), 472 ) 473 if err != nil { 474 t.Logf("\n%s", string(b)) 475 t.Fatal(err) 476 } 477 478 if string(b) != dump { 479 edits := myers.ComputeEdits(span.URIFromPath("want.txt"), dump, string(b)) 480 diff := fmt.Sprint(gotextdiff.ToUnified("want.txt", "got.txt", dump, edits)) 481 t.Errorf("\n%s", diff) 482 } 483 }) 484 } 485 } 486 487 func hasParquetTools() bool { 488 _, err := exec.LookPath("parquet-tools") 489 return err == nil 490 } 491 492 func parquetTools(cmd, path string) ([]byte, error) { 493 p := exec.Command("parquet-tools", cmd, "--debug", "--disable-crop", path) 494 495 output, err := p.CombinedOutput() 496 if err != nil { 497 return output, err 498 } 499 500 // parquet-tools has trailing spaces on some lines 501 lines := bytes.Split(output, []byte("\n")) 502 503 for i, line := range lines { 504 lines[i] = bytes.TrimRight(line, " ") 505 } 506 507 return bytes.Join(lines, []byte("\n")), nil 508 } 509 510 func TestWriterGenerateBloomFilters(t *testing.T) { 511 type Person struct { 512 FirstName utf8string `parquet:"first_name"` 513 LastName utf8string `parquet:"last_name"` 514 } 515 516 err := quickCheck(func(rows []Person) bool { 517 if len(rows) == 0 { // TODO: support writing files with no rows 518 return true 519 } 520 521 buffer := new(bytes.Buffer) 522 writer := parquet.NewWriter(buffer, 523 parquet.BloomFilters( 524 parquet.SplitBlockFilter(10, "last_name"), 525 ), 526 ) 527 for i := range rows { 528 if err := writer.Write(&rows[i]); err != nil { 529 t.Error(err) 530 return false 531 } 532 } 533 if err := writer.Close(); err != nil { 534 t.Error(err) 535 return false 536 } 537 538 reader := bytes.NewReader(buffer.Bytes()) 539 f, err := parquet.OpenFile(reader, reader.Size()) 540 if err != nil { 541 t.Error(err) 542 return false 543 } 544 rowGroup := f.RowGroups()[0] 545 columns := rowGroup.ColumnChunks() 546 firstName := columns[0] 547 lastName := columns[1] 548 549 if firstName.BloomFilter() != nil { 550 t.Errorf(`"first_name" column has a bloom filter even though none were configured`) 551 return false 552 } 553 554 bloomFilter := lastName.BloomFilter() 555 if bloomFilter == nil { 556 t.Error(`"last_name" column has no bloom filter despite being configured to have one`) 557 return false 558 } 559 560 for i, row := range rows { 561 if ok, err := bloomFilter.Check(parquet.ValueOf(row.LastName)); err != nil { 562 t.Errorf("unexpected error checking bloom filter: %v", err) 563 return false 564 } else if !ok { 565 t.Errorf("bloom filter does not contain value %q of row %d", row.LastName, i) 566 return false 567 } 568 } 569 570 return true 571 }) 572 if err != nil { 573 t.Error(err) 574 } 575 } 576 577 func TestBloomFilterForDict(t *testing.T) { 578 type testStruct struct { 579 A string `parquet:"a,dict"` 580 } 581 582 schema := parquet.SchemaOf(&testStruct{}) 583 584 b := bytes.NewBuffer(nil) 585 w := parquet.NewWriter( 586 b, 587 schema, 588 parquet.BloomFilters(parquet.SplitBlockFilter(10, "a")), 589 ) 590 591 err := w.Write(&testStruct{A: "test"}) 592 if err != nil { 593 t.Fatal(err) 594 } 595 596 err = w.Close() 597 if err != nil { 598 t.Fatal(err) 599 } 600 601 f, err := parquet.OpenFile(bytes.NewReader(b.Bytes()), int64(b.Len())) 602 if err != nil { 603 t.Fatal(err) 604 } 605 606 ok, err := f.RowGroups()[0].ColumnChunks()[0].BloomFilter().Check(parquet.ValueOf("test")) 607 if err != nil { 608 t.Fatal(err) 609 } 610 if !ok { 611 t.Error("bloom filter should have contained 'test'") 612 } 613 } 614 615 func TestWriterRepeatedUUIDDict(t *testing.T) { 616 inputID := uuid.MustParse("123456ab-0000-0000-0000-000000000000") 617 records := []struct { 618 List []uuid.UUID `parquet:"list,dict"` 619 }{{ 620 []uuid.UUID{inputID}, 621 }} 622 schema := parquet.SchemaOf(&records[0]) 623 b := bytes.NewBuffer(nil) 624 w := parquet.NewWriter(b, schema) 625 if err := w.Write(records[0]); err != nil { 626 t.Fatal(err) 627 } 628 if err := w.Close(); err != nil { 629 t.Fatal(err) 630 } 631 632 f, err := parquet.OpenFile(bytes.NewReader(b.Bytes()), int64(b.Len())) 633 if err != nil { 634 t.Fatal(err) 635 } 636 637 rowbuf := make([]parquet.Row, 1) 638 rows := f.RowGroups()[0].Rows() 639 defer rows.Close() 640 n, err := rows.ReadRows(rowbuf) 641 if n == 0 { 642 t.Fatalf("reading row from parquet file: %v", err) 643 } 644 if len(rowbuf[0]) != 1 { 645 t.Errorf("expected 1 value in row, got %d", len(rowbuf[0])) 646 } 647 if !bytes.Equal(inputID[:], rowbuf[0][0].Bytes()) { 648 t.Errorf("expected to get UUID %q back out, got %q", inputID, rowbuf[0][0].Bytes()) 649 } 650 } 651 652 func TestWriterResetWithBloomFilters(t *testing.T) { 653 type Test struct { 654 Value string `parquet:"value,dict"` 655 } 656 657 writer := parquet.NewWriter(new(bytes.Buffer), 658 parquet.BloomFilters( 659 parquet.SplitBlockFilter(10, "value"), 660 ), 661 ) 662 663 if err := writer.Write(&Test{Value: "foo"}); err != nil { 664 t.Fatal(err) 665 } 666 667 if err := writer.Close(); err != nil { 668 t.Fatal(err) 669 } 670 671 writer.Reset(new(bytes.Buffer)) 672 673 if err := writer.Write(&Test{Value: "bar"}); err != nil { 674 t.Fatal(err) 675 } 676 677 if err := writer.Close(); err != nil { 678 t.Fatal(err) 679 } 680 } 681 682 func TestWriterMaxRowsPerRowGroup(t *testing.T) { 683 output := new(bytes.Buffer) 684 writer := parquet.NewWriter(output, parquet.MaxRowsPerRowGroup(10)) 685 686 for i := 0; i < 100; i++ { 687 err := writer.Write(struct{ FirstName, LastName string }{ 688 FirstName: "0123456789"[i%10 : i%10+1], 689 LastName: "foo", 690 }) 691 if err != nil { 692 t.Fatal(err) 693 } 694 } 695 696 if err := writer.Close(); err != nil { 697 t.Fatal(err) 698 } 699 700 f, err := parquet.OpenFile(bytes.NewReader(output.Bytes()), int64(output.Len())) 701 if err != nil { 702 t.Fatal(err) 703 } 704 705 rowGroups := f.RowGroups() 706 if len(rowGroups) != 10 { 707 t.Errorf("wrong number of row groups in parquet file: want=10 got=%d", len(rowGroups)) 708 } 709 } 710 711 func TestSetKeyValueMetadata(t *testing.T) { 712 testKey := "test-key" 713 testValue := "test-value" 714 715 type testStruct struct { 716 A string `parquet:"a,dict"` 717 } 718 719 schema := parquet.SchemaOf(&testStruct{}) 720 721 b := bytes.NewBuffer(nil) 722 w := parquet.NewWriter( 723 b, 724 schema, 725 ) 726 727 err := w.Write(&testStruct{A: "test"}) 728 if err != nil { 729 t.Fatal(err) 730 } 731 732 w.SetKeyValueMetadata(testKey, testValue) 733 734 err = w.Close() 735 if err != nil { 736 t.Fatal(err) 737 } 738 739 f, err := parquet.OpenFile(bytes.NewReader(b.Bytes()), int64(b.Len())) 740 if err != nil { 741 t.Fatal(err) 742 } 743 744 value, ok := f.Lookup(testKey) 745 if !ok { 746 t.Fatalf("key/value metadata should have included %q", testKey) 747 } 748 if value != testValue { 749 t.Errorf("expected %q, got %q", testValue, value) 750 } 751 } 752 753 func TestSetKeyValueMetadataOverwritesExisting(t *testing.T) { 754 testKey := "test-key" 755 testValue := "test-value" 756 757 type testStruct struct { 758 A string `parquet:"a,dict"` 759 } 760 761 schema := parquet.SchemaOf(&testStruct{}) 762 763 b := bytes.NewBuffer(nil) 764 w := parquet.NewWriter( 765 b, 766 schema, 767 parquet.KeyValueMetadata(testKey, "original-value"), 768 ) 769 770 err := w.Write(&testStruct{A: "test"}) 771 if err != nil { 772 t.Fatal(err) 773 } 774 775 w.SetKeyValueMetadata(testKey, testValue) 776 777 err = w.Close() 778 if err != nil { 779 t.Fatal(err) 780 } 781 782 f, err := parquet.OpenFile(bytes.NewReader(b.Bytes()), int64(b.Len())) 783 if err != nil { 784 t.Fatal(err) 785 } 786 787 value, ok := f.Lookup(testKey) 788 if !ok { 789 t.Fatalf("key/value metadata should have included %q", testKey) 790 } 791 if value != testValue { 792 t.Errorf("expected %q, got %q", testValue, value) 793 } 794 }