github.com/fraugster/parquet-go@v0.12.0/parquetschema/schema_parser_test.go (about) 1 package parquetschema 2 3 import ( 4 "testing" 5 6 "github.com/davecgh/go-spew/spew" 7 "github.com/fraugster/parquet-go/parquet" 8 "github.com/stretchr/testify/assert" 9 ) 10 11 func TestSchemaParser(t *testing.T) { 12 testData := []struct { 13 Msg string 14 ExpectErr bool 15 Strict bool 16 }{ 17 // 0. 18 {`message foo { }`, false, false}, 19 {`message foo {`, true, false}, // missing closing brace 20 {`message foo { required int64 bar; }`, false, false}, 21 {`message foo { repeated int64 bar; }`, false, false}, 22 {`message foo { optional int64 bar; }`, false, false}, 23 {`message foo { justwrong int64 bar; }`, true, false}, // incorrect repetition type 24 {`message foo { optional int64 bar }`, true, false}, // missing semicolon after column name 25 {`message foo { required binary the_id = 1; required binary client = 2; }`, false, false}, 26 {`message foo { optional boolean is_fraud; }`, false, false}, 27 {`message foo { 28 required binary the_id (STRING) = 1; 29 required binary client (STRING) = 2; 30 required binary request_body = 3; 31 required int64 ts = 4; 32 required group data_enriched (MAP) { 33 repeated group key_value (MAP_KEY_VALUE) { 34 required binary key = 5; 35 required binary value = 6; 36 } 37 } 38 optional boolean is_fraud = 7; 39 }`, false, false}, 40 // 10. 41 {`message $ { }`, false, false}, // unusual token 42 {`message foo { optional int128 bar; }`, true, false}, // invalid type 43 {`message foo { optional int64 bar (BLUB); }`, true, false}, // invalid logical type 44 {`message foo { optional int32 bar; }`, false, false}, 45 {`message foo { optional double bar; }`, false, false}, 46 {`message foo { optional float bar; }`, false, false}, 47 {`message foo { optional int96 bar; }`, false, false}, 48 {`message foo { 49 required group ids (LIST) { 50 repeated group list { 51 required int64 element; 52 } 53 } 54 }`, false, false}, 55 {`message foo { 56 optional group array_of_arrays (LIST) { 57 repeated group list { 58 required group element (LIST) { 59 repeated group list { 60 required int32 element; 61 } 62 } 63 } 64 } 65 }`, false, false}, 66 {`message foo { 67 optional group bar (MAP) { 68 repeated group key_value { 69 required int32 key; 70 required int32 value; 71 } 72 } 73 }`, false, false}, 74 // 20. 75 {`message foo { 76 optional group bar (LIST) { 77 repeated group list { 78 required int64 element; 79 } 80 } 81 }`, false, false}, 82 {`message foo { 83 optional group bar (LIST) { 84 repeated group element { 85 required int64 element; 86 } 87 } 88 }`, false, false}, // repeated group is called "element", not "list"; but that's valid under the backwards compatibility rules. 89 {`message foo { 90 optional group bar (LIST) { 91 repeated int64 list; 92 } 93 }`, true, false}, // repeated list is not a group. 94 {`message foo { 95 repeated group bar (LIST) { 96 repeated group list { 97 optional int64 element; 98 } 99 } 100 }`, true, false}, // bar is LIST but has repetition type repeated. 101 {`message foo { 102 optional group bar (LIST) { 103 repeated group list { 104 optional int64 element; 105 optional int64 element2; 106 } 107 } 108 }`, true, false}, // bar.list has 2 children. 109 {`message foo { 110 optional group bar (LIST) { 111 repeated group list { 112 optional int64 invalid; 113 } 114 } 115 }`, true, false}, // bar.list has 1 child, but it's called invalid, not element. 116 {`message foo { 117 optional group bar (LIST) { 118 repeated group list { 119 repeated int64 element; 120 } 121 } 122 }`, true, false}, // bar.list.element is of the wrong repetition type. 123 {`message foo { 124 optional group bar (LIST) { 125 repeated group list { 126 required int64 baz; 127 } 128 optional int64 list_size; 129 } 130 }`, true, false}, // only element underneath (LIST) allowed is repeated group list; list_size is invalid. 131 {`message foo { 132 optional group bar (MAP) { 133 repeated group key_value { 134 required int64 key; 135 optional int32 value; 136 } 137 } 138 }`, false, false}, 139 {`message foo { 140 optional group bar (MAP) { 141 repeated group stuff { 142 required int64 key; 143 optional int32 value; 144 } 145 } 146 }`, true, true}, // repeated group underneath (MAP) is not called key_value. 147 // 30. 148 {`message foo { 149 optional group bar (MAP) { 150 repeated int64 key_value; 151 } 152 }`, true, false}, // repeated key_value is not a group. 153 {`message foo { 154 optional group bar (MAP) { 155 } 156 }`, true, false}, // empty group bar. 157 {`message foo { 158 optional group bar (MAP) { 159 repeated group key_value { 160 required int64 key; 161 optional int32 value; 162 optional int32 another_value; 163 } 164 } 165 }`, true, false}, // inside key_value, only key and value are allowed. 166 {`message foo { 167 optional group bar (MAP) { 168 repeated group key_value { 169 optional int64 key; 170 optional int32 value; 171 } 172 } 173 }`, true, true}, // bar.key_value.key must be required. 174 {`message foo { 175 optional group bar (MAP) { 176 repeated group key_value { 177 required int64 key; 178 } 179 } 180 }`, true, false}, // bar.key_value.value is missing. 181 {`message foo { 182 optional group bar (MAP) { 183 repeated group key_value { 184 required int64 key; 185 optional int32 key; 186 } 187 } 188 }`, true, true}, // bar.key_value has 2 children but child value is missing. 189 {`message foo { 190 optional group bar (MAP) { 191 repeated group key_value { 192 required int64 value; 193 optional int32 value; 194 } 195 } 196 }`, true, true}, // strict: bar.key_value has 2 children but child key is missing. 197 {`message foo { 198 required int32 date (DATE); 199 }`, false, false}, 200 {`message foo { 201 required int64 date (DATE); 202 }`, true, false}, // date is annotated as DATE but data type is int64. 203 {`message foo { 204 required int64 ts (TIMESTAMP(MILLIS, true)); 205 }`, false, false}, 206 // 40. 207 {`message foo { 208 required int64 ts (TIMESTAMP(MICROS, false)); 209 }`, false, false}, 210 {`message foo { 211 required int64 ts (TIMESTAMP(NANOS, false)); 212 }`, false, false}, 213 {`message foo { 214 required int96 ts (TIMESTAMP(NANOS, false)); 215 }`, false, false}, 216 {`message foo { 217 required int32 ts (TIMESTAMP(NANOS, false)); 218 }`, true, false}, // all TIMESTAMPs must be int64. 219 {`message foo { 220 required int64 ts (TIMESTAMP(,)); 221 }`, true, false}, // invalid annotation syntax for TIMESTAMP. 222 {`message foo { 223 required int64 ts (TIMESTAMP(FOO,false)); 224 }`, true, false}, // invalid TIMESTAMP unit. 225 {`message foo { 226 required int64 ts (TIMESTAMP(MILLIS,bla)); 227 }`, true, false}, // invalid TIMESTAMP isAdjustedToUTC. 228 {`message foo { 229 required fixed_len_byte_array(16) theid (UUID); 230 }`, false, false}, 231 {`message foo { 232 required fixed_len_byte_array theid; 233 }`, true, false}, // no length provided. 234 {`message foo { 235 required fixed_len_byte_array(-1) theid; 236 }`, true, false}, // negative length. 237 {`message foo { 238 required binary group (STRING); 239 }`, false, false}, 240 // 50. 241 {`message foo { 242 required int64 ts (TIME(NANOS, true)); 243 }`, false, false}, 244 {`message foo { 245 required int64 ts (TIME(MICROS, true)); 246 }`, false, false}, 247 {`message foo { 248 required int32 ts (TIME(MILLIS, true)); 249 }`, false, false}, 250 {`message foo { 251 required int64 ts (TIME(MILLIS, true)); 252 }`, true, false}, // TIME(MILLIS, ...) must be used with int32. 253 {`message foo { 254 required int64 ts (TIME(FOOS, true)); 255 }`, true, false}, // invalid unit FOOS. 256 {`message foo { 257 required int64 ts (TIME(MICROS, bloob)); 258 }`, true, false}, // invalid boolean bloob 259 {`message foo { 260 required int32 foo (INT(8, true)); 261 }`, false, false}, 262 {`message foo { 263 required int32 foo (INT(16, false)); 264 }`, false, false}, 265 {`message foo { 266 required int32 foo (INT(32, true)); 267 }`, false, false}, 268 {`message foo { 269 required int64 foo (INT(64, true)); 270 }`, false, false}, 271 // 60. 272 {`message foo { 273 required int32 foo (INT(64, true)); 274 }`, true, false}, // int32 can't be annotated as INT(64, true) 275 {`message foo { 276 required int64 foo (INT(32, true)); 277 }`, true, false}, // int64 can't be annotated as INT(32, true) 278 {`message foo { 279 required int32 foo (INT(28, true)); 280 }`, true, false}, // invalid bitwidth 281 {`message foo { 282 required int32 foo (INT(32, foobar)); 283 }`, true, false}, // invalid isSigned 284 {`message foo { 285 required int32 foo (DECIMAL(5, 3)); 286 }`, false, false}, 287 {`message foo { 288 required int32 foo (DECIMAL(12, 3)); 289 }`, true, false}, // precision out of bounds. 290 {`message foo { 291 required int64 foo (DECIMAL(12, 3)); 292 }`, false, false}, 293 {`message foo { 294 required int64 foo (DECIMAL(20, 3)); 295 }`, true, false}, // precision out of bounds. 296 {`message foo { 297 required int64 foo (DECIMAL); 298 }`, false, false}, // no precision, scale parameters -> it's a converted type, so not an error; see also issue 12. 299 {`message foo { 300 required fixed_len_byte_array(10) foo (DECIMAL(20,10)); 301 }`, false, false}, 302 // 70. 303 {`message foo { 304 required fixed_len_byte_array(10) foo (DECIMAL(24,10)); 305 }`, true, false}, // 24 is out of bounds; maximum for 10 is 23. 306 {`message foo { 307 required binary foo (DECIMAL(100,10)); 308 }`, false, false}, 309 {`message foo { 310 required binary foo (DECIMAL(0,10)); 311 }`, true, false}, // invalid precision. 312 {`message foo { 313 required float foo (DECIMAL(1,10)); 314 }`, true, false}, // invalid data type. 315 {`message foo { 316 required binary foo (JSON); 317 }`, false, false}, 318 {`message foo { 319 required int64 foo (JSON); 320 }`, true, false}, // only binary can be annotated as JSON. 321 {`message foo { 322 required binary foo (BSON); 323 }`, false, false}, 324 {`message foo { 325 required int32 foo (BSON); 326 }`, true, false}, // only binary can be annotated as BSON. 327 {`message foo { 328 required fixed_len_byte_array(32) foo (UUID); 329 }`, true, false}, // invalid length for UUID. 330 {`message foo { 331 required int64 foo (ENUM); 332 }`, true, false}, // invalid type for ENUM. 333 // 80. 334 {`message foo { 335 required int64 foo (UTF8); 336 }`, true, false}, // invalid type for UTF8. 337 {`message foo { 338 required double foo (TIME_MILLIS); 339 }`, true, false}, // invalid type for TIME_MILLIS. 340 {`message foo { 341 required float foo (TIME_MICROS); 342 }`, true, false}, // invalid type for TIME_MICROS. 343 {`message foo { 344 required double foo (TIMESTAMP_MILLIS); 345 }`, true, false}, // invalid type for TIMESTAMP_MILLIS. 346 {`message foo { 347 required double foo (TIMESTAMP_MICROS); 348 }`, true, false}, // invalid type for TIMESTAMP_MICROS. 349 {`message foo { 350 required double foo (UINT_8); 351 }`, true, false}, // invalid type for UINT_8. 352 {`message foo { 353 required double foo (INT_64); 354 }`, true, false}, // invalid type for INT_64. 355 {`message foo { 356 required double foo (INTERVAL); 357 }`, true, false}, // invalid type for INTERVAL. 358 {`message foo { 359 required double foo (TIME(NANOS, true)); 360 }`, true, false}, // invalid type for TIME(NANOS, true). 361 {`message foo { 362 required double foo (TIME(MICROS, true)); 363 }`, true, false}, // invalid type for TIME(MICROS, true). 364 // 90. 365 {`message foo { 366 required double foo (MAP); 367 }`, true, false}, // invalid type for MAP. 368 {`message foo { 369 required double foo (LIST); 370 }`, true, false}, // invalid type for LIST. 371 {` 372 message foo { }`, false, false}, // this is necessary because we once had a parser bug when the first character of the parsed text was a newline. 373 {`message foo { 374 required group bar (MAP) { 375 repeated group key_value (MAP_KEY_VALUE) { 376 required int64 key; 377 required int64 value; 378 } 379 optional double baz; 380 } 381 }`, true, false}, // underneath the MAP group there is not only a key_value (MAP_KEY_VALUE), but also the field baz, which should not be there. 382 {`message foo { 383 required fixed_len_byte_array(100000000000000000000000000000000000000000000000000000000) theid (UUID); 384 }`, true, false}, // length couldn't be parsed properly. 385 {`message foo { 386 required int64 bar = 20000000000000000000000; 387 }`, true, false}, // field ID couldn't be parsed properly 388 {`message hive_schema { 389 optional group foo_list (LIST) { 390 repeated group bag { 391 optional binary array_element (STRING); 392 } 393 } 394 } 395 `, false, false}, // this is to test the backward-compatibility rules for lists when reading schemas. 396 {`message foo { 397 optional group foo_list (LIST) { 398 repeated int64 data; 399 } 400 }`, false, false}, // backwards compat rule 1. 401 {`message foo { 402 optional group foo_list (LIST) { 403 repeated group bag { 404 } 405 } 406 }`, true, false}, // empty repeated group child element. 407 {`message foo { 408 optional group foo_list (LIST) { 409 repeated group foobar { 410 optional int64 a; 411 optional int64 b; 412 } 413 } 414 }`, false, false}, // backwards compat rule 2. 415 // 100. 416 {`message foo { 417 optional group foo_list (LIST) { 418 repeated group array { 419 optional int64 data; 420 } 421 } 422 }`, false, false}, // backwards compat rule 3. 423 {`message foo { 424 optional group bar (MAP) { 425 repeated group key_value { 426 required int64 foo; 427 optional int32 bar; 428 } 429 } 430 }`, false, false}, 431 {`message foo { 432 optional group foo_list (LIST) { 433 repeated group array { 434 optional int64 data; 435 } 436 } 437 }`, true, true}, // backwards compat rule 3 should fail in strict mode. 438 {`message foo { 439 optional group bar (MAP) { 440 repeated group key_value { 441 required int64 foo; 442 optional int32 bar; 443 } 444 } 445 }`, true, true}, // key and value missing. 446 {`message foo { 447 optional group bar (MAP) { 448 repeated group key_value { 449 required int64 key; 450 } 451 } 452 }`, true, true}, // value is missing. 453 {`message foo { 454 optional group bar (MAP_KEY_VALUE) { 455 repeated group map { 456 required binary key (UTF8); 457 optional int32 value; 458 } 459 } 460 }`, false, false}, 461 {`message foo { 462 optional group bar (MAP_KEY_VALUE) { 463 repeated group map { 464 required binary key (UTF8); 465 optional int32 value; 466 } 467 } 468 }`, true, true}, // incorrectly annotated MAP_KEY_VALUE in strict mode. 469 {`message foo { 470 optional group bar (MAP) { 471 repeated group map { 472 required boolean key (STRING); 473 optional int32 value; 474 } 475 } 476 }`, true, false}, // type and logical type don't match for key. 477 {`message foo { 478 optional group bar (LIST) { 479 repeated group list { 480 required int64 element (STRING); 481 } 482 } 483 }`, true, false}, // type and logical type don't match for element. 484 {`message foo { 485 optional group bar (INVALID) { 486 487 } 488 }`, false, true}, // invalid ConvertedType 489 // 110. 490 {`message foo { required binary METADATA$ACTION (STRING); }`, false, false}, // column name includes special character. 491 } 492 493 for idx, tt := range testData { 494 p := newSchemaParser(tt.Msg) 495 err := p.parse() 496 497 if tt.Strict { 498 schemaDef := &SchemaDefinition{RootColumn: p.root} 499 err = schemaDef.ValidateStrict() 500 } 501 502 if tt.ExpectErr { 503 assert.Error(t, err, "%d. expected error, got none; parsed message: %s", idx, spew.Sdump(p.root)) 504 } else { 505 assert.NoError(t, err, "%d. expected no error, got error instead", idx) 506 } 507 } 508 } 509 510 func TestLineNumber(t *testing.T) { 511 msg := `message foo { 512 optional group signals (LIST) { 513 repeated group list { 514 required group element { 515 required binary name (STRING); 516 optional binary category (STRING); 517 required binary condition (STRING); 518 optional binary group (STRING); 519 optional binary text (STRING); 520 required binary type (ENUM); 521 repeated binary highlight (STRING); 522 required binary strength (ENUM) 523 } 524 } 525 } 526 ` 527 p := newSchemaParser(msg) 528 err := p.parse() 529 assert.Error(t, err) 530 531 assert.Contains(t, err.Error(), "line 13:") 532 } 533 534 func TestValidate(t *testing.T) { 535 testData := []struct { 536 schemaDef *SchemaDefinition 537 expectErr bool 538 }{ 539 { 540 schemaDef: nil, 541 expectErr: true, 542 }, 543 { 544 schemaDef: &SchemaDefinition{}, 545 expectErr: true, 546 }, 547 { 548 schemaDef: &SchemaDefinition{ 549 RootColumn: &ColumnDefinition{}, 550 }, 551 expectErr: true, 552 }, 553 { 554 schemaDef: &SchemaDefinition{ 555 RootColumn: &ColumnDefinition{ 556 SchemaElement: &parquet.SchemaElement{}, 557 }, 558 }, 559 expectErr: true, 560 }, 561 { 562 schemaDef: &SchemaDefinition{ 563 RootColumn: &ColumnDefinition{ 564 SchemaElement: &parquet.SchemaElement{ 565 Name: "foo", 566 }, 567 }, 568 }, 569 expectErr: false, 570 }, 571 { 572 schemaDef: &SchemaDefinition{ 573 RootColumn: &ColumnDefinition{ 574 SchemaElement: &parquet.SchemaElement{ 575 Name: "foo", 576 }, 577 Children: []*ColumnDefinition{ 578 { 579 SchemaElement: &parquet.SchemaElement{ 580 Name: "bar", 581 }, 582 }, 583 }, 584 }, 585 }, 586 expectErr: true, 587 }, 588 { 589 schemaDef: &SchemaDefinition{ 590 RootColumn: &ColumnDefinition{ 591 SchemaElement: &parquet.SchemaElement{ 592 Name: "foo", 593 }, 594 Children: []*ColumnDefinition{ 595 { 596 SchemaElement: &parquet.SchemaElement{ 597 Name: "bar", 598 Type: parquet.TypePtr(parquet.Type_BOOLEAN), 599 }, 600 }, 601 }, 602 }, 603 }, 604 expectErr: false, 605 }, 606 { 607 schemaDef: &SchemaDefinition{ 608 RootColumn: &ColumnDefinition{ 609 SchemaElement: &parquet.SchemaElement{ 610 Name: "foo", 611 }, 612 Children: []*ColumnDefinition{ 613 { 614 SchemaElement: &parquet.SchemaElement{ 615 Name: "bar", 616 Type: parquet.TypePtr(parquet.Type_BYTE_ARRAY), 617 }, 618 Children: []*ColumnDefinition{ 619 { 620 SchemaElement: &parquet.SchemaElement{ 621 Name: "baz", 622 Type: parquet.TypePtr(parquet.Type_BOOLEAN), 623 }, 624 }, 625 }, 626 }, 627 }, 628 }, 629 }, 630 expectErr: true, 631 }, 632 } 633 634 for idx, tt := range testData { 635 err := tt.schemaDef.Validate() 636 if tt.expectErr { 637 assert.Error(t, err, "%d. validation didn't fail", idx) 638 } else { 639 assert.NoError(t, err, "%d. validation failed", idx) 640 } 641 err = tt.schemaDef.ValidateStrict() 642 if tt.expectErr { 643 assert.Error(t, err, "%d. validation didn't fail", idx) 644 } else { 645 assert.NoError(t, err, "%d. validation failed", idx) 646 } 647 } 648 }