github.com/fraugster/parquet-go@v0.12.0/parquetschema/schema_parser_test.go (about)

     1  package parquetschema
     2  
     3  import (
     4  	"testing"
     5  
     6  	"github.com/davecgh/go-spew/spew"
     7  	"github.com/fraugster/parquet-go/parquet"
     8  	"github.com/stretchr/testify/assert"
     9  )
    10  
    11  func TestSchemaParser(t *testing.T) {
        	// Table-driven check of the schema parser: every entry's Msg is parsed and the
        	// resulting error is compared against ExpectErr. The numeric markers in the table
        	// ("// 0.", "// 10.", ...) give the slice index of the entry that follows them;
        	// that index is what the loop prints in its failure messages.
    12  	testData := []struct {
    13  		Msg       string // schema definition text handed to the parser
    14  		ExpectErr bool   // whether the resulting error must be non-nil
    15  		Strict    bool   // also run ValidateStrict and judge the entry by its result
    16  	}{
    17  		// 0.
    18  		{`message foo { }`, false, false},
    19  		{`message foo {`, true, false}, // missing closing brace
    20  		{`message foo { required int64 bar; }`, false, false},
    21  		{`message foo { repeated int64 bar; }`, false, false},
    22  		{`message foo { optional int64 bar; }`, false, false},
    23  		{`message foo { justwrong int64 bar; }`, true, false}, // incorrect repetition type
    24  		{`message foo { optional int64 bar }`, true, false},   // missing semicolon after column name
    25  		{`message foo { required binary the_id = 1; required binary client = 2; }`, false, false},
    26  		{`message foo { optional boolean is_fraud; }`, false, false},
    27  		{`message foo {
    28  			required binary the_id (STRING) = 1;
    29  			required binary client (STRING) = 2;
    30  			required binary request_body = 3;
    31  			required int64 ts = 4;
    32  			required group data_enriched (MAP) {
    33  				repeated group key_value (MAP_KEY_VALUE) {
    34  					required binary key = 5;
    35  					required binary value = 6;
    36  				}
    37  			}
    38  			optional boolean is_fraud = 7;
    39  		}`, false, false},
    40  		// 10.
    41  		{`message $ { }`, false, false},                             // unusual token
    42  		{`message foo { optional int128 bar; }`, true, false},       // invalid type
    43  		{`message foo { optional int64 bar (BLUB); }`, true, false}, // invalid logical type
    44  		{`message foo { optional int32 bar; }`, false, false},
    45  		{`message foo { optional double bar; }`, false, false},
    46  		{`message foo { optional float bar; }`, false, false},
    47  		{`message foo { optional int96 bar; }`, false, false},
    48  		{`message foo {
    49  			required group ids (LIST) {
    50  				repeated group list {
    51  					required int64 element;
    52  				}
    53  			}
    54  		}`, false, false},
    55  		{`message foo {
    56  			optional group array_of_arrays (LIST) {
    57  				repeated group list {
    58  					required group element (LIST) {
    59  						repeated group list {
    60  							required int32 element;
    61  						}
    62  					}
    63  				}
    64  			}
    65  		}`, false, false},
    66  		{`message foo {
    67  			optional group bar (MAP) {
    68  				repeated group key_value {
    69  					required int32 key;
    70  					required int32 value;
    71  				}
    72  			}
    73  		}`, false, false},
    74  		// 20.
    75  		{`message foo {
    76  			optional group bar (LIST) {
    77  				repeated group list {
    78  					required int64 element;
    79  				}
    80  			}
    81  		}`, false, false},
    82  		{`message foo {
    83  			optional group bar (LIST) {
    84  				repeated group element {
    85  					required int64 element;
    86  				}
    87  			}
    88  		}`, false, false}, // repeated group is called "element", not "list"; but that's valid under the backwards compatibility rules.
    89  		{`message foo {
    90  			optional group bar (LIST) {
    91  				repeated int64 list;
    92  			}
    93  		}`, true, false}, // repeated list is not a group.
    94  		{`message foo {
    95  			repeated group bar (LIST) {
    96  				repeated group list {
    97  					optional int64 element;
    98  				}
    99  			}
   100  		}`, true, false}, // bar is LIST but has repetition type repeated.
   101  		{`message foo {
   102  			optional group bar (LIST) {
   103  				repeated group list {
   104  					optional int64 element;
   105  					optional int64 element2;
   106  				}
   107  			}
   108  		}`, true, false}, // bar.list has 2 children.
   109  		{`message foo {
   110  			optional group bar (LIST) {
   111  				repeated group list {
   112  					optional int64 invalid;
   113  				}
   114  			}
   115  		}`, true, false}, // bar.list has 1 child, but it's called invalid, not element.
   116  		{`message foo {
   117  			optional group bar (LIST) {
   118  				repeated group list {
   119  					repeated int64 element;
   120  				}
   121  			}
   122  		}`, true, false}, // bar.list.element is of the wrong repetition type.
   123  		{`message foo {
   124  			optional group bar (LIST) {
   125  				repeated group list {
   126  					required int64 baz;
   127  				}
   128  				optional int64 list_size;
   129  			}
   130  		}`, true, false}, // only element underneath (LIST) allowed is repeated group list; list_size is invalid.
   131  		{`message foo {
   132  			optional group bar (MAP) {
   133  				repeated group key_value {
   134  					required int64 key;
   135  					optional int32 value;
   136  				}
   137  			}
   138  		}`, false, false},
   139  		{`message foo {
   140  			optional group bar (MAP) {
   141  				repeated group stuff {
   142  					required int64 key;
   143  					optional int32 value;
   144  				}
   145  			}
   146  		}`, true, true}, // repeated group underneath (MAP) is not called key_value.
   147  		// 30.
   148  		{`message foo {
   149  			optional group bar (MAP) {
   150  				repeated int64 key_value;
   151  			}
   152  		}`, true, false}, // repeated key_value is not a group.
   153  		{`message foo {
   154  			optional group bar (MAP) {
   155  			}
   156  		}`, true, false}, // empty group bar.
   157  		{`message foo {
   158  			optional group bar (MAP) {
   159  				repeated group key_value {
   160  					required int64 key;
   161  					optional int32 value;
   162  					optional int32 another_value;
   163  				}
   164  			}
   165  		}`, true, false}, // inside key_value, only key and value are allowed.
   166  		{`message foo {
   167  			optional group bar (MAP) {
   168  				repeated group key_value {
   169  					optional int64 key;
   170  					optional int32 value;
   171  				}
   172  			}
   173  		}`, true, true}, // bar.key_value.key must be required.
   174  		{`message foo {
   175  			optional group bar (MAP) {
   176  				repeated group key_value {
   177  					required int64 key;
   178  				}
   179  			}
   180  		}`, true, false}, // bar.key_value.value is missing.
   181  		{`message foo {
   182  			optional group bar (MAP) {
   183  				repeated group key_value {
   184  					required int64 key;
   185  					optional int32 key;
   186  				}
   187  			}
   188  		}`, true, true}, // bar.key_value has 2 children but child value is missing.
   189  		{`message foo {
   190  			optional group bar (MAP) {
   191  				repeated group key_value {
   192  					required int64 value;
   193  					optional int32 value;
   194  				}
   195  			}
   196  		}`, true, true}, // strict: bar.key_value has 2 children but child key is missing.
   197  		{`message foo {
   198  			required int32 date (DATE);
   199  		}`, false, false},
   200  		{`message foo {
   201  			required int64 date (DATE);
   202  		}`, true, false}, // date is annotated as DATE but data type is int64.
   203  		{`message foo {
   204  			required int64 ts (TIMESTAMP(MILLIS, true));
   205  		}`, false, false},
   206  		// 40.
   207  		{`message foo {
   208  			required int64 ts (TIMESTAMP(MICROS, false));
   209  		}`, false, false},
   210  		{`message foo {
   211  			required int64 ts (TIMESTAMP(NANOS, false));
   212  		}`, false, false},
   213  		{`message foo {
   214  			required int96 ts (TIMESTAMP(NANOS, false));
   215  		}`, false, false},
   216  		{`message foo {
   217  			required int32 ts (TIMESTAMP(NANOS, false));
   218  		}`, true, false}, // TIMESTAMP cannot annotate int32 (the int64 and int96 entries above are accepted).
   219  		{`message foo {
   220  			required int64 ts (TIMESTAMP(,));
   221  		}`, true, false}, // invalid annotation syntax for TIMESTAMP.
   222  		{`message foo {
   223  			required int64 ts (TIMESTAMP(FOO,false));
   224  		}`, true, false}, // invalid TIMESTAMP unit.
   225  		{`message foo {
   226  			required int64 ts (TIMESTAMP(MILLIS,bla));
   227  		}`, true, false}, // invalid TIMESTAMP isAdjustedToUTC.
   228  		{`message foo {
   229  			required fixed_len_byte_array(16) theid (UUID);
   230  		}`, false, false},
   231  		{`message foo {
   232  			required fixed_len_byte_array theid;
   233  		}`, true, false}, // no length provided.
   234  		{`message foo {
   235  			required fixed_len_byte_array(-1) theid;
   236  		}`, true, false}, // negative length.
   237  		{`message foo {
   238  			required binary group (STRING);
   239  		}`, false, false},
   240  		// 51.
   241  		{`message foo {
   242  			required int64 ts (TIME(NANOS, true));
   243  		}`, false, false},
   244  		{`message foo {
   245  			required int64 ts (TIME(MICROS, true));
   246  		}`, false, false},
   247  		{`message foo {
   248  			required int32 ts (TIME(MILLIS, true));
   249  		}`, false, false},
   250  		{`message foo {
   251  			required int64 ts (TIME(MILLIS, true));
   252  		}`, true, false}, // TIME(MILLIS, ...) must be used with int32.
   253  		{`message foo {
   254  			required int64 ts (TIME(FOOS, true));
   255  		}`, true, false}, // invalid unit FOOS.
   256  		{`message foo {
   257  			required int64 ts (TIME(MICROS, bloob));
   258  		}`, true, false}, // invalid boolean bloob
   259  		{`message foo {
   260  			required int32 foo (INT(8, true));
   261  		}`, false, false},
   262  		{`message foo {
   263  			required int32 foo (INT(16, false));
   264  		}`, false, false},
   265  		{`message foo {
   266  			required int32 foo (INT(32, true));
   267  		}`, false, false},
   268  		{`message foo {
   269  			required int64 foo (INT(64, true));
   270  		}`, false, false},
   271  		// 61.
   272  		{`message foo {
   273  			required int32 foo (INT(64, true));
   274  		}`, true, false}, // int32 can't be annotated as INT(64, true)
   275  		{`message foo {
   276  			required int64 foo (INT(32, true));
   277  		}`, true, false}, // int64 can't be annotated as INT(32, true)
   278  		{`message foo {
   279  			required int32 foo (INT(28, true));
   280  		}`, true, false}, // invalid bitwidth
   281  		{`message foo {
   282  			required int32 foo (INT(32, foobar));
   283  		}`, true, false}, // invalid isSigned
   284  		{`message foo {
   285  			required int32 foo (DECIMAL(5, 3));
   286  		}`, false, false},
   287  		{`message foo {
   288  			required int32 foo (DECIMAL(12, 3));
   289  		}`, true, false}, // precision out of bounds.
   290  		{`message foo {
   291  			required int64 foo (DECIMAL(12, 3));
   292  		}`, false, false},
   293  		{`message foo {
   294  			required int64 foo (DECIMAL(20, 3));
   295  		}`, true, false}, // precision out of bounds.
   296  		{`message foo {
   297  			required int64 foo (DECIMAL);
   298  		}`, false, false}, // no precision, scale parameters -> it's a converted type, so not an error; see also issue 12.
   299  		{`message foo {
   300  			required fixed_len_byte_array(10) foo (DECIMAL(20,10));
   301  		}`, false, false},
   302  		// 71.
   303  		{`message foo {
   304  			required fixed_len_byte_array(10) foo (DECIMAL(24,10));
   305  		}`, true, false}, // 24 is out of bounds; maximum for 10 is 23.
   306  		{`message foo {
   307  			required binary foo (DECIMAL(100,10));
   308  		}`, false, false},
   309  		{`message foo {
   310  			required binary foo (DECIMAL(0,10));
   311  		}`, true, false}, // invalid precision.
   312  		{`message foo {
   313  			required float foo (DECIMAL(1,10));
   314  		}`, true, false}, // invalid data type.
   315  		{`message foo {
   316  			required binary foo (JSON);
   317  		}`, false, false},
   318  		{`message foo {
   319  			required int64 foo (JSON);
   320  		}`, true, false}, // only binary can be annotated as JSON.
   321  		{`message foo {
   322  			required binary foo (BSON);
   323  		}`, false, false},
   324  		{`message foo {
   325  			required int32 foo (BSON);
   326  		}`, true, false}, // only binary can be annotated as BSON.
   327  		{`message foo {
   328  			required fixed_len_byte_array(32) foo (UUID);
   329  		}`, true, false}, // invalid length for UUID.
   330  		{`message foo {
   331  			required int64 foo (ENUM);
   332  		}`, true, false}, // invalid type for ENUM.
   333  		// 81.
   334  		{`message foo {
   335  			required int64 foo (UTF8);
   336  		}`, true, false}, // invalid type for UTF8.
   337  		{`message foo {
   338  			required double foo (TIME_MILLIS);
   339  		}`, true, false}, // invalid type for TIME_MILLIS.
   340  		{`message foo {
   341  			required float foo (TIME_MICROS);
   342  		}`, true, false}, // invalid type for TIME_MICROS.
   343  		{`message foo {
   344  			required double foo (TIMESTAMP_MILLIS);
   345  		}`, true, false}, // invalid type for TIMESTAMP_MILLIS.
   346  		{`message foo {
   347  			required double foo (TIMESTAMP_MICROS);
   348  		}`, true, false}, // invalid type for TIMESTAMP_MICROS.
   349  		{`message foo {
   350  			required double foo (UINT_8);
   351  		}`, true, false}, // invalid type for UINT_8.
   352  		{`message foo {
   353  			required double foo (INT_64);
   354  		}`, true, false}, // invalid type for INT_64.
   355  		{`message foo {
   356  			required double foo (INTERVAL);
   357  		}`, true, false}, // invalid type for INTERVAL.
   358  		{`message foo {
   359  			required double foo (TIME(NANOS, true));
   360  		}`, true, false}, // invalid type for TIME(NANOS, true).
   361  		{`message foo {
   362  			required double foo (TIME(MICROS, true));
   363  		}`, true, false}, // invalid type for TIME(MICROS, true).
   364  		// 91.
   365  		{`message foo {
   366  			required double foo (MAP);
   367  		}`, true, false}, // invalid type for MAP.
   368  		{`message foo {
   369  			required double foo (LIST);
   370  		}`, true, false}, // invalid type for LIST.
   371  		{`
   372  message foo { }`, false, false}, // this is necessary because we once had a parser bug when the first character of the parsed text was a newline.
   373  		{`message foo {
   374  			required group bar (MAP) {
   375  				repeated group key_value (MAP_KEY_VALUE) {
   376  					required int64 key;
   377  					required int64 value;
   378  				}
   379  				optional double baz;
   380  			}
   381  		}`, true, false}, // underneath the MAP group there is not only a key_value (MAP_KEY_VALUE), but also the field baz, which should not be there.
   382  		{`message foo {
   383  			required fixed_len_byte_array(100000000000000000000000000000000000000000000000000000000) theid (UUID);
   384  		}`, true, false}, // length couldn't be parsed properly.
   385  		{`message foo {
   386  			required int64 bar = 20000000000000000000000;
   387  		}`, true, false}, // field ID couldn't be parsed properly
   388  		{`message hive_schema {
   389  			optional group foo_list (LIST) {
   390  			  repeated group bag {
   391  				optional binary array_element (STRING);
   392  			  }
   393  			}
   394  		  }
   395  		  `, false, false}, // this is to test the backward-compatibility rules for lists when reading schemas.
   396  		{`message foo {
   397  			optional group foo_list (LIST) {
   398  				repeated int64 data;
   399  			}
   400  		}`, false, false}, // backwards compat rule 1.
   401  		{`message foo {
   402  			optional group foo_list (LIST) {
   403  				repeated group bag {
   404  				}
   405  			}
   406  		}`, true, false}, // empty repeated group child element.
   407  		{`message foo {
   408  			optional group foo_list (LIST) {
   409  				repeated group foobar {
   410  					optional int64 a;
   411  					optional int64 b;
   412  				}
   413  			}
   414  		}`, false, false}, // backwards compat rule 2.
   415  		// 101.
   416  		{`message foo {
   417  			optional group foo_list (LIST) {
   418  				repeated group array {
   419  					optional int64 data;
   420  				}
   421  			}
   422  		}`, false, false}, // backwards compat rule 3.
   423  		{`message foo {
   424  			optional group bar (MAP) {
   425  				repeated group key_value {
   426  					required int64 foo;
   427  					optional int32 bar;
   428  				}
   429  			}
   430  		}`, false, false},
   431  		{`message foo {
   432  			optional group foo_list (LIST) {
   433  				repeated group array {
   434  					optional int64 data;
   435  				}
   436  			}
   437  		}`, true, true}, // backwards compat rule 3 should fail in strict mode.
   438  		{`message foo {
   439  			optional group bar (MAP) {
   440  				repeated group key_value {
   441  					required int64 foo;
   442  					optional int32 bar;
   443  				}
   444  			}
   445  		}`, true, true}, // key and value missing.
   446  		{`message foo {
   447  			optional group bar (MAP) {
   448  				repeated group key_value {
   449  					required int64 key;
   450  				}
   451  			}
   452  		}`, true, true}, // value is missing.
   453  		{`message foo {
   454  			optional group bar (MAP_KEY_VALUE) {
   455  				repeated group map {
   456  					required binary key (UTF8);
   457  					optional int32 value;
   458  				}
   459  			}
   460  		}`, false, false},
   461  		{`message foo {
   462  			optional group bar (MAP_KEY_VALUE) {
   463  				repeated group map {
   464  					required binary key (UTF8);
   465  					optional int32 value;
   466  				}
   467  			}
   468  		}`, true, true}, // incorrectly annotated MAP_KEY_VALUE in strict mode.
   469  		{`message foo {
   470  			optional group bar (MAP) {
   471  				repeated group map {
   472  					required boolean key (STRING);
   473  					optional int32 value;
   474  				}
   475  			}
   476  		}`, true, false}, // type and logical type don't match for key.
   477  		{`message foo {
   478  			optional group bar (LIST) {
   479  				repeated group list {
   480  					required int64 element (STRING);
   481  				}
   482  			}
   483  		}`, true, false}, // type and logical type don't match for element.
   484  		{`message foo {
   485  			optional group bar (INVALID) {
   486  
   487  			}
   488  		}`, false, true}, // invalid ConvertedType; NOTE(review): expects no error, but as a strict entry only ValidateStrict's result is checked - confirm intended.
   489  		// 111.
   490  		{`message foo { required binary METADATA$ACTION (STRING); }`, false, false}, // column name includes special character.
   491  	}
   492  
   493  	for idx, tt := range testData {
   494  		p := newSchemaParser(tt.Msg)
   495  		err := p.parse()
   496  
        		// For strict entries the error returned by parse() is replaced by the result of
        		// ValidateStrict on the parsed root, so only strict validation decides the outcome.
        		// NOTE(review): a parse failure is therefore masked for strict entries - confirm intended.
   497  		if tt.Strict {
   498  			schemaDef := &SchemaDefinition{RootColumn: p.root}
   499  			err = schemaDef.ValidateStrict()
   500  		}
   501  
        		// idx in the messages below is the slice index referenced by the table markers.
   502  		if tt.ExpectErr {
   503  			assert.Error(t, err, "%d. expected error, got none; parsed message: %s", idx, spew.Sdump(p.root))
   504  		} else {
   505  			assert.NoError(t, err, "%d. expected no error, got error instead", idx)
   506  		}
   507  	}
   508  }
   509  
   510  func TestLineNumber(t *testing.T) {
        	// Checks that a parse error names the line on which the problem was detected.
        	// The schema below omits the semicolon after `strength (ENUM)` on its 12th line,
        	// and the error text is expected to contain "line 13:".
   511  	msg := `message foo {
   512  		optional group signals (LIST) {
   513  			repeated group list {
   514  			  required group element {
   515  				required binary name (STRING);
   516  				optional binary category (STRING);
   517  				required binary condition (STRING);
   518  				optional binary group (STRING);
   519  				optional binary text (STRING);
   520  				required binary type (ENUM);
   521  				repeated binary highlight (STRING);
   522  				required binary strength (ENUM)
   523  			  }
   524  			}
   525  		  }
   526  	`
   527  	p := newSchemaParser(msg)
   528  	err := p.parse()
        	// assert.Error does not stop the test on failure, so the message is only inspected
        	// when an error was actually returned; calling err.Error() on a nil error would panic.
   529  	if assert.Error(t, err) {
   530  		assert.Contains(t, err.Error(), "line 13:")
   531  	}
   532  }
   533  
   534  func TestValidate(t *testing.T) {
   535  	// col wraps a schema element and optional children into a column definition.
   536  	col := func(elem *parquet.SchemaElement, kids ...*ColumnDefinition) *ColumnDefinition {
   537  		return &ColumnDefinition{SchemaElement: elem, Children: kids}
   538  	}
   539  	// named builds a schema element that carries only a name.
   540  	named := func(name string) *parquet.SchemaElement {
   541  		return &parquet.SchemaElement{Name: name}
   542  	}
   543  
   544  	// Each case pairs a schema definition with whether validation must reject it.
   545  	cases := []struct {
   546  		schemaDef *SchemaDefinition
   547  		expectErr bool
   548  	}{
   549  		// nil schema definition.
   550  		{nil, true},
   551  		// schema definition without a root column.
   552  		{&SchemaDefinition{}, true},
   553  		// root column without a schema element.
   554  		{&SchemaDefinition{RootColumn: &ColumnDefinition{}}, true},
   555  		// root column whose schema element is empty.
   556  		{&SchemaDefinition{RootColumn: col(&parquet.SchemaElement{})}, true},
   557  		// named root column without children.
   558  		{&SchemaDefinition{RootColumn: col(named("foo"))}, false},
   559  		// child "bar" has neither a type nor children.
   560  		{&SchemaDefinition{RootColumn: col(named("foo"), col(named("bar")))}, true},
   561  		// child "bar" is a boolean column.
   562  		{
   563  			&SchemaDefinition{RootColumn: col(
   564  				named("foo"),
   565  				col(&parquet.SchemaElement{Name: "bar", Type: parquet.TypePtr(parquet.Type_BOOLEAN)}),
   566  			)},
   567  			false,
   568  		},
   569  		// child "bar" has a type and also a child of its own.
   570  		{
   571  			&SchemaDefinition{RootColumn: col(
   572  				named("foo"),
   573  				col(
   574  					&parquet.SchemaElement{Name: "bar", Type: parquet.TypePtr(parquet.Type_BYTE_ARRAY)},
   575  					col(&parquet.SchemaElement{Name: "baz", Type: parquet.TypePtr(parquet.Type_BOOLEAN)}),
   576  				),
   577  			)},
   578  			true,
   579  		},
   580  	}
   581  
   582  	for i, tc := range cases {
   583  		// check applies the case's expectation to one validation result.
   584  		check := func(err error) {
   585  			if !tc.expectErr {
   586  				assert.NoError(t, err, "%d. validation failed", i)
   587  				return
   588  			}
   589  			assert.Error(t, err, "%d. validation didn't fail", i)
   590  		}
   591  
   592  		// Both validation modes are held to the same expectation, regular mode first.
   593  		check(tc.schemaDef.Validate())
   594  		check(tc.schemaDef.ValidateStrict())
   595  	}
   596  }