github.com/vc42/parquet-go@v0.0.0-20240320194221-1a9adb5f23f5/row_test.go (about)

     1  package parquet_test
     2  
     3  import (
     4  	"reflect"
     5  	"testing"
     6  
     7  	"github.com/google/uuid"
     8  	"github.com/vc42/parquet-go"
     9  )
    10  
    11  func TestRowClone(t *testing.T) {
    12  	row := parquet.Row{
    13  		parquet.ValueOf(42).Level(0, 1, 0),
    14  		parquet.ValueOf("Hello World").Level(1, 1, 1),
    15  	}
    16  	if clone := row.Clone(); !row.Equal(clone) {
    17  		t.Error("row and its clone are not equal")
    18  	}
    19  }
    20  
    21  func TestDeconstructionReconstruction(t *testing.T) {
    22  	type Person struct {
    23  		FirstName string
    24  		LastName  string
    25  		Age       int     `parquet:",optional"`
    26  		Weight    float64 `parquet:",optional"`
    27  	}
    28  
    29  	type Details struct {
    30  		Person *Person
    31  	}
    32  
    33  	type Friend struct {
    34  		ID      [16]byte `parquet:",uuid"`
    35  		Details *Details
    36  	}
    37  
    38  	type User struct {
    39  		ID      [16]byte `parquet:",uuid"`
    40  		Details *Details
    41  		Friends []Friend `parquet:",list,optional"`
    42  	}
    43  
    44  	type List2 struct {
    45  		Value string `parquet:",optional"`
    46  	}
    47  
    48  	type List1 struct {
    49  		List2 []List2 `parquet:",list"`
    50  	}
    51  
    52  	type List0 struct {
    53  		List1 []List1 `parquet:",list"`
    54  	}
    55  
    56  	type nestedListsLevel1 struct {
    57  		Level2 []string `parquet:"level2"`
    58  	}
    59  
    60  	type nestedLists struct {
    61  		Level1 []nestedListsLevel1 `parquet:"level1"`
    62  	}
    63  
    64  	tests := []struct {
    65  		scenario string
    66  		input    interface{}
    67  		values   [][]parquet.Value
    68  	}{
    69  		{
    70  			scenario: "single field",
    71  			input: struct {
    72  				Name string
    73  			}{Name: "Luke"},
    74  			values: [][]parquet.Value{
    75  				0: {parquet.ValueOf("Luke")},
    76  			},
    77  		},
    78  
    79  		{
    80  			scenario: "multiple fields",
    81  			input: Person{
    82  				FirstName: "Han",
    83  				LastName:  "Solo",
    84  				Age:       42,
    85  				Weight:    81.5,
    86  			},
    87  			values: [][]parquet.Value{
    88  				0: {parquet.ValueOf("Han")},
    89  				1: {parquet.ValueOf("Solo")},
    90  				2: {parquet.ValueOf(42).Level(0, 1, 0)},
    91  				3: {parquet.ValueOf(81.5).Level(0, 1, 0)},
    92  			},
    93  		},
    94  
    95  		{
    96  			scenario: "empty repeated field",
    97  			input: struct {
    98  				Symbols []string
    99  			}{
   100  				Symbols: []string{},
   101  			},
   102  			values: [][]parquet.Value{
   103  				0: {parquet.ValueOf(nil).Level(0, 0, 0)},
   104  			},
   105  		},
   106  
   107  		{
   108  			scenario: "single repeated field",
   109  			input: struct {
   110  				Symbols []string
   111  			}{
   112  				Symbols: []string{"EUR", "USD", "GBP", "JPY"},
   113  			},
   114  			values: [][]parquet.Value{
   115  				0: {
   116  					parquet.ValueOf("EUR").Level(0, 1, 0),
   117  					parquet.ValueOf("USD").Level(1, 1, 0),
   118  					parquet.ValueOf("GBP").Level(1, 1, 0),
   119  					parquet.ValueOf("JPY").Level(1, 1, 0),
   120  				},
   121  			},
   122  		},
   123  
   124  		{
   125  			scenario: "multiple repeated field",
   126  			input: struct {
   127  				Symbols []string
   128  				Values  []float32
   129  			}{
   130  				Symbols: []string{"EUR", "USD", "GBP", "JPY"},
   131  				Values:  []float32{0.1, 0.2, 0.3, 0.4},
   132  			},
   133  			values: [][]parquet.Value{
   134  				0: {
   135  					parquet.ValueOf("EUR").Level(0, 1, 0),
   136  					parquet.ValueOf("USD").Level(1, 1, 0),
   137  					parquet.ValueOf("GBP").Level(1, 1, 0),
   138  					parquet.ValueOf("JPY").Level(1, 1, 0),
   139  				},
   140  				1: {
   141  					parquet.ValueOf(float32(0.1)).Level(0, 1, 0),
   142  					parquet.ValueOf(float32(0.2)).Level(1, 1, 0),
   143  					parquet.ValueOf(float32(0.3)).Level(1, 1, 0),
   144  					parquet.ValueOf(float32(0.4)).Level(1, 1, 0),
   145  				},
   146  			},
   147  		},
   148  
   149  		{
   150  			scenario: "top level nil pointer field",
   151  			input: struct {
   152  				Person *Person
   153  			}{
   154  				Person: nil,
   155  			},
   156  			// Here there are four nil values because the Person type has four
   157  			// fields but it is nil.
   158  			values: [][]parquet.Value{
   159  				0: {parquet.ValueOf(nil).Level(0, 0, 0)},
   160  				1: {parquet.ValueOf(nil).Level(0, 0, 0)},
   161  				2: {parquet.ValueOf(nil).Level(0, 0, 0)},
   162  				3: {parquet.ValueOf(nil).Level(0, 0, 0)},
   163  			},
   164  		},
   165  
   166  		{
   167  			scenario: "sub level nil pointer field",
   168  			input: User{
   169  				ID: uuid.MustParse("A65B576D-9299-4769-9D93-04BE0583F027"),
   170  				Details: &Details{
   171  					Person: nil,
   172  				},
   173  			},
   174  			// Here there are four nil values because the Person type has four
   175  			// fields but it is nil.
   176  			values: [][]parquet.Value{
   177  				// User.ID
   178  				0: {parquet.ValueOf(uuid.MustParse("A65B576D-9299-4769-9D93-04BE0583F027"))},
   179  				// User.Details.Person
   180  				1: {parquet.ValueOf(nil).Level(0, 1, 0)},
   181  				2: {parquet.ValueOf(nil).Level(0, 1, 0)},
   182  				3: {parquet.ValueOf(nil).Level(0, 1, 0)},
   183  				4: {parquet.ValueOf(nil).Level(0, 1, 0)},
   184  				// User.Friends.ID
   185  				5: {parquet.ValueOf(nil).Level(0, 0, 0)},
   186  				// User.Friends.Details.Person
   187  				6: {parquet.ValueOf(nil).Level(0, 0, 0)},
   188  				7: {parquet.ValueOf(nil).Level(0, 0, 0)},
   189  				8: {parquet.ValueOf(nil).Level(0, 0, 0)},
   190  				9: {parquet.ValueOf(nil).Level(0, 0, 0)},
   191  			},
   192  		},
   193  
   194  		{
   195  			scenario: "deeply nested structure",
   196  			input: struct {
   197  				User User
   198  			}{
   199  				User: User{
   200  					ID: uuid.MustParse("A65B576D-9299-4769-9D93-04BE0583F027"),
   201  					Details: &Details{
   202  						Person: &Person{
   203  							FirstName: "Luke",
   204  							LastName:  "Skywalker",
   205  						},
   206  					},
   207  					Friends: []Friend{
   208  						{
   209  							ID: uuid.MustParse("1B76F8D0-82C6-403F-A104-DCDA69207220"),
   210  							Details: &Details{
   211  								Person: &Person{
   212  									FirstName: "Han",
   213  									LastName:  "Solo",
   214  								},
   215  							},
   216  						},
   217  
   218  						{
   219  							ID: uuid.MustParse("C43C8852-CCE5-40E6-B0DF-7212A5633346"),
   220  							Details: &Details{
   221  								Person: &Person{
   222  									FirstName: "Leia",
   223  									LastName:  "Skywalker",
   224  								},
   225  							},
   226  						},
   227  
   228  						{
   229  							ID: uuid.MustParse("E78642A8-0931-4D5F-918F-24DC8FF445B0"),
   230  							Details: &Details{
   231  								Person: &Person{
   232  									FirstName: "C3PO",
   233  									LastName:  "Droid",
   234  								},
   235  							},
   236  						},
   237  					},
   238  				},
   239  			},
   240  
   241  			values: [][]parquet.Value{
   242  				// User.ID
   243  				0: {parquet.ValueOf(uuid.MustParse("A65B576D-9299-4769-9D93-04BE0583F027"))},
   244  
   245  				// User.Details
   246  				1: {parquet.ValueOf("Luke").Level(0, 2, 0)},
   247  				2: {parquet.ValueOf("Skywalker").Level(0, 2, 0)},
   248  				3: {parquet.ValueOf(nil).Level(0, 2, 0)},
   249  				4: {parquet.ValueOf(nil).Level(0, 2, 0)},
   250  
   251  				5: { // User.Friends.ID
   252  					parquet.ValueOf(uuid.MustParse("1B76F8D0-82C6-403F-A104-DCDA69207220")).Level(0, 2, 0),
   253  					parquet.ValueOf(uuid.MustParse("C43C8852-CCE5-40E6-B0DF-7212A5633346")).Level(1, 2, 0),
   254  					parquet.ValueOf(uuid.MustParse("E78642A8-0931-4D5F-918F-24DC8FF445B0")).Level(1, 2, 0),
   255  				},
   256  
   257  				6: { // User.Friends.Details.Person.FirstName
   258  					parquet.ValueOf("Han").Level(0, 4, 0),
   259  					parquet.ValueOf("Leia").Level(1, 4, 0),
   260  					parquet.ValueOf("C3PO").Level(1, 4, 0),
   261  				},
   262  
   263  				7: { // User.Friends.Details.Person.LastName
   264  					parquet.ValueOf("Solo").Level(0, 4, 0),
   265  					parquet.ValueOf("Skywalker").Level(1, 4, 0),
   266  					parquet.ValueOf("Droid").Level(1, 4, 0),
   267  				},
   268  
   269  				8: { // User.Friends.Details.Person.Age
   270  					parquet.ValueOf(nil).Level(0, 4, 0),
   271  					parquet.ValueOf(nil).Level(1, 4, 0),
   272  					parquet.ValueOf(nil).Level(1, 4, 0),
   273  				},
   274  
   275  				9: { // User.Friends.Details.Person.Weight
   276  					parquet.ValueOf(nil).Level(0, 4, 0),
   277  					parquet.ValueOf(nil).Level(1, 4, 0),
   278  					parquet.ValueOf(nil).Level(1, 4, 0),
   279  				},
   280  			},
   281  		},
   282  
   283  		{
   284  			scenario: "multiple repeated levels",
   285  			input: List0{
   286  				List1: []List1{
   287  					{List2: []List2{{Value: "A"}, {Value: "B"}}},
   288  					{List2: []List2{}}, // parquet doesn't differentiate between empty repeated and a nil list
   289  					{List2: []List2{{Value: "C"}}},
   290  					{List2: []List2{}},
   291  					{List2: []List2{{Value: "D"}, {Value: "E"}, {Value: "F"}}},
   292  					{List2: []List2{{Value: "G"}, {Value: "H"}, {Value: "I"}}},
   293  				},
   294  			},
   295  			values: [][]parquet.Value{
   296  				{
   297  					parquet.ValueOf("A").Level(0, 3, 0),
   298  					parquet.ValueOf("B").Level(2, 3, 0),
   299  					parquet.ValueOf(nil).Level(1, 1, 0),
   300  					parquet.ValueOf("C").Level(1, 3, 0),
   301  					parquet.ValueOf(nil).Level(1, 1, 0),
   302  					parquet.ValueOf("D").Level(1, 3, 0),
   303  					parquet.ValueOf("E").Level(2, 3, 0),
   304  					parquet.ValueOf("F").Level(2, 3, 0),
   305  					parquet.ValueOf("G").Level(1, 3, 0),
   306  					parquet.ValueOf("H").Level(2, 3, 0),
   307  					parquet.ValueOf("I").Level(2, 3, 0),
   308  				},
   309  			},
   310  		},
   311  
   312  		// https://blog.twitter.com/engineering/en_us/a/2013/dremel-made-simple-with-parquet
   313  
   314  		// message nestedLists {
   315  		//   repeated group level1 {
   316  		//     repeated string level2;
   317  		//   }
   318  		// }
   319  		// ---
   320  		// {
   321  		//   level1: {
   322  		//     level2: a
   323  		//     level2: b
   324  		//     level2: c
   325  		//   },
   326  		//   level1: {
   327  		//     level2: d
   328  		//     level2: e
   329  		//     level2: f
   330  		//     level2: g
   331  		//   }
   332  		// }
   333  		//
   334  		{
   335  			scenario: "twitter blog example 1",
   336  			input: nestedLists{
   337  				Level1: []nestedListsLevel1{
   338  					{Level2: []string{"a", "b", "c"}},
   339  					{Level2: []string{"d", "e", "f", "g"}},
   340  				},
   341  			},
   342  			values: [][]parquet.Value{
   343  				0: {
   344  					parquet.ValueOf("a").Level(0, 2, 0),
   345  					parquet.ValueOf("b").Level(2, 2, 0),
   346  					parquet.ValueOf("c").Level(2, 2, 0),
   347  					parquet.ValueOf("d").Level(1, 2, 0),
   348  					parquet.ValueOf("e").Level(2, 2, 0),
   349  					parquet.ValueOf("f").Level(2, 2, 0),
   350  					parquet.ValueOf("g").Level(2, 2, 0),
   351  				},
   352  			},
   353  		},
   354  
   355  		// message nestedLists {
   356  		//   repeated group level1 {
   357  		//     repeated string level2;
   358  		//   }
   359  		// }
   360  		// ---
   361  		// {
   362  		//   level1: {
   363  		//     level2: h
   364  		//   },
   365  		//   level1: {
   366  		//     level2: i
   367  		//     level2: j
   368  		//   }
   369  		// }
   370  		//
   371  		{
   372  			scenario: "twitter blog example 2",
   373  			input: nestedLists{
   374  				Level1: []nestedListsLevel1{
   375  					{Level2: []string{"h"}},
   376  					{Level2: []string{"i", "j"}},
   377  				},
   378  			},
   379  			values: [][]parquet.Value{
   380  				0: {
   381  					parquet.ValueOf("h").Level(0, 2, 0),
   382  					parquet.ValueOf("i").Level(1, 2, 0),
   383  					parquet.ValueOf("j").Level(2, 2, 0),
   384  				},
   385  			},
   386  		},
   387  
   388  		// message AddressBook {
   389  		//   required string owner;
   390  		//   repeated string ownerPhoneNumbers;
   391  		//   repeated group contacts {
   392  		//     required string name;
   393  		//     optional string phoneNumber;
   394  		//   }
   395  		// }
   396  		// ---
   397  		// AddressBook {
   398  		//   owner: "Julien Le Dem",
   399  		//   ownerPhoneNumbers: "555 123 4567",
   400  		//   ownerPhoneNumbers: "555 666 1337",
   401  		//   contacts: {
   402  		//     name: "Dmitriy Ryaboy",
   403  		//     phoneNumber: "555 987 6543",
   404  		//   },
   405  		//   contacts: {
   406  		//     name: "Chris Aniszczyk"
   407  		//   }
   408  		// }
   409  		{
   410  			scenario: "twitter blog example 3",
   411  			input: AddressBook{
   412  				Owner: "Julien Le Dem",
   413  				OwnerPhoneNumbers: []string{
   414  					"555 123 4567",
   415  					"555 666 1337",
   416  				},
   417  				Contacts: []Contact{
   418  					{
   419  						Name:        "Dmitriy Ryaboy",
   420  						PhoneNumber: "555 987 6543",
   421  					},
   422  					{
   423  						Name: "Chris Aniszczyk",
   424  					},
   425  				},
   426  			},
   427  			values: [][]parquet.Value{
   428  				0: { // AddressBook.owner
   429  					parquet.ValueOf("Julien Le Dem").Level(0, 0, 0),
   430  				},
   431  				1: { // AddressBook.ownerPhoneNumbers
   432  					parquet.ValueOf("555 123 4567").Level(0, 1, 0),
   433  					parquet.ValueOf("555 666 1337").Level(1, 1, 0),
   434  				},
   435  				2: { // AddressBook.contacts.name
   436  					parquet.ValueOf("Dmitriy Ryaboy").Level(0, 1, 0),
   437  					parquet.ValueOf("Chris Aniszczyk").Level(1, 1, 0),
   438  				},
   439  				3: { // AddressBook.contacts.phoneNumber
   440  					parquet.ValueOf("555 987 6543").Level(0, 2, 0),
   441  					parquet.ValueOf(nil).Level(1, 1, 0),
   442  				},
   443  			},
   444  		},
   445  	}
   446  
   447  	for _, test := range tests {
   448  		t.Run(test.scenario, func(t *testing.T) {
   449  			schema := parquet.SchemaOf(test.input)
   450  			row := schema.Deconstruct(nil, test.input)
   451  			values := columnsOf(row)
   452  
   453  			t.Logf("\n%s\n", schema)
   454  
   455  			for columnIndex, expect := range test.values {
   456  				assertEqualValues(t, columnIndex, expect, values[columnIndex])
   457  			}
   458  
   459  			newValue := reflect.New(reflect.TypeOf(test.input))
   460  			if err := schema.Reconstruct(newValue.Interface(), row); err != nil {
   461  				t.Errorf("reconstruction of the parquet row into a go value failed:\n\t%v", err)
   462  			} else if !reflect.DeepEqual(newValue.Elem().Interface(), test.input) {
   463  				t.Errorf("reconstruction of the parquet row into a go value produced the wrong output:\nwant = %#v\ngot  = %#v", test.input, newValue.Elem())
   464  			}
   465  
   466  			for columnIndex := range test.values {
   467  				values[columnIndex] = nil
   468  			}
   469  
   470  			for columnIndex, unexpected := range values {
   471  				if unexpected != nil {
   472  					t.Errorf("unexpected column index %d found with %d values in it", columnIndex, len(unexpected))
   473  				}
   474  			}
   475  		})
   476  	}
   477  }
   478  
   479  func columnsOf(row parquet.Row) [][]parquet.Value {
   480  	maxColumnIndex := 0
   481  	for _, value := range row {
   482  		if columnIndex := int(value.Column()); columnIndex > maxColumnIndex {
   483  			maxColumnIndex = columnIndex
   484  		}
   485  	}
   486  	columns := make([][]parquet.Value, maxColumnIndex+1)
   487  	for _, value := range row {
   488  		columnIndex := value.Column()
   489  		columns[columnIndex] = append(columns[columnIndex], value)
   490  	}
   491  	return columns
   492  }
   493  
   494  func assertEqualValues(t *testing.T, columnIndex int, want, got []parquet.Value) {
   495  	n := len(want)
   496  
   497  	if len(want) != len(got) {
   498  		t.Errorf("wrong number of values in column %d: want=%d got=%d", columnIndex, len(want), len(got))
   499  		if len(want) > len(got) {
   500  			n = len(got)
   501  		}
   502  	}
   503  
   504  	for i := 0; i < n; i++ {
   505  		v1, v2 := want[i], got[i]
   506  
   507  		if !parquet.Equal(v1, v2) {
   508  			t.Errorf("values at index %d mismatch in column %d: want=%#v got=%#v", i, columnIndex, v1, v2)
   509  		}
   510  		if columnIndex != int(v2.Column()) {
   511  			t.Errorf("column index mismatch in column %d: want=%d got=%#v", i, columnIndex, v2)
   512  		}
   513  		if v1.RepetitionLevel() != v2.RepetitionLevel() {
   514  			t.Errorf("repetition levels at index %d mismatch in column %d: want=%#v got=%#v", i, columnIndex, v1, v2)
   515  		}
   516  		if v1.DefinitionLevel() != v2.DefinitionLevel() {
   517  			t.Errorf("definition levels at index %d mismatch in column %d: want=%#v got=%#v", i, columnIndex, v1, v2)
   518  		}
   519  	}
   520  }
   521  
   522  func BenchmarkDeconstruct(b *testing.B) {
   523  	row := &AddressBook{
   524  		Owner: "Julien Le Dem",
   525  		OwnerPhoneNumbers: []string{
   526  			"555 123 4567",
   527  			"555 666 1337",
   528  		},
   529  		Contacts: []Contact{
   530  			{
   531  				Name:        "Dmitriy Ryaboy",
   532  				PhoneNumber: "555 987 6543",
   533  			},
   534  			{
   535  				Name: "Chris Aniszczyk",
   536  			},
   537  		},
   538  	}
   539  
   540  	schema := parquet.SchemaOf(row)
   541  	buffer := parquet.Row{}
   542  
   543  	for i := 0; i < b.N; i++ {
   544  		buffer = schema.Deconstruct(buffer[:0], row)
   545  	}
   546  }
   547  
   548  func BenchmarkReconstruct(b *testing.B) {
   549  	row := &AddressBook{
   550  		Owner: "Julien Le Dem",
   551  		OwnerPhoneNumbers: []string{
   552  			"555 123 4567",
   553  			"555 666 1337",
   554  		},
   555  		Contacts: []Contact{
   556  			{
   557  				Name:        "Dmitriy Ryaboy",
   558  				PhoneNumber: "555 987 6543",
   559  			},
   560  			{
   561  				Name: "Chris Aniszczyk",
   562  			},
   563  		},
   564  	}
   565  
   566  	schema := parquet.SchemaOf(row)
   567  	values := schema.Deconstruct(nil, row)
   568  	buffer := AddressBook{}
   569  
   570  	for i := 0; i < b.N; i++ {
   571  		buffer = AddressBook{}
   572  
   573  		if err := schema.Reconstruct(&buffer, values); err != nil {
   574  			b.Fatal(err)
   575  		}
   576  	}
   577  }