github.com/fraugster/parquet-go@v0.12.0/data_store_test.go (about)

     1  package goparquet
     2  
     3  import (
     4  	"testing"
     5  
     6  	"github.com/stretchr/testify/assert"
     7  
     8  	"github.com/stretchr/testify/require"
     9  
    10  	"github.com/fraugster/parquet-go/parquet"
    11  )
    12  
    13  func newIntStore() *ColumnStore {
    14  	d := newStore(&int32Store{ColumnParameters: &ColumnParameters{}, stats: newInt32Stats(), pageStats: newInt32Stats()}, parquet.Encoding_PLAIN, false, nil)
    15  	return d
    16  }
    17  
    18  func TestOneColumn(t *testing.T) {
    19  	row := schema{}
    20  	require.NoError(t, row.AddColumn("DocID", NewDataColumn(newIntStore(), parquet.FieldRepetitionType_REQUIRED)))
    21  	row.resetData()
    22  
    23  	data := []map[string]interface{}{
    24  		{"DocID": int32(10)},
    25  		{"DocID": int32(20)},
    26  	}
    27  
    28  	for i := range data {
    29  		require.NoError(t, row.AddData(data[i]))
    30  	}
    31  	d, err := row.findDataColumn("DocID")
    32  	require.NoError(t, err)
    33  	assert.Equal(t, uint16(0), d.MaxDefinitionLevel())
    34  	assert.Equal(t, uint16(0), d.MaxRepetitionLevel())
    35  	assert.Equal(t, []interface{}{int32(10), int32(20)}, d.data.values.getValues())
    36  	assert.Equal(t, []int32{0, 0}, d.data.dLevels.toArray())
    37  	assert.Equal(t, []int32{0, 0}, d.data.rLevels.toArray())
    38  
    39  	// Now reading data
    40  
    41  	for i := range data {
    42  		read, err := row.getData()
    43  		require.NoError(t, err)
    44  		assert.Equal(t, data[i], read)
    45  	}
    46  }
    47  
    48  func TestOneColumnOptional(t *testing.T) {
    49  	row := schema{}
    50  	require.NoError(t, row.AddColumn("DocID", NewDataColumn(newIntStore(), parquet.FieldRepetitionType_OPTIONAL)))
    51  	row.resetData()
    52  
    53  	data := []map[string]interface{}{
    54  		{"DocID": int32(10)},
    55  		{},
    56  	}
    57  
    58  	for i := range data {
    59  		require.NoError(t, row.AddData(data[i]))
    60  	}
    61  	d, err := row.findDataColumn("DocID")
    62  	require.NoError(t, err)
    63  	assert.Equal(t, uint16(1), d.MaxDefinitionLevel())
    64  	assert.Equal(t, uint16(0), d.MaxRepetitionLevel())
    65  	assert.Equal(t, []interface{}{int32(10)}, d.data.values.getValues())
    66  	assert.Equal(t, []int32{1, 0}, d.data.dLevels.toArray())
    67  	assert.Equal(t, []int32{0, 0}, d.data.rLevels.toArray())
    68  
    69  	for i := range data {
    70  		read, err := row.getData()
    71  		require.NoError(t, err)
    72  		assert.Equal(t, data[i], read)
    73  	}
    74  }
    75  
    76  func TestOneColumnRepeated(t *testing.T) {
    77  	row := schema{}
    78  	require.NoError(t, row.AddColumn("DocID", NewDataColumn(newIntStore(), parquet.FieldRepetitionType_REPEATED)))
    79  	row.resetData()
    80  
    81  	data := []map[string]interface{}{
    82  		{"DocID": []int32{10, 20}},
    83  		{},
    84  	}
    85  
    86  	for i := range data {
    87  		require.NoError(t, row.AddData(data[i]))
    88  	}
    89  	d, err := row.findDataColumn("DocID")
    90  	require.NoError(t, err)
    91  	assert.Equal(t, uint16(1), d.MaxDefinitionLevel())
    92  	assert.Equal(t, uint16(1), d.MaxRepetitionLevel())
    93  	assert.Equal(t, []interface{}{int32(10), int32(20)}, d.data.values.getValues())
    94  	assert.Equal(t, []int32{1, 1, 0}, d.data.dLevels.toArray())
    95  	assert.Equal(t, []int32{0, 1, 0}, d.data.rLevels.toArray())
    96  
    97  	for i := range data {
    98  		read, err := row.getData()
    99  		require.NoError(t, err)
   100  		assert.Equal(t, data[i], read)
   101  	}
   102  }
   103  
   104  func TestComplexPart1(t *testing.T) {
   105  	row := &schema{}
   106  	require.NoError(t, row.AddGroupByPath(ColumnPath{"Name"}, parquet.FieldRepetitionType_REPEATED))
   107  	require.NoError(t, row.AddGroupByPath(ColumnPath{"Name", "Language"}, parquet.FieldRepetitionType_REPEATED))
   108  	require.NoError(t, row.AddColumnByPath(ColumnPath{"Name", "Language", "Code"}, NewDataColumn(newIntStore(), parquet.FieldRepetitionType_REQUIRED)))
   109  	require.NoError(t, row.AddColumnByPath(ColumnPath{"Name", "Language", "Country"}, NewDataColumn(newIntStore(), parquet.FieldRepetitionType_OPTIONAL)))
   110  	require.NoError(t, row.AddColumnByPath(ColumnPath{"Name", "URL"}, NewDataColumn(newIntStore(), parquet.FieldRepetitionType_OPTIONAL)))
   111  
   112  	row.resetData()
   113  
   114  	data := []map[string]interface{}{
   115  		{
   116  			"Name": []map[string]interface{}{
   117  				{
   118  					"Language": []map[string]interface{}{
   119  						{
   120  							"Code":    int32(1),
   121  							"Country": int32(100),
   122  						},
   123  						{
   124  							"Code": int32(2),
   125  						},
   126  					},
   127  					"URL": int32(10),
   128  				},
   129  				{
   130  					"URL": int32(11),
   131  				},
   132  				{
   133  					"Language": []map[string]interface{}{
   134  						{
   135  							"Code":    int32(3),
   136  							"Country": int32(101),
   137  						},
   138  					},
   139  				},
   140  			},
   141  		},
   142  	}
   143  
   144  	for i := range data {
   145  		require.NoError(t, row.AddData(data[i]))
   146  	}
   147  
   148  	d, err := row.findDataColumn("Name.Language.Code")
   149  	require.NoError(t, err)
   150  	assert.Equal(t, uint16(2), d.MaxDefinitionLevel())
   151  	assert.Equal(t, uint16(2), d.MaxRepetitionLevel())
   152  	assert.Equal(t, []interface{}{int32(1), int32(2), int32(3)}, d.data.values.getValues())
   153  	assert.Equal(t, []int32{2, 2, 1, 2}, d.data.dLevels.toArray())
   154  	assert.Equal(t, []int32{0, 2, 1, 1}, d.data.rLevels.toArray())
   155  
   156  	d, err = row.findDataColumn("Name.Language.Country")
   157  	require.NoError(t, err)
   158  	assert.Equal(t, uint16(3), d.MaxDefinitionLevel())
   159  	assert.Equal(t, uint16(2), d.MaxRepetitionLevel())
   160  	assert.Equal(t, []interface{}{int32(100), int32(101)}, d.data.values.getValues())
   161  	assert.Equal(t, []int32{3, 2, 1, 3}, d.data.dLevels.toArray())
   162  	assert.Equal(t, []int32{0, 2, 1, 1}, d.data.rLevels.toArray())
   163  
   164  	d, err = row.findDataColumn("Name.URL")
   165  	require.NoError(t, err)
   166  	assert.Equal(t, uint16(2), d.MaxDefinitionLevel())
   167  	assert.Equal(t, uint16(1), d.MaxRepetitionLevel())
   168  	assert.Equal(t, []interface{}{int32(10), int32(11)}, d.data.values.getValues())
   169  	assert.Equal(t, []int32{2, 2, 1}, d.data.dLevels.toArray())
   170  	assert.Equal(t, []int32{0, 1, 1}, d.data.rLevels.toArray())
   171  
   172  	for i := range data {
   173  		read, err := row.getData()
   174  		require.NoError(t, err)
   175  		assert.Equal(t, data[i], read)
   176  	}
   177  }
   178  
   179  func TestComplexPart2(t *testing.T) {
   180  	row := &schema{}
   181  	require.NoError(t, row.AddGroupByPath(ColumnPath{"Links"}, parquet.FieldRepetitionType_OPTIONAL))
   182  	require.NoError(t, row.AddColumnByPath(ColumnPath{"Links", "Backward"}, NewDataColumn(newIntStore(), parquet.FieldRepetitionType_REPEATED)))
   183  	require.NoError(t, row.AddColumnByPath(ColumnPath{"Links", "Forward"}, NewDataColumn(newIntStore(), parquet.FieldRepetitionType_REPEATED)))
   184  	row.resetData()
   185  
   186  	data := []map[string]interface{}{
   187  		{
   188  			"Links": map[string]interface{}{
   189  				"Forward": []int32{20, 40, 60},
   190  			},
   191  		},
   192  		{
   193  			"Links": map[string]interface{}{
   194  				"Backward": []int32{10, 30},
   195  				"Forward":  []int32{80},
   196  			},
   197  		},
   198  	}
   199  
   200  	for i := range data {
   201  		require.NoError(t, row.AddData(data[i]))
   202  	}
   203  
   204  	d, err := row.findDataColumn("Links.Forward")
   205  	require.NoError(t, err)
   206  	assert.Equal(t, uint16(2), d.MaxDefinitionLevel())
   207  	assert.Equal(t, uint16(1), d.MaxRepetitionLevel())
   208  	assert.Equal(t, []interface{}{int32(20), int32(40), int32(60), int32(80)}, d.data.values.getValues())
   209  	assert.Equal(t, []int32{2, 2, 2, 2}, d.data.dLevels.toArray())
   210  	assert.Equal(t, []int32{0, 1, 1, 0}, d.data.rLevels.toArray())
   211  
   212  	d, err = row.findDataColumn("Links.Backward")
   213  	require.NoError(t, err)
   214  	assert.Equal(t, uint16(2), d.MaxDefinitionLevel())
   215  	assert.Equal(t, uint16(1), d.MaxRepetitionLevel())
   216  	assert.Equal(t, []interface{}{int32(10), int32(30)}, d.data.values.getValues())
   217  	assert.Equal(t, []int32{1, 2, 2}, d.data.dLevels.toArray())
   218  	assert.Equal(t, []int32{0, 0, 1}, d.data.rLevels.toArray())
   219  
   220  	for i := range data {
   221  		read, err := row.getData()
   222  		require.NoError(t, err)
   223  		assert.Equal(t, data[i], read)
   224  	}
   225  }
   226  
   227  func TestComplex(t *testing.T) {
   228  	// Based on this picture https://i.stack.imgur.com/raOFu.png from this doc https://static.googleusercontent.com/media/research.google.com/en//pubs/archive/36632.pdf
   229  	row := &schema{}
   230  	require.NoError(t, row.AddColumn("DocId", NewDataColumn(newIntStore(), parquet.FieldRepetitionType_REQUIRED)))
   231  	require.NoError(t, row.AddGroupByPath(ColumnPath{"Links"}, parquet.FieldRepetitionType_OPTIONAL))
   232  	require.NoError(t, row.AddColumn("Links.Backward", NewDataColumn(newIntStore(), parquet.FieldRepetitionType_REPEATED)))
   233  	require.NoError(t, row.AddColumn("Links.Forward", NewDataColumn(newIntStore(), parquet.FieldRepetitionType_REPEATED)))
   234  	require.NoError(t, row.AddGroupByPath(ColumnPath{"Name"}, parquet.FieldRepetitionType_REPEATED))
   235  	require.NoError(t, row.AddGroupByPath(ColumnPath{"Name", "Language"}, parquet.FieldRepetitionType_REPEATED))
   236  	require.NoError(t, row.AddColumn("Name.Language.Code", NewDataColumn(newIntStore(), parquet.FieldRepetitionType_REQUIRED)))
   237  	require.NoError(t, row.AddColumn("Name.Language.Country", NewDataColumn(newIntStore(), parquet.FieldRepetitionType_OPTIONAL)))
   238  	require.NoError(t, row.AddColumn("Name.URL", NewDataColumn(newIntStore(), parquet.FieldRepetitionType_OPTIONAL)))
   239  	row.resetData()
   240  
   241  	data := []map[string]interface{}{
   242  		{
   243  			"DocId": int32(10),
   244  			"Links": map[string]interface{}{
   245  				"Forward": []int32{20, 40, 60},
   246  			},
   247  			"Name": []map[string]interface{}{
   248  				{
   249  					"Language": []map[string]interface{}{
   250  						{
   251  							"Code":    int32(1),
   252  							"Country": int32(100),
   253  						},
   254  						{
   255  							"Code": int32(2),
   256  						},
   257  					},
   258  					"URL": int32(10),
   259  				},
   260  				{
   261  					"URL": int32(11),
   262  				},
   263  				{
   264  					"Language": []map[string]interface{}{
   265  						{
   266  							"Code":    int32(3),
   267  							"Country": int32(101),
   268  						},
   269  					},
   270  				},
   271  			},
   272  		},
   273  		{
   274  			"DocId": int32(20),
   275  			"Links": map[string]interface{}{
   276  				"Backward": []int32{10, 30},
   277  				"Forward":  []int32{80},
   278  			},
   279  			"Name": []map[string]interface{}{
   280  				{
   281  					"URL": int32(12),
   282  				},
   283  			},
   284  		},
   285  	}
   286  
   287  	for i := range data {
   288  		require.NoError(t, row.AddData(data[i]))
   289  	}
   290  
   291  	d, err := row.findDataColumn("DocId")
   292  	require.NoError(t, err)
   293  	assert.Equal(t, uint16(0), d.MaxDefinitionLevel())
   294  	assert.Equal(t, uint16(0), d.MaxRepetitionLevel())
   295  	assert.Equal(t, []interface{}{int32(10), int32(20)}, d.data.values.getValues())
   296  	assert.Equal(t, []int32{0, 0}, d.data.dLevels.toArray())
   297  	assert.Equal(t, []int32{0, 0}, d.data.rLevels.toArray())
   298  
   299  	d, err = row.findDataColumn("Name.URL")
   300  	require.NoError(t, err)
   301  	assert.Equal(t, uint16(2), d.MaxDefinitionLevel())
   302  	assert.Equal(t, uint16(1), d.MaxRepetitionLevel())
   303  	assert.Equal(t, []interface{}{int32(10), int32(11), int32(12)}, d.data.values.getValues())
   304  	assert.Equal(t, []int32{2, 2, 1, 2}, d.data.dLevels.toArray())
   305  	assert.Equal(t, []int32{0, 1, 1, 0}, d.data.rLevels.toArray())
   306  
   307  	d, err = row.findDataColumn("Links.Forward")
   308  	require.NoError(t, err)
   309  	assert.Equal(t, uint16(2), d.MaxDefinitionLevel())
   310  	assert.Equal(t, uint16(1), d.MaxRepetitionLevel())
   311  	assert.Equal(t, []interface{}{int32(20), int32(40), int32(60), int32(80)}, d.data.values.getValues())
   312  	assert.Equal(t, []int32{2, 2, 2, 2}, d.data.dLevels.toArray())
   313  	assert.Equal(t, []int32{0, 1, 1, 0}, d.data.rLevels.toArray())
   314  
   315  	d, err = row.findDataColumn("Links.Backward")
   316  	require.NoError(t, err)
   317  	assert.Equal(t, uint16(2), d.MaxDefinitionLevel())
   318  	assert.Equal(t, uint16(1), d.MaxRepetitionLevel())
   319  	assert.Equal(t, []interface{}{int32(10), int32(30)}, d.data.values.getValues())
   320  	assert.Equal(t, []int32{1, 2, 2}, d.data.dLevels.toArray())
   321  	assert.Equal(t, []int32{0, 0, 1}, d.data.rLevels.toArray())
   322  
   323  	d, err = row.findDataColumn("Name.Language.Country")
   324  	require.NoError(t, err)
   325  	assert.Equal(t, uint16(3), d.MaxDefinitionLevel())
   326  	assert.Equal(t, uint16(2), d.MaxRepetitionLevel())
   327  	assert.Equal(t, []interface{}{int32(100), int32(101)}, d.data.values.getValues())
   328  	assert.Equal(t, []int32{3, 2, 1, 3, 1}, d.data.dLevels.toArray())
   329  	assert.Equal(t, []int32{0, 2, 1, 1, 0}, d.data.rLevels.toArray())
   330  
   331  	d, err = row.findDataColumn("Name.Language.Code")
   332  	require.NoError(t, err)
   333  	assert.Equal(t, uint16(2), d.MaxDefinitionLevel())
   334  	assert.Equal(t, uint16(2), d.MaxRepetitionLevel())
   335  	assert.Equal(t, []interface{}{int32(1), int32(2), int32(3)}, d.data.values.getValues())
   336  	assert.Equal(t, []int32{2, 2, 1, 2, 1}, d.data.dLevels.toArray())
   337  	assert.Equal(t, []int32{0, 2, 1, 1, 0}, d.data.rLevels.toArray())
   338  
   339  	for i := range data {
   340  		read, err := row.getData()
   341  		require.NoError(t, err)
   342  		assert.Equal(t, data[i], read)
   343  	}
   344  }
   345  
   346  func TestTwitterBlog(t *testing.T) {
   347  	// Sample from here https://blog.twitter.com/engineering/en_us/a/2013/dremel-made-simple-with-parquet.html
   348  	row := &schema{}
   349  	require.NoError(t, row.AddGroupByPath(ColumnPath{"level1"}, parquet.FieldRepetitionType_REPEATED))
   350  	require.NoError(t, row.AddColumn("level1.level2", NewDataColumn(newIntStore(), parquet.FieldRepetitionType_REPEATED)))
   351  	row.resetData()
   352  
   353  	data := []map[string]interface{}{
   354  		{
   355  			"level1": []map[string]interface{}{
   356  				{"level2": []int32{1, 2, 3}},
   357  				{"level2": []int32{4, 5, 6, 7}},
   358  			},
   359  		},
   360  		{
   361  			"level1": []map[string]interface{}{
   362  				{"level2": []int32{8}},
   363  				{"level2": []int32{9, 10}},
   364  			},
   365  		},
   366  	}
   367  
   368  	for i := range data {
   369  		require.NoError(t, row.AddData(data[i]))
   370  	}
   371  
   372  	d, err := row.findDataColumn("level1.level2")
   373  	require.NoError(t, err)
   374  	var expected []interface{}
   375  	for i := 1; i < 11; i++ {
   376  		expected = append(expected, int32(i))
   377  	}
   378  	assert.Equal(t, expected, d.data.values.getValues())
   379  	assert.Equal(t, uint16(2), d.MaxDefinitionLevel())
   380  	assert.Equal(t, uint16(2), d.MaxRepetitionLevel())
   381  	assert.Equal(t, []int32{0, 2, 2, 1, 2, 2, 2, 0, 1, 2}, d.data.rLevels.toArray())
   382  	assert.Equal(t, []int32{2, 2, 2, 2, 2, 2, 2, 2, 2, 2}, d.data.dLevels.toArray())
   383  
   384  	for i := range data {
   385  		read, err := row.getData()
   386  		require.NoError(t, err)
   387  		assert.Equal(t, data[i], read)
   388  	}
   389  }
   390  
   391  func TestEmptyParent(t *testing.T) {
   392  	elementStore, err := NewInt32Store(parquet.Encoding_PLAIN, true, &ColumnParameters{})
   393  	require.NoError(t, err, "failed to create elementStore")
   394  
   395  	elementCol := NewDataColumn(elementStore, parquet.FieldRepetitionType_REQUIRED)
   396  	list, err := NewListColumn(elementCol, parquet.FieldRepetitionType_OPTIONAL)
   397  	require.NoError(t, err)
   398  
   399  	row := &schema{}
   400  	require.NoError(t, row.AddColumn("baz", list))
   401  	row.resetData()
   402  	data := []map[string]interface{}{
   403  		{
   404  			"baz": map[string]interface{}{},
   405  		},
   406  	}
   407  
   408  	for i := range data {
   409  		require.NoError(t, row.AddData(data[i]))
   410  	}
   411  
   412  	col, err := row.findDataColumn("baz.list.element")
   413  	require.NoError(t, err)
   414  
   415  	assert.Equal(t, []interface{}(nil), col.data.values.getValues())
   416  
   417  	assert.Equal(t, uint16(2), col.MaxDefinitionLevel())
   418  	assert.Equal(t, uint16(1), col.MaxRepetitionLevel())
   419  	require.Equal(t, []int32{0}, col.data.rLevels.toArray())
   420  	require.Equal(t, []int32{1}, col.data.dLevels.toArray())
   421  
   422  	for i := range data {
   423  		read, err := row.getData()
   424  		require.NoError(t, err)
   425  		assert.Equal(t, data[i], read)
   426  	}
   427  }
   428  
   429  func TestZeroRL(t *testing.T) {
   430  	row := &schema{}
   431  	//message test_msg {
   432  	//		required group baz (LIST) {
   433  	//			repeated group list {
   434  	//				required group element {
   435  	//					required int64 quux;
   436  	//				}
   437  	//			}
   438  	//		}
   439  	//	}
   440  	require.NoError(t, row.AddGroupByPath(ColumnPath{"baz"}, parquet.FieldRepetitionType_REQUIRED))
   441  	require.NoError(t, row.AddGroupByPath(ColumnPath{"baz", "list"}, parquet.FieldRepetitionType_REPEATED))
   442  	require.NoError(t, row.AddGroupByPath(ColumnPath{"baz", "list", "element"}, parquet.FieldRepetitionType_REQUIRED))
   443  	require.NoError(t, row.AddColumn("baz.list.element.quux", NewDataColumn(newIntStore(), parquet.FieldRepetitionType_REQUIRED)))
   444  	row.resetData()
   445  
   446  	data := map[string]interface{}{
   447  		"baz": map[string]interface{}{
   448  			"list": []map[string]interface{}{
   449  				{
   450  					"element": map[string]interface{}{
   451  						"quux": int32(23),
   452  					},
   453  				},
   454  				{
   455  					"element": map[string]interface{}{
   456  						"quux": int32(42),
   457  					},
   458  				},
   459  			},
   460  		},
   461  	}
   462  
   463  	require.NoError(t, row.AddData(data))
   464  
   465  	d, err := row.findDataColumn("baz.list.element.quux")
   466  	require.NoError(t, err)
   467  	var expected = []interface{}{int32(23), int32(42)}
   468  	assert.Equal(t, expected, d.data.values.getValues())
   469  	assert.Equal(t, uint16(1), d.MaxDefinitionLevel())
   470  	assert.Equal(t, uint16(1), d.MaxRepetitionLevel())
   471  	assert.Equal(t, []int32{0, 1}, d.data.rLevels.toArray())
   472  	assert.Equal(t, []int32{1, 1}, d.data.dLevels.toArray())
   473  
   474  	read, err := row.getData()
   475  	require.NoError(t, err)
   476  	assert.Equal(t, data, read)
   477  
   478  	row = &schema{}
   479  	require.NoError(t, row.AddGroupByPath(ColumnPath{"baz"}, parquet.FieldRepetitionType_REQUIRED))
   480  	require.NoError(t, row.AddGroupByPath(ColumnPath{"baz", "list"}, parquet.FieldRepetitionType_REPEATED))
   481  	require.NoError(t, row.AddGroupByPath(ColumnPath{"baz", "list", "element"}, parquet.FieldRepetitionType_REQUIRED))
   482  	require.NoError(t, row.AddColumn("baz.list.element.quux", NewDataColumn(newIntStore(), parquet.FieldRepetitionType_OPTIONAL)))
   483  	row.resetData()
   484  	require.NoError(t, row.AddData(data))
   485  
   486  	d, err = row.findDataColumn("baz.list.element.quux")
   487  	require.NoError(t, err)
   488  	assert.Equal(t, expected, d.data.values.getValues())
   489  	assert.Equal(t, uint16(2), d.MaxDefinitionLevel())
   490  	assert.Equal(t, uint16(1), d.MaxRepetitionLevel())
   491  	assert.Equal(t, []int32{0, 1}, d.data.rLevels.toArray())
   492  	assert.Equal(t, []int32{2, 2}, d.data.dLevels.toArray())
   493  
   494  	read, err = row.getData()
   495  	require.NoError(t, err)
   496  	assert.Equal(t, data, read)
   497  }