github.com/segmentio/parquet-go@v0.0.0-20230712180008-5d42db8f0d47/row_builder_test.go (about)

     1  package parquet_test
     2  
     3  import (
     4  	"fmt"
     5  	"testing"
     6  
     7  	"github.com/segmentio/parquet-go"
     8  )
     9  
    10  func ExampleRowBuilder() {
    11  	builder := parquet.NewRowBuilder(parquet.Group{
    12  		"birth_date": parquet.Optional(parquet.Date()),
    13  		"first_name": parquet.String(),
    14  		"last_name":  parquet.String(),
    15  	})
    16  
    17  	builder.Add(1, parquet.ByteArrayValue([]byte("Luke")))
    18  	builder.Add(2, parquet.ByteArrayValue([]byte("Skywalker")))
    19  
    20  	row := builder.Row()
    21  	row.Range(func(columnIndex int, columnValues []parquet.Value) bool {
    22  		fmt.Printf("%+v\n", columnValues[0])
    23  		return true
    24  	})
    25  
    26  	// Output:
    27  	// C:0 D:0 R:0 V:<null>
    28  	// C:1 D:0 R:0 V:Luke
    29  	// C:2 D:0 R:0 V:Skywalker
    30  }
    31  
    32  func TestRowBuilder(t *testing.T) {
    33  	type (
    34  		operation  = func(*parquet.RowBuilder)
    35  		operations = []operation
    36  	)
    37  
    38  	add := func(columnIndex int, columnValue parquet.Value) operation {
    39  		return func(b *parquet.RowBuilder) { b.Add(columnIndex, columnValue) }
    40  	}
    41  
    42  	next := func(columnIndex int) operation {
    43  		return func(b *parquet.RowBuilder) { b.Next(columnIndex) }
    44  	}
    45  
    46  	tests := []struct {
    47  		scenario   string
    48  		operations operations
    49  		want       parquet.Row
    50  		schema     parquet.Node
    51  	}{
    52  		{
    53  			scenario: "add missing required column value",
    54  			want: parquet.Row{
    55  				parquet.Int64Value(0).Level(0, 0, 0),
    56  			},
    57  			schema: parquet.Group{
    58  				"id": parquet.Int(64),
    59  			},
    60  		},
    61  
    62  		{
    63  			scenario: "set required column value",
    64  			operations: operations{
    65  				add(0, parquet.Int64Value(1)),
    66  			},
    67  			want: parquet.Row{
    68  				parquet.Int64Value(1).Level(0, 0, 0),
    69  			},
    70  			schema: parquet.Group{
    71  				"id": parquet.Int(64),
    72  			},
    73  		},
    74  
    75  		{
    76  			scenario: "set repeated column values",
    77  			operations: operations{
    78  				add(0, parquet.Int64Value(1)),
    79  				add(1, parquet.ByteArrayValue([]byte(`1`))),
    80  				add(1, parquet.ByteArrayValue([]byte(`2`))),
    81  				add(1, parquet.ByteArrayValue([]byte(`3`))),
    82  			},
    83  			want: parquet.Row{
    84  				parquet.Int64Value(1).Level(0, 0, 0),
    85  				parquet.ByteArrayValue([]byte(`1`)).Level(0, 1, 1),
    86  				parquet.ByteArrayValue([]byte(`2`)).Level(1, 1, 1),
    87  				parquet.ByteArrayValue([]byte(`3`)).Level(1, 1, 1),
    88  			},
    89  			schema: parquet.Group{
    90  				"id":    parquet.Int(64),
    91  				"names": parquet.Repeated(parquet.String()),
    92  			},
    93  		},
    94  
    95  		{
    96  			scenario: "add missing repeated column value",
    97  			operations: operations{
    98  				add(0, parquet.Int64Value(1)),
    99  			},
   100  			want: parquet.Row{
   101  				parquet.Int64Value(1).Level(0, 0, 0),
   102  				parquet.NullValue().Level(0, 0, 1),
   103  			},
   104  			schema: parquet.Group{
   105  				"id":    parquet.Int(64),
   106  				"names": parquet.Repeated(parquet.String()),
   107  			},
   108  		},
   109  
   110  		{
   111  			scenario: "add missing optional column value",
   112  			operations: operations{
   113  				add(0, parquet.Int64Value(1)),
   114  			},
   115  			want: parquet.Row{
   116  				parquet.Int64Value(1).Level(0, 0, 0),
   117  				parquet.NullValue().Level(0, 0, 1),
   118  			},
   119  			schema: parquet.Group{
   120  				"id":   parquet.Int(64),
   121  				"name": parquet.Optional(parquet.String()),
   122  			},
   123  		},
   124  
   125  		{
   126  			scenario: "add missing nested column values",
   127  			operations: operations{
   128  				add(0, parquet.Int64Value(1)),
   129  			},
   130  			want: parquet.Row{
   131  				parquet.Int64Value(1).Level(0, 0, 0),
   132  				parquet.NullValue().Level(0, 0, 1),
   133  				parquet.ByteArrayValue(nil).Level(0, 0, 2),
   134  				parquet.ByteArrayValue(nil).Level(0, 0, 3),
   135  			},
   136  			schema: parquet.Group{
   137  				"id": parquet.Int(64),
   138  				"profile": parquet.Group{
   139  					"first_name": parquet.String(),
   140  					"last_name":  parquet.String(),
   141  					"birth_date": parquet.Optional(parquet.Date()),
   142  				},
   143  			},
   144  		},
   145  
   146  		{
   147  			scenario: "add missing repeated column group",
   148  			operations: operations{
   149  				add(0, parquet.Int64Value(1)),
   150  				add(2, parquet.ByteArrayValue([]byte(`me`))),
   151  				add(1, parquet.Int32Value(0)),
   152  				add(1, parquet.Int32Value(123456)),
   153  				add(2, parquet.ByteArrayValue([]byte(`you`))),
   154  			},
   155  			want: parquet.Row{
   156  				parquet.Int64Value(1).Level(0, 0, 0),
   157  
   158  				parquet.Int32Value(0).Level(0, 2, 1),
   159  				parquet.Int32Value(123456).Level(1, 2, 1),
   160  
   161  				parquet.ByteArrayValue([]byte(`me`)).Level(0, 1, 2),
   162  				parquet.ByteArrayValue([]byte(`you`)).Level(1, 1, 2),
   163  
   164  				parquet.NullValue().Level(0, 1, 3),
   165  				parquet.NullValue().Level(1, 1, 3),
   166  			},
   167  			schema: parquet.Group{
   168  				"id": parquet.Int(64),
   169  				"profiles": parquet.Repeated(parquet.Group{
   170  					"first_name": parquet.String(),
   171  					"last_name":  parquet.String(),
   172  					"birth_date": parquet.Optional(parquet.Date()),
   173  				}),
   174  			},
   175  		},
   176  
   177  		{
   178  			scenario: "empty map",
   179  			want: parquet.Row{
   180  				parquet.Value{}.Level(0, 0, 0),
   181  				parquet.Value{}.Level(0, 0, 1),
   182  			},
   183  			schema: parquet.Group{
   184  				"map": parquet.Repeated(parquet.Group{
   185  					"key_value": parquet.Group{
   186  						"key":   parquet.String(),
   187  						"value": parquet.Optional(parquet.String()),
   188  					},
   189  				}),
   190  			},
   191  		},
   192  
   193  		{
   194  			scenario: "one nested maps",
   195  			operations: operations{
   196  				add(0, parquet.ByteArrayValue([]byte(`A`))),
   197  				add(1, parquet.ByteArrayValue([]byte(`1`))),
   198  				add(0, parquet.ByteArrayValue([]byte(`B`))),
   199  				add(1, parquet.ByteArrayValue([]byte(`2`))),
   200  			},
   201  			want: parquet.Row{
   202  				// objects.attributes.key_value.key
   203  				parquet.ByteArrayValue([]byte(`A`)).Level(0, 2, 0),
   204  				parquet.ByteArrayValue([]byte(`B`)).Level(2, 2, 0),
   205  				// objects.attributes.key_value.value
   206  				parquet.ByteArrayValue([]byte(`1`)).Level(0, 3, 1),
   207  				parquet.ByteArrayValue([]byte(`2`)).Level(2, 3, 1),
   208  			},
   209  			schema: parquet.Group{
   210  				"objects": parquet.Repeated(parquet.Group{
   211  					"attributes": parquet.Repeated(parquet.Group{
   212  						"key_value": parquet.Group{
   213  							"key":   parquet.String(),
   214  							"value": parquet.Optional(parquet.String()),
   215  						},
   216  					}),
   217  				}),
   218  			},
   219  		},
   220  
   221  		{
   222  			scenario: "multiple nested maps",
   223  			operations: operations{
   224  				add(0, parquet.ByteArrayValue([]byte(`A`))),
   225  				add(1, parquet.ByteArrayValue([]byte(`1`))),
   226  				add(0, parquet.ByteArrayValue([]byte(`B`))),
   227  				add(1, parquet.ByteArrayValue([]byte(`2`))),
   228  				next(1), // same as next(0) because the columns are in the same group
   229  				add(0, parquet.ByteArrayValue([]byte(`C`))),
   230  				add(1, parquet.ByteArrayValue([]byte(`3`))),
   231  			},
   232  			want: parquet.Row{
   233  				// objects.attributes.key_value.key
   234  				parquet.ByteArrayValue([]byte(`A`)).Level(0, 2, 0),
   235  				parquet.ByteArrayValue([]byte(`B`)).Level(2, 2, 0),
   236  				parquet.ByteArrayValue([]byte(`C`)).Level(1, 2, 0),
   237  				// objects.attributes.key_value.value
   238  				parquet.ByteArrayValue([]byte(`1`)).Level(0, 3, 1),
   239  				parquet.ByteArrayValue([]byte(`2`)).Level(2, 3, 1),
   240  				parquet.ByteArrayValue([]byte(`3`)).Level(1, 3, 1),
   241  			},
   242  			schema: parquet.Group{
   243  				"objects": parquet.Repeated(parquet.Group{
   244  					"attributes": parquet.Repeated(parquet.Group{
   245  						"key_value": parquet.Group{
   246  							"key":   parquet.String(),
   247  							"value": parquet.Optional(parquet.String()),
   248  						},
   249  					}),
   250  				}),
   251  			},
   252  		},
   253  	}
   254  
   255  	for _, test := range tests {
   256  		t.Run(test.scenario, func(t *testing.T) {
   257  			b := parquet.NewRowBuilder(test.schema)
   258  
   259  			for i := 0; i < 2; i++ {
   260  				for _, op := range test.operations {
   261  					op(b)
   262  				}
   263  
   264  				if got := b.Row(); !got.Equal(test.want) {
   265  					t.Fatalf("test %d: rows are not equal\nwant = %+v\ngot  = %+v", i+1, test.want, got)
   266  				}
   267  
   268  				b.Reset()
   269  			}
   270  		})
   271  	}
   272  }
   273  
   274  func BenchmarkRowBuilderAdd(b *testing.B) {
   275  	builder := parquet.NewRowBuilder(parquet.Group{
   276  		"ids": parquet.Repeated(parquet.Int(64)),
   277  	})
   278  
   279  	for i := 0; i < b.N; i++ {
   280  		builder.Add(0, parquet.Int64Value(int64(i)))
   281  
   282  		if (i % 128) == 0 {
   283  			builder.Reset() // so don't run out of memory ;)
   284  		}
   285  	}
   286  }