github.com/fraugster/parquet-go@v0.12.0/cmd/csv2parquet/main_test.go

github.com/fraugster/parquet-go@v0.12.0/cmd/csv2parquet/main_test.go (about)

     1  package main
     2  
     3  import (
     4  	"bytes"
     5  	"testing"
     6  
     7  	goparquet "github.com/fraugster/parquet-go"
     8  	"github.com/fraugster/parquet-go/parquet"
     9  	"github.com/stretchr/testify/assert"
    10  	"github.com/stretchr/testify/require"
    11  )
    12  
    13  func TestParseTypeHints(t *testing.T) {
    14  	tests := map[string]struct {
    15  		Input          string
    16  		ExpectedOutput map[string]string
    17  		ExpectErr      bool
    18  	}{
    19  		"simple": {
    20  			Input:          "foo=boolean,bar=string",
    21  			ExpectedOutput: map[string]string{"foo": "boolean", "bar": "string"},
    22  		},
    23  		"simply-with-spaces": {
    24  			Input: "   foo  =  boolean ,	bar=string	 ",
    25  			ExpectedOutput: map[string]string{"foo": "boolean", "bar": "string"},
    26  		},
    27  		"empty": {
    28  			Input:          "",
    29  			ExpectedOutput: map[string]string{},
    30  		},
    31  		"invalid-type": {
    32  			Input:     "foo=invalid-type",
    33  			ExpectErr: true,
    34  		},
    35  		"invalid-field": {
    36  			Input:     "foo=boolean=invalid",
    37  			ExpectErr: true,
    38  		},
    39  	}
    40  
    41  	for testName, tt := range tests {
    42  		t.Run(testName, func(t *testing.T) {
    43  			output, err := parseTypeHints(tt.Input)
    44  			if tt.ExpectErr {
    45  				assert.Error(t, err)
    46  			} else {
    47  				assert.NoError(t, err)
    48  				assert.Equal(t, tt.ExpectedOutput, output)
    49  			}
    50  		})
    51  	}
    52  }
    53  
    54  func TestTypeHandlers(t *testing.T) {
    55  	tests := map[string]struct {
    56  		Input          string
    57  		Func           func(string) (interface{}, error)
    58  		ExpectedOutput interface{}
    59  		ExpectErr      bool
    60  	}{
    61  		"byte-array":        {"hello", byteArrayHandler, []byte("hello"), false},
    62  		"boolean-true":      {"true", booleanHandler, true, false},
    63  		"boolean-false":     {"false", booleanHandler, false, false},
    64  		"boolean-invalid":   {"invalid", booleanHandler, false, true},
    65  		"bool-UPPERCASE":    {"TRUE", booleanHandler, true, false},
    66  		"bool-num-1":        {"1", booleanHandler, true, false},
    67  		"bool-num-0":        {"0", booleanHandler, false, false},
    68  		"uint-32":           {"1234", uintHandler(32), uint32(1234), false},
    69  		"uint-invalid":      {"hello!", uintHandler(32), 0, true},
    70  		"uint-invalid-bits": {"1234", uintHandler(28), 0, true},
    71  		"uint-64":           {"1000000000000", uintHandler(64), uint64(1000000000000), false},
    72  		"int-32":            {"-1234", intHandler(32), int32(-1234), false},
    73  		"int-invalid":       {"goodbye!", intHandler(32), 0, true},
    74  		"int-invalid-bits":  {"1234", intHandler(42), 0, true},
    75  		"int-64":            {"1000000000000", intHandler(64), int64(1000000000000), false},
    76  		"float":             {"3.4", floatHandler, float32(3.4), false},
    77  		"double":            {"4.2", doubleHandler, float64(4.2), false},
    78  		"json-simple":       {`{"hello":"world"}`, jsonHandler, []byte(`{"hello":"world"}`), false},
    79  		"json-invalid":      {`{"hello":"world`, jsonHandler, nil, true},
    80  	}
    81  
    82  	for testName, tt := range tests {
    83  		t.Run(testName, func(t *testing.T) {
    84  			output, err := tt.Func(tt.Input)
    85  			if tt.ExpectErr {
    86  				require.Error(t, err)
    87  			} else {
    88  				require.NoError(t, err)
    89  				require.Equal(t, tt.ExpectedOutput, output)
    90  			}
    91  		})
    92  	}
    93  }
    94  
    95  func TestCreateColumn(t *testing.T) {
    96  	tests := map[string]struct {
    97  		Field                 string
    98  		Type                  string
    99  		ExpectErr             bool
   100  		ExpectedType          parquet.Type
   101  		ExpectedLogicalType   *parquet.LogicalType
   102  		ExpectedConvertedType *parquet.ConvertedType
   103  	}{
   104  		"simple-boolean": {
   105  			Field:        "foo",
   106  			Type:         "boolean",
   107  			ExpectErr:    false,
   108  			ExpectedType: parquet.Type_BOOLEAN,
   109  		},
   110  		"simple-byte-array": {
   111  			Field:        "foo",
   112  			Type:         "byte_array",
   113  			ExpectErr:    false,
   114  			ExpectedType: parquet.Type_BYTE_ARRAY,
   115  		},
   116  		"simple-float": {
   117  			Field:        "foo",
   118  			Type:         "float",
   119  			ExpectErr:    false,
   120  			ExpectedType: parquet.Type_FLOAT,
   121  		},
   122  		"simple-double": {
   123  			Field:        "foo",
   124  			Type:         "double",
   125  			ExpectErr:    false,
   126  			ExpectedType: parquet.Type_DOUBLE,
   127  		},
   128  		"invalid-type": {
   129  			Field:     "foo",
   130  			Type:      "invalid",
   131  			ExpectErr: true,
   132  		},
   133  		"string": {
   134  			Field:                 "foo",
   135  			Type:                  "string",
   136  			ExpectErr:             false,
   137  			ExpectedType:          parquet.Type_BYTE_ARRAY,
   138  			ExpectedLogicalType:   &parquet.LogicalType{STRING: &parquet.StringType{}},
   139  			ExpectedConvertedType: parquet.ConvertedTypePtr(parquet.ConvertedType_UTF8),
   140  		},
   141  	}
   142  
   143  	for testName, tt := range tests {
   144  		t.Run(testName, func(t *testing.T) {
   145  			col, _, err := createColumn(tt.Field, tt.Type)
   146  			if tt.ExpectErr {
   147  				require.Error(t, err)
   148  			} else {
   149  				require.NoError(t, err)
   150  				require.Equal(t, tt.Field, col.SchemaElement.Name)
   151  				require.Equal(t, tt.ExpectedType, *col.SchemaElement.Type)
   152  				if tt.ExpectedLogicalType != nil {
   153  					require.Equal(t, tt.ExpectedLogicalType, col.SchemaElement.LogicalType)
   154  				}
   155  				if tt.ExpectedConvertedType != nil {
   156  					require.Equal(t, tt.ExpectedConvertedType, col.SchemaElement.ConvertedType)
   157  				}
   158  			}
   159  		})
   160  	}
   161  }
   162  
   163  func TestDeriveSchema(t *testing.T) {
   164  	tests := map[string]struct {
   165  		Header         []string
   166  		Types          map[string]string
   167  		ExpectErr      bool
   168  		ExpectedSchema string
   169  	}{
   170  		"single-boolean": {
   171  			Header:         []string{"foo"},
   172  			Types:          map[string]string{"foo": "boolean"},
   173  			ExpectedSchema: "message msg {\n  optional boolean foo;\n}\n",
   174  		},
   175  		"all-uints": {
   176  			Header: []string{"a", "b", "c", "d"},
   177  			Types:  map[string]string{"a": "uint8", "b": "uint16", "c": "uint32", "d": "uint64"},
   178  			ExpectedSchema: `message msg {
   179    optional int32 a (INT(8, false));
   180    optional int32 b (INT(16, false));
   181    optional int32 c (INT(32, false));
   182    optional int64 d (INT(64, false));
   183  }
   184  `,
   185  		},
   186  		"all-ints": {
   187  			Header: []string{"a", "b", "c", "d", "e"},
   188  			Types:  map[string]string{"a": "int8", "b": "int16", "c": "int32", "d": "int64", "e": "int"},
   189  			ExpectedSchema: `message msg {
   190    optional int32 a (INT(8, true));
   191    optional int32 b (INT(16, true));
   192    optional int32 c (INT(32, true));
   193    optional int64 d (INT(64, true));
   194    optional int64 e (INT(64, true));
   195  }
   196  `,
   197  		},
   198  		"string": {
   199  			Header: []string{"x"},
   200  			Types:  map[string]string{"x": "string"},
   201  			ExpectedSchema: `message msg {
   202    optional binary x (STRING);
   203  }
   204  `,
   205  		},
   206  		"json": {
   207  			Header: []string{"x"},
   208  			Types:  map[string]string{"x": "json"},
   209  			ExpectedSchema: `message msg {
   210    optional binary x (JSON);
   211  }
   212  `,
   213  		},
   214  		"default-type": {
   215  			Header: []string{"foobar"},
   216  			Types:  map[string]string{},
   217  			ExpectedSchema: `message msg {
   218    optional binary foobar (STRING);
   219  }
   220  `,
   221  		},
   222  		"invalid-type": {
   223  			Header:    []string{"foobar"},
   224  			Types:     map[string]string{"foobar": "invalid"},
   225  			ExpectErr: true,
   226  		},
   227  	}
   228  
   229  	for testName, tt := range tests {
   230  		t.Run(testName, func(t *testing.T) {
   231  			schema, _, err := deriveSchema(tt.Header, tt.Types)
   232  			if tt.ExpectErr {
   233  				require.Error(t, err)
   234  			} else {
   235  				require.NoError(t, err)
   236  				require.Equal(t, tt.ExpectedSchema, schema.String())
   237  			}
   238  		})
   239  	}
   240  }
   241  
   242  func TestWriteParquetData(t *testing.T) {
   243  	tests := map[string]struct {
   244  		Header         []string
   245  		Types          map[string]string
   246  		Records        [][]string
   247  		ExpectErr      bool
   248  		ExpectedSchema string
   249  		ExpectedRows   []map[string]interface{}
   250  	}{
   251  		"simple": {
   252  			Header: []string{"person", "age", "is_vampire"},
   253  			Types:  map[string]string{"person": "string", "age": "int16", "is_vampire": "boolean"},
   254  			Records: [][]string{
   255  				{"Viago", "379", "true"},
   256  				{"Vladislav", "862", "true"},
   257  				{"Deacon", "183", "true"},
   258  				{"Petyr", "8000", "true"},
   259  				{"Nick", "28", "true"},
   260  				{"Stu", "30", "false"},
   261  			},
   262  			ExpectedSchema: "message msg {\n  optional binary person (STRING);\n  optional int32 age (INT(16, true));\n  optional boolean is_vampire;\n}\n",
   263  			ExpectedRows: []map[string]interface{}{
   264  				{"person": []byte("Viago"), "age": int32(379), "is_vampire": true},
   265  				{"person": []byte("Vladislav"), "age": int32(862), "is_vampire": true},
   266  				{"person": []byte("Deacon"), "age": int32(183), "is_vampire": true},
   267  				{"person": []byte("Petyr"), "age": int32(8000), "is_vampire": true},
   268  				{"person": []byte("Nick"), "age": int32(28), "is_vampire": true},
   269  				{"person": []byte("Stu"), "age": int32(30), "is_vampire": false},
   270  			},
   271  		},
   272  		"invalid-type": {
   273  			Header:    []string{"foo"},
   274  			Types:     map[string]string{"foo": "invalid-type"},
   275  			ExpectErr: true,
   276  			Records: [][]string{
   277  				{"asdf"},
   278  			},
   279  		},
   280  		"not-enough-columns-in-records": {
   281  			Header:    []string{"foo"},
   282  			Types:     map[string]string{"foo": "string"},
   283  			ExpectErr: true,
   284  			Records: [][]string{
   285  				{},
   286  			},
   287  		},
   288  		"invalid-type-in-record": {
   289  			Header:    []string{"foo"},
   290  			Types:     map[string]string{"foo": "int64"},
   291  			ExpectErr: true,
   292  			Records: [][]string{
   293  				{"invalid value"},
   294  			},
   295  		},
   296  		"null-value-in-record": {
   297  			Header: []string{"foo", "bar"},
   298  			Types:  map[string]string{"foo": "int64", "bar": "string"},
   299  			Records: [][]string{
   300  				{"", "hello world"},
   301  			},
   302  			ExpectedSchema: "message msg {\n  optional int64 foo (INT(64, true));\n  optional binary bar (STRING);\n}\n",
   303  			ExpectedRows: []map[string]interface{}{
   304  				{"bar": []byte("hello world")},
   305  			},
   306  		},
   307  	}
   308  
   309  	for testName, tt := range tests {
   310  		t.Run(testName, func(t *testing.T) {
   311  			buf := &bytes.Buffer{}
   312  
   313  			err := writeParquetData(
   314  				buf,
   315  				tt.Header,
   316  				tt.Types,
   317  				tt.Records,
   318  				"unit test",
   319  				parquet.CompressionCodec_SNAPPY,
   320  				150*1024*1024,
   321  			)
   322  
   323  			if tt.ExpectErr {
   324  				require.Error(t, err)
   325  				return
   326  			}
   327  
   328  			require.NoError(t, err)
   329  
   330  			r := bytes.NewReader(buf.Bytes())
   331  
   332  			pqReader, err := goparquet.NewFileReader(r)
   333  			require.NoError(t, err)
   334  
   335  			require.Equal(t, tt.ExpectedSchema, pqReader.GetSchemaDefinition().String())
   336  
   337  			rows := []map[string]interface{}{}
   338  
   339  			for i := int64(0); i < pqReader.NumRows(); i++ {
   340  				data, err := pqReader.NextRow()
   341  				require.NoError(t, err)
   342  				rows = append(rows, data)
   343  			}
   344  
   345  			require.Equal(t, tt.ExpectedRows, rows)
   346  		})
   347  	}
   348  }