github.com/fraugster/parquet-go@v0.12.0/cmd/csv2parquet/main_test.go (about) 1 package main 2 3 import ( 4 "bytes" 5 "testing" 6 7 goparquet "github.com/fraugster/parquet-go" 8 "github.com/fraugster/parquet-go/parquet" 9 "github.com/stretchr/testify/assert" 10 "github.com/stretchr/testify/require" 11 ) 12 13 func TestParseTypeHints(t *testing.T) { 14 tests := map[string]struct { 15 Input string 16 ExpectedOutput map[string]string 17 ExpectErr bool 18 }{ 19 "simple": { 20 Input: "foo=boolean,bar=string", 21 ExpectedOutput: map[string]string{"foo": "boolean", "bar": "string"}, 22 }, 23 "simply-with-spaces": { 24 Input: " foo = boolean , bar=string ", 25 ExpectedOutput: map[string]string{"foo": "boolean", "bar": "string"}, 26 }, 27 "empty": { 28 Input: "", 29 ExpectedOutput: map[string]string{}, 30 }, 31 "invalid-type": { 32 Input: "foo=invalid-type", 33 ExpectErr: true, 34 }, 35 "invalid-field": { 36 Input: "foo=boolean=invalid", 37 ExpectErr: true, 38 }, 39 } 40 41 for testName, tt := range tests { 42 t.Run(testName, func(t *testing.T) { 43 output, err := parseTypeHints(tt.Input) 44 if tt.ExpectErr { 45 assert.Error(t, err) 46 } else { 47 assert.NoError(t, err) 48 assert.Equal(t, tt.ExpectedOutput, output) 49 } 50 }) 51 } 52 } 53 54 func TestTypeHandlers(t *testing.T) { 55 tests := map[string]struct { 56 Input string 57 Func func(string) (interface{}, error) 58 ExpectedOutput interface{} 59 ExpectErr bool 60 }{ 61 "byte-array": {"hello", byteArrayHandler, []byte("hello"), false}, 62 "boolean-true": {"true", booleanHandler, true, false}, 63 "boolean-false": {"false", booleanHandler, false, false}, 64 "boolean-invalid": {"invalid", booleanHandler, false, true}, 65 "bool-UPPERCASE": {"TRUE", booleanHandler, true, false}, 66 "bool-num-1": {"1", booleanHandler, true, false}, 67 "bool-num-0": {"0", booleanHandler, false, false}, 68 "uint-32": {"1234", uintHandler(32), uint32(1234), false}, 69 "uint-invalid": {"hello!", uintHandler(32), 0, true}, 70 "uint-invalid-bits": {"1234", uintHandler(28), 0, true}, 71 "uint-64": {"1000000000000", uintHandler(64), uint64(1000000000000), false}, 72 "int-32": {"-1234", intHandler(32), int32(-1234), false}, 73 "int-invalid": {"goodbye!", intHandler(32), 0, true}, 74 "int-invalid-bits": {"1234", intHandler(42), 0, true}, 75 "int-64": {"1000000000000", intHandler(64), int64(1000000000000), false}, 76 "float": {"3.4", floatHandler, float32(3.4), false}, 77 "double": {"4.2", doubleHandler, float64(4.2), false}, 78 "json-simple": {`{"hello":"world"}`, jsonHandler, []byte(`{"hello":"world"}`), false}, 79 "json-invalid": {`{"hello":"world`, jsonHandler, nil, true}, 80 } 81 82 for testName, tt := range tests { 83 t.Run(testName, func(t *testing.T) { 84 output, err := tt.Func(tt.Input) 85 if tt.ExpectErr { 86 require.Error(t, err) 87 } else { 88 require.NoError(t, err) 89 require.Equal(t, tt.ExpectedOutput, output) 90 } 91 }) 92 } 93 } 94 95 func TestCreateColumn(t *testing.T) { 96 tests := map[string]struct { 97 Field string 98 Type string 99 ExpectErr bool 100 ExpectedType parquet.Type 101 ExpectedLogicalType *parquet.LogicalType 102 ExpectedConvertedType *parquet.ConvertedType 103 }{ 104 "simple-boolean": { 105 Field: "foo", 106 Type: "boolean", 107 ExpectErr: false, 108 ExpectedType: parquet.Type_BOOLEAN, 109 }, 110 "simple-byte-array": { 111 Field: "foo", 112 Type: "byte_array", 113 ExpectErr: false, 114 ExpectedType: parquet.Type_BYTE_ARRAY, 115 }, 116 "simple-float": { 117 Field: "foo", 118 Type: "float", 119 ExpectErr: false, 120 ExpectedType: parquet.Type_FLOAT, 121 }, 122 "simple-double": { 123 Field: "foo", 124 Type: "double", 125 ExpectErr: false, 126 ExpectedType: parquet.Type_DOUBLE, 127 }, 128 "invalid-type": { 129 Field: "foo", 130 Type: "invalid", 131 ExpectErr: true, 132 }, 133 "string": { 134 Field: "foo", 135 Type: "string", 136 ExpectErr: false, 137 ExpectedType: parquet.Type_BYTE_ARRAY, 138 ExpectedLogicalType: &parquet.LogicalType{STRING: &parquet.StringType{}}, 139 ExpectedConvertedType: parquet.ConvertedTypePtr(parquet.ConvertedType_UTF8), 140 }, 141 } 142 143 for testName, tt := range tests { 144 t.Run(testName, func(t *testing.T) { 145 col, _, err := createColumn(tt.Field, tt.Type) 146 if tt.ExpectErr { 147 require.Error(t, err) 148 } else { 149 require.NoError(t, err) 150 require.Equal(t, tt.Field, col.SchemaElement.Name) 151 require.Equal(t, tt.ExpectedType, *col.SchemaElement.Type) 152 if tt.ExpectedLogicalType != nil { 153 require.Equal(t, tt.ExpectedLogicalType, col.SchemaElement.LogicalType) 154 } 155 if tt.ExpectedConvertedType != nil { 156 require.Equal(t, tt.ExpectedConvertedType, col.SchemaElement.ConvertedType) 157 } 158 } 159 }) 160 } 161 } 162 163 func TestDeriveSchema(t *testing.T) { 164 tests := map[string]struct { 165 Header []string 166 Types map[string]string 167 ExpectErr bool 168 ExpectedSchema string 169 }{ 170 "single-boolean": { 171 Header: []string{"foo"}, 172 Types: map[string]string{"foo": "boolean"}, 173 ExpectedSchema: "message msg {\n optional boolean foo;\n}\n", 174 }, 175 "all-uints": { 176 Header: []string{"a", "b", "c", "d"}, 177 Types: map[string]string{"a": "uint8", "b": "uint16", "c": "uint32", "d": "uint64"}, 178 ExpectedSchema: `message msg { 179 optional int32 a (INT(8, false)); 180 optional int32 b (INT(16, false)); 181 optional int32 c (INT(32, false)); 182 optional int64 d (INT(64, false)); 183 } 184 `, 185 }, 186 "all-ints": { 187 Header: []string{"a", "b", "c", "d", "e"}, 188 Types: map[string]string{"a": "int8", "b": "int16", "c": "int32", "d": "int64", "e": "int"}, 189 ExpectedSchema: `message msg { 190 optional int32 a (INT(8, true)); 191 optional int32 b (INT(16, true)); 192 optional int32 c (INT(32, true)); 193 optional int64 d (INT(64, true)); 194 optional int64 e (INT(64, true)); 195 } 196 `, 197 }, 198 "string": { 199 Header: []string{"x"}, 200 Types: map[string]string{"x": "string"}, 201 ExpectedSchema: `message msg { 202 optional binary x (STRING); 203 } 204 `, 205 }, 206 "json": { 207 Header: []string{"x"}, 208 Types: map[string]string{"x": "json"}, 209 ExpectedSchema: `message msg { 210 optional binary x (JSON); 211 } 212 `, 213 }, 214 "default-type": { 215 Header: []string{"foobar"}, 216 Types: map[string]string{}, 217 ExpectedSchema: `message msg { 218 optional binary foobar (STRING); 219 } 220 `, 221 }, 222 "invalid-type": { 223 Header: []string{"foobar"}, 224 Types: map[string]string{"foobar": "invalid"}, 225 ExpectErr: true, 226 }, 227 } 228 229 for testName, tt := range tests { 230 t.Run(testName, func(t *testing.T) { 231 schema, _, err := deriveSchema(tt.Header, tt.Types) 232 if tt.ExpectErr { 233 require.Error(t, err) 234 } else { 235 require.NoError(t, err) 236 require.Equal(t, tt.ExpectedSchema, schema.String()) 237 } 238 }) 239 } 240 } 241 242 func TestWriteParquetData(t *testing.T) { 243 tests := map[string]struct { 244 Header []string 245 Types map[string]string 246 Records [][]string 247 ExpectErr bool 248 ExpectedSchema string 249 ExpectedRows []map[string]interface{} 250 }{ 251 "simple": { 252 Header: []string{"person", "age", "is_vampire"}, 253 Types: map[string]string{"person": "string", "age": "int16", "is_vampire": "boolean"}, 254 Records: [][]string{ 255 {"Viago", "379", "true"}, 256 {"Vladislav", "862", "true"}, 257 {"Deacon", "183", "true"}, 258 {"Petyr", "8000", "true"}, 259 {"Nick", "28", "true"}, 260 {"Stu", "30", "false"}, 261 }, 262 ExpectedSchema: "message msg {\n optional binary person (STRING);\n optional int32 age (INT(16, true));\n optional boolean is_vampire;\n}\n", 263 ExpectedRows: []map[string]interface{}{ 264 {"person": []byte("Viago"), "age": int32(379), "is_vampire": true}, 265 {"person": []byte("Vladislav"), "age": int32(862), "is_vampire": true}, 266 {"person": []byte("Deacon"), "age": int32(183), "is_vampire": true}, 267 {"person": []byte("Petyr"), "age": int32(8000), "is_vampire": true}, 268 {"person": []byte("Nick"), "age": int32(28), "is_vampire": true}, 269 {"person": []byte("Stu"), "age": int32(30), "is_vampire": false}, 270 }, 271 }, 272 "invalid-type": { 273 Header: []string{"foo"}, 274 Types: map[string]string{"foo": "invalid-type"}, 275 ExpectErr: true, 276 Records: [][]string{ 277 {"asdf"}, 278 }, 279 }, 280 "not-enough-columns-in-records": { 281 Header: []string{"foo"}, 282 Types: map[string]string{"foo": "string"}, 283 ExpectErr: true, 284 Records: [][]string{ 285 {}, 286 }, 287 }, 288 "invalid-type-in-record": { 289 Header: []string{"foo"}, 290 Types: map[string]string{"foo": "int64"}, 291 ExpectErr: true, 292 Records: [][]string{ 293 {"invalid value"}, 294 }, 295 }, 296 "null-value-in-record": { 297 Header: []string{"foo", "bar"}, 298 Types: map[string]string{"foo": "int64", "bar": "string"}, 299 Records: [][]string{ 300 {"", "hello world"}, 301 }, 302 ExpectedSchema: "message msg {\n optional int64 foo (INT(64, true));\n optional binary bar (STRING);\n}\n", 303 ExpectedRows: []map[string]interface{}{ 304 {"bar": []byte("hello world")}, 305 }, 306 }, 307 } 308 309 for testName, tt := range tests { 310 t.Run(testName, func(t *testing.T) { 311 buf := &bytes.Buffer{} 312 313 err := writeParquetData( 314 buf, 315 tt.Header, 316 tt.Types, 317 tt.Records, 318 "unit test", 319 parquet.CompressionCodec_SNAPPY, 320 150*1024*1024, 321 ) 322 323 if tt.ExpectErr { 324 require.Error(t, err) 325 return 326 } 327 328 require.NoError(t, err) 329 330 r := bytes.NewReader(buf.Bytes()) 331 332 pqReader, err := goparquet.NewFileReader(r) 333 require.NoError(t, err) 334 335 require.Equal(t, tt.ExpectedSchema, pqReader.GetSchemaDefinition().String()) 336 337 rows := []map[string]interface{}{} 338 339 for i := int64(0); i < pqReader.NumRows(); i++ { 340 data, err := pqReader.NextRow() 341 require.NoError(t, err) 342 rows = append(rows, data) 343 } 344 345 require.Equal(t, tt.ExpectedRows, rows) 346 }) 347 } 348 }