github.com/Jeffail/benthos/v3@v3.65.0/internal/impl/parquet/processor_test.go (about)

     1  package parquet
     2  
     3  import (
     4  	"context"
     5  	"fmt"
     6  	"os"
     7  	"testing"
     8  
     9  	"github.com/Jeffail/benthos/v3/public/service"
    10  	"github.com/stretchr/testify/assert"
    11  	"github.com/stretchr/testify/require"
    12  )
    13  
    14  func TestParquetProcessorConfigLinting(t *testing.T) {
    15  	configTests := []struct {
    16  		name        string
    17  		config      string
    18  		errContains string
    19  	}{
    20  		{
    21  			name: "missing operator",
    22  			config: `
    23  parquet:
    24    schema: '{}'
    25  `,
    26  			errContains: `field operator is required`,
    27  		},
    28  		{
    29  			name: "invalid operator",
    30  			config: `
    31  parquet:
    32    operator: not_real
    33    schema: no
    34  `,
    35  			errContains: `value not_real is not a valid`,
    36  		},
    37  	}
    38  
    39  	env := service.NewEnvironment()
    40  	for _, test := range configTests {
    41  		t.Run(test.name, func(t *testing.T) {
    42  			strm := env.NewStreamBuilder()
    43  			err := strm.AddProcessorYAML(test.config)
    44  			if test.errContains == "" {
    45  				require.NoError(t, err)
    46  			} else {
    47  				require.Error(t, err)
    48  				assert.Contains(t, err.Error(), test.errContains)
    49  			}
    50  		})
    51  	}
    52  }
    53  
    54  func TestParquetProcessorConfigParse(t *testing.T) {
    55  	tmpSchemaFile, err := os.CreateTemp("", "benthos_parquet_test")
    56  	require.NoError(t, err)
    57  
    58  	_, err = tmpSchemaFile.WriteString(`{
    59    "Tag": "name=root, repetitiontype=REQUIRED",
    60    "Fields": [
    61      {"Tag": "name=name, inname=NameIn, type=BYTE_ARRAY, convertedtype=UTF8, repetitiontype=REQUIRED"},
    62      {"Tag": "name=age, inname=Age, type=INT32, repetitiontype=REQUIRED"},
    63      {"Tag": "name=id, inname=Id, type=INT64, repetitiontype=REQUIRED"}
    64    ]
    65  }`)
    66  	require.NoError(t, err)
    67  
    68  	configTests := []struct {
    69  		name        string
    70  		config      string
    71  		schema      string
    72  		errContains string
    73  	}{
    74  		{
    75  			name: "no schema or schema file",
    76  			config: `
    77  operator: to_json
    78  `,
    79  			errContains: "either a raw `schema` or a non-empty `schema_file` must be specified",
    80  		},
    81  		{
    82  			name: "raw schema",
    83  			config: `
    84  operator: to_json
    85  schema: |
    86    {
    87      "Tag": "name=root, repetitiontype=REQUIRED",
    88      "Fields": [
    89        {"Tag": "name=name, inname=NameIn, type=BYTE_ARRAY, convertedtype=UTF8, repetitiontype=REQUIRED"},
    90        {"Tag": "name=age, inname=Age, type=INT32, repetitiontype=REQUIRED"},
    91        {"Tag": "name=id, inname=Id, type=INT64, repetitiontype=REQUIRED"}
    92      ]
    93    }
    94  `,
    95  			schema: `{
    96    "Tag": "name=root, repetitiontype=REQUIRED",
    97    "Fields": [
    98      {"Tag": "name=name, inname=NameIn, type=BYTE_ARRAY, convertedtype=UTF8, repetitiontype=REQUIRED"},
    99      {"Tag": "name=age, inname=Age, type=INT32, repetitiontype=REQUIRED"},
   100      {"Tag": "name=id, inname=Id, type=INT64, repetitiontype=REQUIRED"}
   101    ]
   102  }
   103  `,
   104  		},
   105  		{
   106  			name: "schema file",
   107  			config: fmt.Sprintf(`
   108  operator: to_json
   109  schema_file: %v
   110  `, tmpSchemaFile.Name()),
   111  			schema: `{
   112    "Tag": "name=root, repetitiontype=REQUIRED",
   113    "Fields": [
   114      {"Tag": "name=name, inname=NameIn, type=BYTE_ARRAY, convertedtype=UTF8, repetitiontype=REQUIRED"},
   115      {"Tag": "name=age, inname=Age, type=INT32, repetitiontype=REQUIRED"},
   116      {"Tag": "name=id, inname=Id, type=INT64, repetitiontype=REQUIRED"}
   117    ]
   118  }`,
   119  		},
   120  	}
   121  
   122  	confSpec := parquetProcessorConfig()
   123  	env := service.NewEnvironment()
   124  
   125  	for _, test := range configTests {
   126  		t.Run(test.name, func(t *testing.T) {
   127  			pConf, err := confSpec.ParseYAML(test.config, env)
   128  			require.NoError(t, err)
   129  
   130  			proc, err := newParquetProcessorFromConfig(pConf, nil)
   131  			if test.errContains == "" {
   132  				require.NoError(t, err)
   133  				assert.Equal(t, test.schema, proc.schema)
   134  			} else {
   135  				require.Error(t, err)
   136  				assert.Contains(t, err.Error(), test.errContains)
   137  			}
   138  		})
   139  	}
   140  }
   141  
   142  func TestParquetJSONSchemaRoundTrip(t *testing.T) {
   143  	schema := `{
   144    "Tag": "name=root, repetitiontype=REQUIRED",
   145    "Fields": [
   146      {"Tag": "name=name, inname=NameIn, type=BYTE_ARRAY, convertedtype=UTF8, repetitiontype=REQUIRED"},
   147      {"Tag": "name=age, inname=Age, type=INT32, repetitiontype=REQUIRED"},
   148      {"Tag": "name=id, inname=Id, type=INT64, repetitiontype=REQUIRED"},
   149      {"Tag": "name=weight, inname=Weight, type=FLOAT, repetitiontype=REQUIRED"},
   150      {
   151        "Tag": "name=favPokemon, inname=FavPokemon, type=LIST, repetitiontype=OPTIONAL",
   152        "Fields": [
   153          { "Tag": "name=element, repetitiontype=REQUIRED", "Fields": [
   154            { "Tag": "name=name, inname=PokeName, type=BYTE_ARRAY, convertedtype=UTF8, repetitiontype=REQUIRED" },
   155            { "Tag": "name=coolness, inname=Coolness, type=FLOAT, repetitiontype=REQUIRED" }
   156          ] }
   157        ]
   158      }
   159    ]
   160  }`
   161  
   162  	inputDocs := []string{
   163  		`{"NameIn":"fooer first","age":21,"id":1,"weight":60.1}`,
   164  		`{"NameIn":"fooer second","age":22,"id":2,"weight":60.2}`,
   165  		`{"NameIn":"fooer third","age":23,"id":3,"weight":60.3,"favPokemon":[{"PokeName":"bulbasaur","Coolness":99}]}`,
   166  		`{"NameIn":"fooer fourth","age":24,"id":4,"weight":60.4}`,
   167  		`{"NameIn":"fooer fifth","age":25,"id":5,"weight":60.5}`,
   168  		`{"NameIn":"fooer sixth","age":26,"id":6,"weight":60.6}`,
   169  	}
   170  
   171  	// Test every compression codec
   172  	for _, c := range []string{
   173  		"uncompressed", "snappy", "gzip", "lz4", "zstd",
   174  		// "lzo", "brotli", "lz4_raw",
   175  	} {
   176  		t.Run(fmt.Sprintf("with %v codec", c), func(t *testing.T) {
   177  			writer, err := newParquetProcessor("from_json", c, schema, nil)
   178  			require.NoError(t, err)
   179  
   180  			reader, err := newParquetProcessor("to_json", "", schema, nil)
   181  			require.NoError(t, err)
   182  
   183  			var inputBatch service.MessageBatch
   184  			for _, d := range inputDocs {
   185  				inputBatch = append(inputBatch, service.NewMessage([]byte(d)))
   186  			}
   187  
   188  			writerResBatches, err := writer.ProcessBatch(context.Background(), inputBatch)
   189  			require.NoError(t, err)
   190  			require.Len(t, writerResBatches, 1)
   191  			require.Len(t, writerResBatches[0], 1)
   192  
   193  			readerResBatches, err := reader.ProcessBatch(context.Background(), writerResBatches[0])
   194  			require.NoError(t, err)
   195  			require.Len(t, writerResBatches, 1)
   196  
   197  			var readerResStrs []string
   198  			for _, m := range readerResBatches[0] {
   199  				mBytes, err := m.AsBytes()
   200  				require.NoError(t, err)
   201  				readerResStrs = append(readerResStrs, string(mBytes))
   202  			}
   203  
   204  			assert.Equal(t, []string{
   205  				`{"NameIn":"fooer first","Age":21,"Id":1,"Weight":60.1,"FavPokemon":null}`,
   206  				`{"NameIn":"fooer second","Age":22,"Id":2,"Weight":60.2,"FavPokemon":null}`,
   207  				`{"NameIn":"fooer third","Age":23,"Id":3,"Weight":60.3,"FavPokemon":[{"PokeName":"bulbasaur","Coolness":99}]}`,
   208  				`{"NameIn":"fooer fourth","Age":24,"Id":4,"Weight":60.4,"FavPokemon":null}`,
   209  				`{"NameIn":"fooer fifth","Age":25,"Id":5,"Weight":60.5,"FavPokemon":null}`,
   210  				`{"NameIn":"fooer sixth","Age":26,"Id":6,"Weight":60.6,"FavPokemon":null}`,
   211  			}, readerResStrs)
   212  		})
   213  	}
   214  }