// github.com/Jeffail/benthos/v3@v3.65.0/lib/input/sequence_test.go

package input

import (
	"fmt"
	"os"
	"path/filepath"
	"sort"
	"testing"
	"time"

	"github.com/Jeffail/benthos/v3/lib/log"
	"github.com/Jeffail/benthos/v3/lib/metrics"
	"github.com/Jeffail/benthos/v3/lib/response"
	"github.com/Jeffail/benthos/v3/lib/types"
	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"
)

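// writeFiles writes each entry of nameToContent as a file of that name under
// dir, failing the test on any error.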
func writeFiles(t *testing.T, dir string, nameToContent map[string]string) {
	t.Helper()

	for k, v := range nameToContent {
		require.NoError(t, os.WriteFile(filepath.Join(dir, k), []byte(v), 0o600))
	}
}

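// TestSequenceHappy feeds three file inputs into a sequence input and asserts
// that their lines are consumed in order, one input after the other.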
func TestSequenceHappy(t *testing.T) {
	t.Parallel()

	tmpDir := t.TempDir()

	files := map[string]string{
		"f1": "foo\nbar\nbaz",
		"f2": "buz\nbev\nbif\n",
		"f3": "qux\nquz\nqev",
	}

	writeFiles(t, tmpDir, files)

	conf := NewConfig()
	conf.Type = TypeSequence

	for _, k := range []string{"f1", "f2", "f3"} {
		inConf := NewConfig()
		inConf.Type = TypeFile
		inConf.File.Path = filepath.Join(tmpDir, k)
		conf.Sequence.Inputs = append(conf.Sequence.Inputs, inConf)
	}

	rdr, err := New(conf, types.NoopMgr(), log.Noop(), metrics.Noop())
	require.NoError(t, err)

	exp, act := []string{
		"foo", "bar", "baz", "buz", "bev", "bif", "qux", "quz", "qev",
	}, []string{}

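	// Drain the input until its transaction channel closes, acking each
	// single-part message and recording its contents in order.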
consumeLoop:
	for {
		select {
		case tran, open := <-rdr.TransactionChan():
			if !open {
				break consumeLoop
			}
			assert.Equal(t, 1, tran.Payload.Len())
			act = append(act, string(tran.Payload.Get(0).Get()))
			select {
			case tran.ResponseChan <- response.NewAck():
			case <-time.After(time.Minute):
				t.Fatalf("failed to ack after: %v", act)
			}
		case <-time.After(time.Minute):
			t.Fatalf("Failed to consume message after: %v", act)
		}
	}

	assert.Equal(t, exp, act)

	rdr.CloseAsync()
	assert.NoError(t, rdr.WaitForClose(time.Second))
}

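// TestSequenceJoins exercises the sharded join mode of the sequence input:
// rows from two CSV files and an ndjson file are merged on their "id" field
// into a single document per id.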
func TestSequenceJoins(t *testing.T) {
	t.Parallel()

	tmpDir := t.TempDir()

	files := map[string]string{
		"csv1": "id,name,age\naaa,A,20\nbbb,B,21\nccc,B,22\n",
		"csv2": "id,hobby\nccc,fencing\naaa,running\naaa,gaming\n",
		"ndjson1": `{"id":"aaa","stuff":{"first":"foo"}}
{"id":"bbb","stuff":{"first":"bar"}}
{"id":"aaa","stuff":{"second":"baz"}}`,
	}

	writeFiles(t, tmpDir, files)

	conf := NewConfig()
	conf.Type = TypeSequence
	conf.Sequence.ShardedJoin.IDPath = "id"
	conf.Sequence.ShardedJoin.Iterations = 1
	conf.Sequence.ShardedJoin.Type = "full-outter"

	csvConf := NewConfig()
	csvConf.Type = TypeCSVFile
	csvConf.CSVFile.Paths = []string{
		filepath.Join(tmpDir, "csv1"),
		filepath.Join(tmpDir, "csv2"),
	}
	conf.Sequence.Inputs = append(conf.Sequence.Inputs, csvConf)
	for _, k := range []string{"ndjson1"} {
		inConf := NewConfig()
		inConf.Type = TypeFile
		inConf.File.Path = filepath.Join(tmpDir, k)
		conf.Sequence.Inputs = append(conf.Sequence.Inputs, inConf)
	}

	rdr, err := New(conf, types.NoopMgr(), log.Noop(), metrics.Noop())
	require.NoError(t, err)

	exp, act := []string{
		`{"age":"20","hobby":["running","gaming"],"id":"aaa","name":"A","stuff":{"first":"foo","second":"baz"}}`,
		`{"age":"21","id":"bbb","name":"B","stuff":{"first":"bar"}}`,
		`{"age":"22","hobby":"fencing","id":"ccc","name":"B"}`,
	}, []string{}

consumeLoop:
	for {
		select {
		case tran, open := <-rdr.TransactionChan():
			if !open {
				break consumeLoop
			}
			assert.Equal(t, 1, tran.Payload.Len())
			act = append(act, string(tran.Payload.Get(0).Get()))
			select {
			case tran.ResponseChan <- response.NewAck():
			case <-time.After(time.Minute):
				t.Fatalf("failed to ack after: %v", act)
			}
		case <-time.After(time.Minute):
			t.Fatalf("Failed to consume message after: %v", act)
		}
	}

	sort.Strings(exp)
	sort.Strings(act)
	assert.Equal(t, exp, act)

	rdr.CloseAsync()
	assert.NoError(t, rdr.WaitForClose(time.Second))
}

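// TestSequenceJoinsMergeStrategies covers the field merge strategies (array,
// replace, keep) applied when joined documents share an id, with joined
// results flushed as rows from the final input are consumed.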
func TestSequenceJoinsMergeStrategies(t *testing.T) {
	t.Parallel()

	testCases := []struct {
		name         string
		flushOnFinal bool
		mergeStrat   string
		files        map[string]string
		finalFile    string
		result       []string
	}{
		{
			name:         "array from final",
			flushOnFinal: true,
			mergeStrat:   "array",
			files: map[string]string{
				"csv1": "id,name,age\naaa,A,20\nbbb,B,21\nccc,B,22\n",
				"csv2": "id,hobby\nccc,fencing\naaa,running\naaa,gaming\n",
			},
			finalFile: "id,stuff\naaa,first\nccc,second\naaa,third\n",
			result: []string{
				`{"age":"20","hobby":["running","gaming"],"id":"aaa","name":"A","stuff":"first"}`,
				`{"age":"22","hobby":"fencing","id":"ccc","name":"B","stuff":"second"}`,
				`{"age":"20","hobby":["running","gaming"],"id":"aaa","name":"A","stuff":["first","third"]}`,
			},
		},
		{
			name:         "replace from final",
			flushOnFinal: true,
			mergeStrat:   "replace",
			files: map[string]string{
				"csv1": "id,name,age\naaa,A,20\nbbb,B,21\nccc,B,22\n",
				"csv2": "id,hobby\nccc,fencing\naaa,running\naaa,gaming\n",
			},
			finalFile: "id,stuff\naaa,first\nccc,second\naaa,third\n",
			result: []string{
				`{"age":"20","hobby":"gaming","id":"aaa","name":"A","stuff":"first"}`,
				`{"age":"20","hobby":"gaming","id":"aaa","name":"A","stuff":"third"}`,
				`{"age":"22","hobby":"fencing","id":"ccc","name":"B","stuff":"second"}`,
			},
		},
		{
			name:         "keep from final",
			flushOnFinal: true,
			mergeStrat:   "keep",
			files: map[string]string{
				"csv1": "id,name,age\naaa,A,20\nbbb,B,21\nccc,B,22\n",
				"csv2": "id,hobby\nccc,fencing\naaa,running\naaa,gaming\n",
			},
			finalFile: "id,stuff\naaa,first\nccc,second\naaa,third\n",
			result: []string{
				`{"age":"20","hobby":"running","id":"aaa","name":"A","stuff":"first"}`,
				`{"age":"20","hobby":"running","id":"aaa","name":"A","stuff":"first"}`,
				`{"age":"22","hobby":"fencing","id":"ccc","name":"B","stuff":"second"}`,
			},
		},
	}

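	// Run each case as its own subtest, re-declaring the loop variable so the
	// closure does not capture the shared iteration variable.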
	for _, test := range testCases {
		test := test
		t.Run(test.name, func(t *testing.T) {
			tmpDir := t.TempDir()

			writeFiles(t, tmpDir, test.files)
			writeFiles(t, tmpDir, map[string]string{
				"final.csv": test.finalFile,
			})

			conf := NewConfig()
			conf.Type = TypeSequence
			conf.Sequence.ShardedJoin.IDPath = "id"
			conf.Sequence.ShardedJoin.MergeStrategy = test.mergeStrat
			if test.flushOnFinal {
				conf.Sequence.ShardedJoin.Type = "outter"
			} else {
				conf.Sequence.ShardedJoin.Type = "full-outter"
			}
			conf.Sequence.ShardedJoin.Iterations = 1

			csvConf := NewConfig()
			csvConf.Type = TypeCSVFile
			for k := range test.files {
				csvConf.CSVFile.Paths = append(csvConf.CSVFile.Paths, filepath.Join(tmpDir, k))
			}
			conf.Sequence.Inputs = append(conf.Sequence.Inputs, csvConf)

			finalConf := NewConfig()
			finalConf.Type = TypeCSVFile
			finalConf.CSVFile.Paths = []string{filepath.Join(tmpDir, "final.csv")}
			conf.Sequence.Inputs = append(conf.Sequence.Inputs, finalConf)

			rdr, err := New(conf, types.NoopMgr(), log.Noop(), metrics.Noop())
			require.NoError(t, err)

			exp, act := test.result, []string{}

		consumeLoop:
			for {
				select {
				case tran, open := <-rdr.TransactionChan():
					if !open {
						break consumeLoop
					}
					assert.Equal(t, 1, tran.Payload.Len())
					act = append(act, string(tran.Payload.Get(0).Get()))
					select {
					case tran.ResponseChan <- response.NewAck():
					case <-time.After(time.Minute):
						t.Fatalf("failed to ack after: %v", act)
					}
				case <-time.After(time.Minute):
					t.Fatalf("Failed to consume message after: %v", act)
				}
			}

			sort.Strings(exp)
			sort.Strings(act)
			assert.Equal(t, exp, act)

			rdr.CloseAsync()
			assert.NoError(t, rdr.WaitForClose(time.Second))
		})
	}
}

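// TestSequenceJoinsBig joins a larger generated dataset using multiple
// sharded-join iterations. It is skipped by default, presumably because of
// its size it is intended for manual runs.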
func TestSequenceJoinsBig(t *testing.T) {
	t.Skip()
	t.Parallel()

	tmpDir := t.TempDir()

	jsonPath := filepath.Join(tmpDir, "one.ndjson")
	csvPath := filepath.Join(tmpDir, "two.csv")

	ndjsonFile, err := os.Create(jsonPath)
	require.NoError(t, err)

	csvFile, err := os.Create(csvPath)
	require.NoError(t, err)

	conf := NewConfig()
	conf.Type = TypeSequence
	conf.Sequence.ShardedJoin.IDPath = "id"
	conf.Sequence.ShardedJoin.Iterations = 5
	conf.Sequence.ShardedJoin.Type = "full-outter"

	csvConf := NewConfig()
	csvConf.Type = TypeCSVFile
	csvConf.CSVFile.Paths = []string{csvPath}
	conf.Sequence.Inputs = append(conf.Sequence.Inputs, csvConf)

	jsonConf := NewConfig()
	jsonConf.Type = TypeFile
	jsonConf.File.Paths = []string{jsonPath}
	jsonConf.File.Codec = "lines"
	conf.Sequence.Inputs = append(conf.Sequence.Inputs, jsonConf)

	totalRows := 1000

	exp, act := []string{}, []string{}

	_, err = csvFile.WriteString("id,bar\n")
	require.NoError(t, err)
	for i := 0; i < totalRows; i++ {
		exp = append(exp, fmt.Sprintf(`{"bar":["bar%v","baz%v"],"foo":"foo%v","id":"%v"}`, i, i, i, i))

		_, err = fmt.Fprintf(ndjsonFile, "{\"id\":\"%v\",\"foo\":\"foo%v\"}\n", i, i)
		require.NoError(t, err)

		_, err = fmt.Fprintf(csvFile, "%v,bar%v\n", i, i)
		require.NoError(t, err)
	}
	for i := 0; i < totalRows; i++ {
		_, err = fmt.Fprintf(csvFile, "%v,baz%v\n", i, i)
		require.NoError(t, err)
	}
	require.NoError(t, ndjsonFile.Close())
	require.NoError(t, csvFile.Close())

	rdr, err := New(conf, types.NoopMgr(), log.Noop(), metrics.Noop())
	require.NoError(t, err)

consumeLoop:
	for {
		select {
		case tran, open := <-rdr.TransactionChan():
			if !open {
				break consumeLoop
			}
			assert.Equal(t, 1, tran.Payload.Len())
			act = append(act, string(tran.Payload.Get(0).Get()))
			select {
			case tran.ResponseChan <- response.NewAck():
			case <-time.After(time.Minute):
				t.Fatalf("failed to ack after: %v", act)
			}
		case <-time.After(time.Minute):
			t.Fatalf("Failed to consume message after: %v", act)
		}
	}

	sort.Strings(exp)
	sort.Strings(act)
	assert.Equal(t, exp, act)

	rdr.CloseAsync()
	assert.NoError(t, rdr.WaitForClose(time.Second))
}

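// TestSequenceSad points the sequence at a file ("f2") that does not exist
// yet, verifies that consumption stalls once the first input is exhausted,
// and then confirms it resumes after the missing file appears via a rename.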
func TestSequenceSad(t *testing.T) {
	t.Parallel()

	tmpDir := t.TempDir()

	files := map[string]string{
		"f1": "foo\nbar\nbaz",
		"f4": "buz\nbev\nbif\n",
	}

	writeFiles(t, tmpDir, files)

	conf := NewConfig()
	conf.Type = TypeSequence

	for _, k := range []string{"f1", "f2", "f3"} {
		inConf := NewConfig()
		inConf.Type = TypeFile
		inConf.File.Path = filepath.Join(tmpDir, k)
		conf.Sequence.Inputs = append(conf.Sequence.Inputs, inConf)
	}

	rdr, err := New(conf, types.NoopMgr(), log.Noop(), metrics.Noop())
	require.NoError(t, err)

	exp := []string{
		"foo", "bar", "baz",
	}

	for i, str := range exp {
		select {
		case tran, open := <-rdr.TransactionChan():
			if !open {
				t.Fatal("closed earlier than expected")
			}
			assert.Equal(t, 1, tran.Payload.Len())
			assert.Equal(t, str, string(tran.Payload.Get(0).Get()))
			select {
			case tran.ResponseChan <- response.NewAck():
			case <-time.After(time.Minute):
				t.Fatalf("failed to ack after: %v", str)
			}
		case <-time.After(time.Minute):
			t.Fatalf("Failed to consume message %v", i)
		}
	}

	select {
	case <-rdr.TransactionChan():
		t.Fatal("unexpected transaction")
	case <-time.After(100 * time.Millisecond):
	}

	exp = []string{
		"buz", "bev", "bif",
	}

	require.NoError(t, os.Rename(filepath.Join(tmpDir, "f4"), filepath.Join(tmpDir, "f2")))

	for i, str := range exp {
		select {
		case tran, open := <-rdr.TransactionChan():
			if !open {
				t.Fatal("closed earlier than expected")
			}
			assert.Equal(t, 1, tran.Payload.Len())
			assert.Equal(t, str, string(tran.Payload.Get(0).Get()))
			select {
			case tran.ResponseChan <- response.NewAck():
			case <-time.After(time.Minute):
				t.Fatalf("failed to ack after: %v", str)
			}
		case <-time.After(time.Minute):
			t.Fatalf("Failed to consume message %v", i)
		}
	}

	rdr.CloseAsync()
	assert.NoError(t, rdr.WaitForClose(time.Second))
}

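// TestSequenceEarlyTermination shuts the input down while a transaction is
// still unacknowledged and asserts that it closes within the deadline.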
func TestSequenceEarlyTermination(t *testing.T) {
	t.Parallel()

	tmpDir := t.TempDir()

	writeFiles(t, tmpDir, map[string]string{
		"f1": "foo\nbar\nbaz",
	})

	conf := NewConfig()
	conf.Type = TypeSequence

	inConf := NewConfig()
	inConf.Type = TypeFile
	inConf.File.Path = filepath.Join(tmpDir, "f1")
	conf.Sequence.Inputs = append(conf.Sequence.Inputs, inConf)

	rdr, err := New(conf, types.NoopMgr(), log.Noop(), metrics.Noop())
	require.NoError(t, err)

	select {
	case tran, open := <-rdr.TransactionChan():
		if !open {
			t.Fatal("closed earlier than expected")
		}
		assert.Equal(t, 1, tran.Payload.Len())
		assert.Equal(t, "foo", string(tran.Payload.Get(0).Get()))
	case <-time.After(time.Minute):
		t.Fatal("timed out")
	}

	rdr.CloseAsync()
	assert.NoError(t, rdr.WaitForClose(time.Second*5))
}