github.com/observiq/carbon@v0.9.11-0.20200820160507-1b872e368a5e/operator/builtin/input/file/line_splitter_test.go (about)

package file

import (
	"bufio"
	"bytes"
	"errors"
	"regexp"
	"testing"

	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"
	"golang.org/x/text/encoding"
	"golang.org/x/text/encoding/unicode"
)

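// tokenizerTestCase describes a single splitter scenario: Raw is fed through
// a bufio.SplitFunc, and the resulting tokens (and any terminal scanner
// error) are compared against the expected values.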
type tokenizerTestCase struct {
	Name              string
	Pattern           string
	Raw               []byte
	ExpectedTokenized []string
	ExpectedError     error
}

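// RunFunc returns a subtest that scans tc.Raw with the given split function,
// collecting tokens until the scanner stops, then asserts that both the
// tokens and the scanner's final error match expectations.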
func (tc tokenizerTestCase) RunFunc(splitFunc bufio.SplitFunc) func(t *testing.T) {
	return func(t *testing.T) {
		scanner := bufio.NewScanner(bytes.NewReader(tc.Raw))
		scanner.Split(splitFunc)
		tokenized := make([]string, 0)
		for {
			ok := scanner.Scan()
			if !ok {
				assert.Equal(t, tc.ExpectedError, scanner.Err())
				break
			}
			tokenized = append(tokenized, scanner.Text())
		}

		assert.Equal(t, tc.ExpectedTokenized, tokenized)
	}
}

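// TestLineStartSplitFunc covers splitting on a line-start pattern: each token
// begins at one match (or at the start of the data) and ends where the next
// match begins, so a trailing entry with no following match is never emitted.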
func TestLineStartSplitFunc(t *testing.T) {
	testCases := []tokenizerTestCase{
		{
			Name:    "OneLogSimple",
			Pattern: `LOGSTART \d+ `,
			Raw:     []byte("LOGSTART 123 log1LOGSTART 123 a"),
			ExpectedTokenized: []string{
				`LOGSTART 123 log1`,
			},
		},
		{
			Name:    "TwoLogsSimple",
			Pattern: `LOGSTART \d+ `,
			Raw:     []byte(`LOGSTART 123 log1 LOGSTART 234 log2 LOGSTART 345 foo`),
			ExpectedTokenized: []string{
				`LOGSTART 123 log1 `,
				`LOGSTART 234 log2 `,
			},
		},
		{
			Name:    "TwoLogsLineStart",
			Pattern: `^LOGSTART \d+ `,
			Raw:     []byte("LOGSTART 123 LOGSTART 345 log1\nLOGSTART 234 log2\nLOGSTART 345 foo"),
			ExpectedTokenized: []string{
				"LOGSTART 123 LOGSTART 345 log1\n",
				"LOGSTART 234 log2\n",
			},
		},
		{
			Name:              "NoMatches",
			Pattern:           `LOGSTART \d+ `,
			Raw:               []byte(`file that has no matches in it`),
			ExpectedTokenized: []string{},
		},
		{
			Name:    "PrecedingNonMatches",
			Pattern: `LOGSTART \d+ `,
			Raw:     []byte(`part that doesn't match LOGSTART 123 part that matchesLOGSTART 123 foo`),
			ExpectedTokenized: []string{
				`part that doesn't match `,
				`LOGSTART 123 part that matches`,
			},
		},
		{
			Name:    "HugeLog100",
			Pattern: `LOGSTART \d+ `,
			Raw: func() []byte {
				newRaw := []byte(`LOGSTART 123 `)
				newRaw = append(newRaw, generatedByteSliceOfLength(100)...)
				newRaw = append(newRaw, []byte(`LOGSTART 234 endlog`)...)
				return newRaw
			}(),
			ExpectedTokenized: []string{
				`LOGSTART 123 ` + string(generatedByteSliceOfLength(100)),
			},
		},
		{
			Name:    "HugeLog10000",
			Pattern: `LOGSTART \d+ `,
			Raw: func() []byte {
				newRaw := []byte(`LOGSTART 123 `)
				newRaw = append(newRaw, generatedByteSliceOfLength(10000)...)
				newRaw = append(newRaw, []byte(`LOGSTART 234 endlog`)...)
				return newRaw
			}(),
			ExpectedTokenized: []string{
				`LOGSTART 123 ` + string(generatedByteSliceOfLength(10000)),
			},
		},
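		// bufio.Scanner caps tokens at bufio.MaxScanTokenSize (64 KiB by
		// default), so a 1 MB entry should fail with bufio.ErrTooLong.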
		{
			Name:    "ErrTooLong",
			Pattern: `LOGSTART \d+ `,
			Raw: func() []byte {
				newRaw := []byte(`LOGSTART 123 `)
				newRaw = append(newRaw, generatedByteSliceOfLength(1000000)...)
				newRaw = append(newRaw, []byte(`LOGSTART 234 endlog`)...)
				return newRaw
			}(),
			ExpectedError:     errors.New("bufio.Scanner: token too long"),
			ExpectedTokenized: []string{},
		},
	}

	for _, tc := range testCases {
		cfg := NewInputConfig("")
		cfg.Multiline = &MultilineConfig{
			LineStartPattern: tc.Pattern,
		}
		splitFunc, err := cfg.getSplitFunc(unicode.UTF8)
		require.NoError(t, err)
		t.Run(tc.Name, tc.RunFunc(splitFunc))
	}

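	// A match that runs flush to the end of the available data is ambiguous:
	// the token boundary is only known once the next match (or more data) is
	// seen, so the split function should request more input rather than emit.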
	t.Run("FirstMatchHitsEndOfBuffer", func(t *testing.T) {
		splitFunc := NewLineStartSplitFunc(regexp.MustCompile("LOGSTART"))
		data := []byte(`LOGSTART`)

		t.Run("NotAtEOF", func(t *testing.T) {
			advance, token, err := splitFunc(data, false)
			require.NoError(t, err)
			require.Equal(t, 0, advance)
			require.Nil(t, token)
		})

		t.Run("AtEOF", func(t *testing.T) {
			advance, token, err := splitFunc(data, true)
			require.NoError(t, err)
			require.Equal(t, 0, advance)
			require.Nil(t, token)
		})
	})
}

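// TestLineEndSplitFunc covers splitting on a line-end pattern: each token runs
// from the end of the previous match through the end of the current one, and
// trailing data with no match is held back.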
func TestLineEndSplitFunc(t *testing.T) {
	testCases := []tokenizerTestCase{
		{
			Name:    "OneLogSimple",
			Pattern: `LOGEND \d+`,
			Raw:     []byte(`my log LOGEND 123`),
			ExpectedTokenized: []string{
				`my log LOGEND 123`,
			},
		},
		{
			Name:    "TwoLogsSimple",
			Pattern: `LOGEND \d+`,
			Raw:     []byte(`log1 LOGEND 123log2 LOGEND 234`),
			ExpectedTokenized: []string{
				`log1 LOGEND 123`,
				`log2 LOGEND 234`,
			},
		},
		{
			Name:    "TwoLogsLineEndSimple",
			Pattern: `LOGEND$`,
			Raw:     []byte("log1 LOGEND LOGEND\nlog2 LOGEND\n"),
			ExpectedTokenized: []string{
				"log1 LOGEND LOGEND",
				"\nlog2 LOGEND",
			},
		},
		{
			Name:              "NoMatches",
			Pattern:           `LOGEND \d+`,
			Raw:               []byte(`file that has no matches in it`),
			ExpectedTokenized: []string{},
		},
		{
			Name:    "NonMatchesAfter",
			Pattern: `LOGEND \d+`,
			Raw:     []byte(`part that matches LOGEND 123 part that doesn't match`),
			ExpectedTokenized: []string{
				`part that matches LOGEND 123`,
			},
		},
		{
			Name:    "HugeLog100",
			Pattern: `LOGEND \d`,
			Raw: func() []byte {
				newRaw := generatedByteSliceOfLength(100)
				newRaw = append(newRaw, []byte(`LOGEND 1 `)...)
				return newRaw
			}(),
			ExpectedTokenized: []string{
				string(generatedByteSliceOfLength(100)) + `LOGEND 1`,
			},
		},
		{
			Name:    "HugeLog10000",
			Pattern: `LOGEND \d`,
			Raw: func() []byte {
				newRaw := generatedByteSliceOfLength(10000)
				newRaw = append(newRaw, []byte(`LOGEND 1 `)...)
				return newRaw
			}(),
			ExpectedTokenized: []string{
				string(generatedByteSliceOfLength(10000)) + `LOGEND 1`,
			},
		},
		{
			Name:    "HugeLog1000000",
			Pattern: `LOGEND \d`,
			Raw: func() []byte {
				newRaw := generatedByteSliceOfLength(1000000)
				newRaw = append(newRaw, []byte(`LOGEND 1 `)...)
				return newRaw
			}(),
			ExpectedTokenized: []string{},
			ExpectedError:     errors.New("bufio.Scanner: token too long"),
		},
	}

	for _, tc := range testCases {
		cfg := NewInputConfig("")
		cfg.Multiline = &MultilineConfig{
			LineEndPattern: tc.Pattern,
		}
		splitFunc, err := cfg.getSplitFunc(unicode.UTF8)
		require.NoError(t, err)
		t.Run(tc.Name, tc.RunFunc(splitFunc))
	}
}

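// TestNewlineSplitFunc covers the default newline-delimited behavior: tokens
// are complete lines with the newline stripped, and a final line without a
// trailing newline is not emitted.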
func TestNewlineSplitFunc(t *testing.T) {
	testCases := []tokenizerTestCase{
		{
			Name: "OneLogSimple",
			Raw:  []byte("my log\n"),
			ExpectedTokenized: []string{
				`my log`,
			},
		},
		{
			Name: "TwoLogsSimple",
			Raw:  []byte("log1\nlog2\n"),
			ExpectedTokenized: []string{
				`log1`,
				`log2`,
			},
		},
		{
			Name:              "NoTrailingNewline",
			Raw:               []byte(`foo`),
			ExpectedTokenized: []string{},
		},
		{
			Name: "HugeLog100",
			Raw: func() []byte {
				newRaw := generatedByteSliceOfLength(100)
				newRaw = append(newRaw, '\n')
				return newRaw
			}(),
			ExpectedTokenized: []string{
				string(generatedByteSliceOfLength(100)),
			},
		},
		{
			Name: "HugeLog10000",
			Raw: func() []byte {
				newRaw := generatedByteSliceOfLength(10000)
				newRaw = append(newRaw, '\n')
				return newRaw
			}(),
			ExpectedTokenized: []string{
				string(generatedByteSliceOfLength(10000)),
			},
		},
		{
			Name: "HugeLog1000000",
			Raw: func() []byte {
				newRaw := generatedByteSliceOfLength(1000000)
				newRaw = append(newRaw, '\n')
				return newRaw
			}(),
			ExpectedTokenized: []string{},
			ExpectedError:     errors.New("bufio.Scanner: token too long"),
		},
	}

	for _, tc := range testCases {
		splitFunc, err := NewNewlineSplitFunc(unicode.UTF8)
		require.NoError(t, err)
		t.Run(tc.Name, tc.RunFunc(splitFunc))
	}
}

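// TestNewlineSplitFunc_Encodings verifies that the newline split function
// matches encoding-specific newline byte sequences (including \r\n) rather
// than assuming a single-byte delimiter, e.g. the two-byte {0, 10} newline
// in big-endian UTF-16.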
func TestNewlineSplitFunc_Encodings(t *testing.T) {
	cases := []struct {
		name     string
		encoding encoding.Encoding
		input    []byte
		tokens   [][]byte
	}{
		{
			"Simple",
			unicode.UTF8,
			[]byte("testlog\n"),
			[][]byte{[]byte("testlog")},
		},
		{
			"CarriageReturn",
			unicode.UTF8,
			[]byte("testlog\r\n"),
			[][]byte{[]byte("testlog")},
		},
		{
			"SimpleUTF16",
			unicode.UTF16(unicode.BigEndian, unicode.IgnoreBOM),
			[]byte{0, 116, 0, 101, 0, 115, 0, 116, 0, 108, 0, 111, 0, 103, 0, 10}, // testlog\n
			[][]byte{{0, 116, 0, 101, 0, 115, 0, 116, 0, 108, 0, 111, 0, 103}},
		},
		{
			"MultiUTF16",
			unicode.UTF16(unicode.BigEndian, unicode.IgnoreBOM),
			[]byte{0, 108, 0, 111, 0, 103, 0, 49, 0, 10, 0, 108, 0, 111, 0, 103, 0, 50, 0, 10}, // log1\nlog2\n
			[][]byte{
				{0, 108, 0, 111, 0, 103, 0, 49}, // log1
				{0, 108, 0, 111, 0, 103, 0, 50}, // log2
			},
		},
		{
			"MultiCarriageReturnUTF16",
			unicode.UTF16(unicode.BigEndian, unicode.IgnoreBOM),
			[]byte{0, 108, 0, 111, 0, 103, 0, 49, 0, 13, 0, 10, 0, 108, 0, 111, 0, 103, 0, 50, 0, 13, 0, 10}, // log1\r\nlog2\r\n
			[][]byte{
				{0, 108, 0, 111, 0, 103, 0, 49}, // log1
				{0, 108, 0, 111, 0, 103, 0, 50}, // log2
			},
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			splitFunc, err := NewNewlineSplitFunc(tc.encoding)
			require.NoError(t, err)
			scanner := bufio.NewScanner(bytes.NewReader(tc.input))
			scanner.Split(splitFunc)

			tokens := [][]byte{}
			for {
				ok := scanner.Scan()
				if !ok {
					require.NoError(t, scanner.Err())
					break
				}

				// scanner.Bytes() aliases the scanner's internal buffer, which
				// may be overwritten by a subsequent Scan, so copy the token
				// before retaining it.
				token := append([]byte(nil), scanner.Bytes()...)
				tokens = append(tokens, token)
			}

			require.Equal(t, tc.tokens, tokens)
		})
	}
}

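// generatedByteSliceOfLength builds a deterministic payload of the requested
// length by cycling through the lowercase alphabet.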
func generatedByteSliceOfLength(length int) []byte {
	chars := []byte(`abcdefghijklmnopqrstuvwxyz`)
	newSlice := make([]byte, length)
	for i := 0; i < length; i++ {
		newSlice[i] = chars[i%len(chars)]
	}
	return newSlice
}