// github.com/observiq/carbon@v0.9.11-0.20200820160507-1b872e368a5e/operator/builtin/input/file/line_splitter_test.go

package file

import (
	"bufio"
	"bytes"
	"errors"
	"regexp"
	"testing"

	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"
	"golang.org/x/text/encoding"
	"golang.org/x/text/encoding/unicode"
)

// tokenizerTestCase describes one split-function scenario: the raw input, an
// optional pattern, and the tokens and final scan error we expect to see.
type tokenizerTestCase struct {
	Name              string
	Pattern           string
	Raw               []byte
	ExpectedTokenized []string
	ExpectedError     error
}

// RunFunc returns a subtest that scans tc.Raw with splitFunc and compares the
// produced tokens and the scanner's final error against the expectations.
func (tc tokenizerTestCase) RunFunc(splitFunc bufio.SplitFunc) func(t *testing.T) {
	return func(t *testing.T) {
		scanner := bufio.NewScanner(bytes.NewReader(tc.Raw))
		scanner.Split(splitFunc)
		tokenized := make([]string, 0)
		for {
			ok := scanner.Scan()
			if !ok {
				assert.Equal(t, tc.ExpectedError, scanner.Err())
				break
			}
			tokenized = append(tokenized, scanner.Text())
		}

		assert.Equal(t, tc.ExpectedTokenized, tokenized)
	}
}

func TestLineStartSplitFunc(t *testing.T) {
	testCases := []tokenizerTestCase{
		{
			Name:    "OneLogSimple",
			Pattern: `LOGSTART \d+ `,
			Raw:     []byte("LOGSTART 123 log1LOGSTART 123 a"),
			ExpectedTokenized: []string{
				`LOGSTART 123 log1`,
			},
		},
		{
			Name:    "TwoLogsSimple",
			Pattern: `LOGSTART \d+ `,
			Raw:     []byte(`LOGSTART 123 log1 LOGSTART 234 log2 LOGSTART 345 foo`),
			ExpectedTokenized: []string{
				`LOGSTART 123 log1 `,
				`LOGSTART 234 log2 `,
			},
		},
		{
			Name:    "TwoLogsLineStart",
			Pattern: `^LOGSTART \d+ `,
			Raw:     []byte("LOGSTART 123 LOGSTART 345 log1\nLOGSTART 234 log2\nLOGSTART 345 foo"),
			ExpectedTokenized: []string{
				"LOGSTART 123 LOGSTART 345 log1\n",
				"LOGSTART 234 log2\n",
			},
		},
		{
			Name:              "NoMatches",
			Pattern:           `LOGSTART \d+ `,
			Raw:               []byte(`file that has no matches in it`),
			ExpectedTokenized: []string{},
		},
		{
			Name:    "PrecedingNonMatches",
			Pattern: `LOGSTART \d+ `,
			Raw:     []byte(`part that doesn't match LOGSTART 123 part that matchesLOGSTART 123 foo`),
			ExpectedTokenized: []string{
				`part that doesn't match `,
				`LOGSTART 123 part that matches`,
			},
		},
		{
			Name:    "HugeLog100",
			Pattern: `LOGSTART \d+ `,
			Raw: func() []byte {
				newRaw := []byte(`LOGSTART 123 `)
				newRaw = append(newRaw, generatedByteSliceOfLength(100)...)
				newRaw = append(newRaw, []byte(`LOGSTART 234 endlog`)...)
				return newRaw
			}(),
			ExpectedTokenized: []string{
				`LOGSTART 123 ` + string(generatedByteSliceOfLength(100)),
			},
		},
		{
			Name:    "HugeLog10000",
			Pattern: `LOGSTART \d+ `,
			Raw: func() []byte {
				newRaw := []byte(`LOGSTART 123 `)
				newRaw = append(newRaw, generatedByteSliceOfLength(10000)...)
				newRaw = append(newRaw, []byte(`LOGSTART 234 endlog`)...)
				return newRaw
			}(),
			ExpectedTokenized: []string{
				`LOGSTART 123 ` + string(generatedByteSliceOfLength(10000)),
			},
		},
		{
			Name:    "ErrTooLong",
			Pattern: `LOGSTART \d+ `,
			Raw: func() []byte {
				newRaw := []byte(`LOGSTART 123 `)
				newRaw = append(newRaw, generatedByteSliceOfLength(1000000)...)
				newRaw = append(newRaw, []byte(`LOGSTART 234 endlog`)...)
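				// The ~1 MB token exceeds bufio.MaxScanTokenSize (64 KiB), the
				// default bufio.Scanner limit, so Scan fails with bufio.ErrTooLong.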
				return newRaw
			}(),
			ExpectedError:     errors.New("bufio.Scanner: token too long"),
			ExpectedTokenized: []string{},
		},
	}

	for _, tc := range testCases {
		cfg := NewInputConfig("")
		cfg.Multiline = &MultilineConfig{
			LineStartPattern: tc.Pattern,
		}
		splitFunc, err := cfg.getSplitFunc(unicode.UTF8)
		require.NoError(t, err)
		t.Run(tc.Name, tc.RunFunc(splitFunc))
	}

	t.Run("FirstMatchHitsEndOfBuffer", func(t *testing.T) {
		splitFunc := NewLineStartSplitFunc(regexp.MustCompile("LOGSTART"))
		data := []byte(`LOGSTART`)

		t.Run("NotAtEOF", func(t *testing.T) {
			advance, token, err := splitFunc(data[:], false)
			require.NoError(t, err)
			require.Equal(t, 0, advance)
			require.Nil(t, token)
		})

		t.Run("AtEOF", func(t *testing.T) {
			advance, token, err := splitFunc(data[:], true)
			require.NoError(t, err)
			require.Equal(t, 0, advance)
			require.Nil(t, token)
		})
	})
}

func TestLineEndSplitFunc(t *testing.T) {
	testCases := []tokenizerTestCase{
		{
			Name:    "OneLogSimple",
			Pattern: `LOGEND \d+`,
			Raw:     []byte(`my log LOGEND 123`),
			ExpectedTokenized: []string{
				`my log LOGEND 123`,
			},
		},
		{
			Name:    "TwoLogsSimple",
			Pattern: `LOGEND \d+`,
			Raw:     []byte(`log1 LOGEND 123log2 LOGEND 234`),
			ExpectedTokenized: []string{
				`log1 LOGEND 123`,
				`log2 LOGEND 234`,
			},
		},
		{
			Name:    "TwoLogsLineEndSimple",
			Pattern: `LOGEND$`,
			Raw:     []byte("log1 LOGEND LOGEND\nlog2 LOGEND\n"),
			ExpectedTokenized: []string{
				"log1 LOGEND LOGEND",
				"\nlog2 LOGEND",
			},
		},
		{
			Name:              "NoMatches",
			Pattern:           `LOGEND \d+`,
			Raw:               []byte(`file that has no matches in it`),
			ExpectedTokenized: []string{},
		},
		{
			Name:    "NonMatchesAfter",
			Pattern: `LOGEND \d+`,
			Raw:     []byte(`part that matches LOGEND 123 part that doesn't match`),
			ExpectedTokenized: []string{
				`part that matches LOGEND 123`,
			},
		},
		{
			Name:    "HugeLog100",
			Pattern: `LOGEND \d`,
			Raw: func() []byte {
				newRaw := generatedByteSliceOfLength(100)
				newRaw = append(newRaw, []byte(`LOGEND 1 `)...)
				return newRaw
			}(),
			ExpectedTokenized: []string{
				string(generatedByteSliceOfLength(100)) + `LOGEND 1`,
			},
		},
		{
			Name:    "HugeLog10000",
			Pattern: `LOGEND \d`,
			Raw: func() []byte {
				newRaw := generatedByteSliceOfLength(10000)
				newRaw = append(newRaw, []byte(`LOGEND 1 `)...)
				return newRaw
			}(),
			ExpectedTokenized: []string{
				string(generatedByteSliceOfLength(10000)) + `LOGEND 1`,
			},
		},
		{
			Name:    "HugeLog1000000",
			Pattern: `LOGEND \d`,
			Raw: func() []byte {
				newRaw := generatedByteSliceOfLength(1000000)
				newRaw = append(newRaw, []byte(`LOGEND 1 `)...)
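				// Same overflow as TestLineStartSplitFunc/ErrTooLong: ~1 MB is
				// well past the scanner's 64 KiB default token limit.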
				return newRaw
			}(),
			ExpectedTokenized: []string{},
			ExpectedError:     errors.New("bufio.Scanner: token too long"),
		},
	}

	for _, tc := range testCases {
		cfg := NewInputConfig("")
		cfg.Multiline = &MultilineConfig{
			LineEndPattern: tc.Pattern,
		}
		splitFunc, err := cfg.getSplitFunc(unicode.UTF8)
		require.NoError(t, err)
		t.Run(tc.Name, tc.RunFunc(splitFunc))
	}
}

func TestNewlineSplitFunc(t *testing.T) {
	testCases := []tokenizerTestCase{
		{
			Name: "OneLogSimple",
			Raw:  []byte("my log\n"),
			ExpectedTokenized: []string{
				`my log`,
			},
		},
		{
			Name: "TwoLogsSimple",
			Raw:  []byte("log1\nlog2\n"),
			ExpectedTokenized: []string{
				`log1`,
				`log2`,
			},
		},
		{
			Name:              "NoTrailingNewline",
			Raw:               []byte(`foo`),
			ExpectedTokenized: []string{},
		},
		{
			Name: "HugeLog100",
			Raw: func() []byte {
				newRaw := generatedByteSliceOfLength(100)
				newRaw = append(newRaw, '\n')
				return newRaw
			}(),
			ExpectedTokenized: []string{
				string(generatedByteSliceOfLength(100)),
			},
		},
		{
			Name: "HugeLog10000",
			Raw: func() []byte {
				newRaw := generatedByteSliceOfLength(10000)
				newRaw = append(newRaw, '\n')
				return newRaw
			}(),
			ExpectedTokenized: []string{
				string(generatedByteSliceOfLength(10000)),
			},
		},
		{
			Name: "HugeLog1000000",
			Raw: func() []byte {
				newRaw := generatedByteSliceOfLength(1000000)
				newRaw = append(newRaw, '\n')
				return newRaw
			}(),
			ExpectedTokenized: []string{},
			ExpectedError:     errors.New("bufio.Scanner: token too long"),
		},
	}

	for _, tc := range testCases {
		splitFunc, err := NewNewlineSplitFunc(unicode.UTF8)
		require.NoError(t, err)
		t.Run(tc.Name, tc.RunFunc(splitFunc))
	}
}

func TestNewlineSplitFunc_Encodings(t *testing.T) {
	cases := []struct {
		name     string
		encoding encoding.Encoding
		input    []byte
		tokens   [][]byte
	}{
		{
			"Simple",
			unicode.UTF8,
			[]byte("testlog\n"),
			[][]byte{[]byte("testlog")},
		},
		{
			"CarriageReturn",
			unicode.UTF8,
			[]byte("testlog\r\n"),
			[][]byte{[]byte("testlog")},
		},
		{
			"SimpleUTF16",
			unicode.UTF16(unicode.BigEndian, unicode.IgnoreBOM),
			[]byte{0, 116, 0, 101, 0, 115, 0, 116, 0, 108, 0, 111, 0, 103, 0, 10}, // testlog\n
			[][]byte{{0, 116, 0, 101, 0, 115, 0, 116, 0, 108, 0, 111, 0, 103}},
		},
		{
			"MultiUTF16",
			unicode.UTF16(unicode.BigEndian, unicode.IgnoreBOM),
			[]byte{0, 108, 0, 111, 0, 103, 0, 49, 0, 10, 0, 108, 0, 111, 0, 103, 0, 50, 0, 10}, // log1\nlog2\n
			[][]byte{
				{0, 108, 0, 111, 0, 103, 0, 49}, // log1
				{0, 108, 0, 111, 0, 103, 0, 50}, // log2
			},
		},
		{
			"MultiCarriageReturnUTF16",
			unicode.UTF16(unicode.BigEndian, unicode.IgnoreBOM),
			[]byte{0, 108, 0, 111, 0, 103, 0, 49, 0, 13, 0, 10, 0, 108, 0, 111, 0, 103, 0, 50, 0, 13, 0, 10}, // log1\r\nlog2\r\n
			[][]byte{
				{0, 108, 0, 111, 0, 103, 0, 49}, // log1
				{0, 108, 0, 111, 0, 103, 0, 50}, // log2
			},
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			splitFunc, err := NewNewlineSplitFunc(tc.encoding)
			require.NoError(t, err)
			scanner := bufio.NewScanner(bytes.NewReader(tc.input))
			scanner.Split(splitFunc)

			tokens := [][]byte{}
			for {
				ok := scanner.Scan()
				if !ok {
					require.NoError(t, scanner.Err())
					break
				}

				// Copy the token: the slice returned by Scanner.Bytes may be
				// overwritten by a subsequent call to Scan.
				tokens = append(tokens, append([]byte(nil), scanner.Bytes()...))
			}

			require.Equal(t, tc.tokens, tokens)
		})
	}
}

// generatedByteSliceOfLength returns a deterministic slice of the given
// length, cycling through the lowercase alphabet.
func generatedByteSliceOfLength(length int) []byte {
	chars := []byte(`abcdefghijklmnopqrstuvwxyz`)
	newSlice := make([]byte, length)
	for i := 0; i < length; i++ {
		newSlice[i] = chars[i%len(chars)]
	}
	return newSlice
}
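// scanAllTokens is a hypothetical helper, not part of the original file: a
// minimal sketch of how a caller could avoid the "token too long" failures
// exercised above by raising the scanner's per-token limit with
// Scanner.Buffer (the default cap is bufio.MaxScanTokenSize, 64 KiB). The
// name and maxTokenSize parameter are illustrative assumptions.
func scanAllTokens(raw []byte, splitFunc bufio.SplitFunc, maxTokenSize int) ([]string, error) {
	scanner := bufio.NewScanner(bytes.NewReader(raw))
	// Start with an empty buffer, but allow it to grow to maxTokenSize bytes.
	scanner.Buffer(make([]byte, 0, bufio.MaxScanTokenSize), maxTokenSize)
	scanner.Split(splitFunc)
	tokens := make([]string, 0)
	for scanner.Scan() {
		tokens = append(tokens, scanner.Text())
	}
	return tokens, scanner.Err()
}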