github.com/observiq/carbon@v0.9.11-0.20200820160507-1b872e368a5e/operator/builtin/input/file/line_splitter.go (about)

     1  package file
     2  
     3  import (
     4  	"bufio"
     5  	"bytes"
     6  	"regexp"
     7  
     8  	"golang.org/x/text/encoding"
     9  )
    10  
    11  // NewLineStartSplitFunc creates a bufio.SplitFunc that splits an incoming stream into
    12  // tokens that start with a match to the regex pattern provided
    13  func NewLineStartSplitFunc(re *regexp.Regexp) bufio.SplitFunc {
    14  	return func(data []byte, atEOF bool) (advance int, token []byte, err error) {
    15  		firstLoc := re.FindIndex(data)
    16  		if firstLoc == nil {
    17  			return 0, nil, nil // read more data and try again.
    18  		}
    19  		firstMatchStart := firstLoc[0]
    20  		firstMatchEnd := firstLoc[1]
    21  
    22  		if firstMatchStart != 0 {
    23  			// the beginning of the file does not match the start pattern, so return a token up to the first match so we don't lose data
    24  			advance = firstMatchStart
    25  			token = data[0:firstMatchStart]
    26  			return
    27  		}
    28  
    29  		if firstMatchEnd == len(data) {
    30  			// the first match goes to the end of the buffer, so don't look for a second match
    31  			return 0, nil, nil
    32  		}
    33  
    34  		secondLocOffset := firstMatchEnd + 1
    35  		secondLoc := re.FindIndex(data[secondLocOffset:])
    36  		if secondLoc == nil {
    37  			return 0, nil, nil // read more data and try again
    38  		}
    39  		secondMatchStart := secondLoc[0] + secondLocOffset
    40  
    41  		advance = secondMatchStart                     // start scanning at the beginning of the second match
    42  		token = data[firstMatchStart:secondMatchStart] // the token begins at the first match, and ends at the beginning of the second match
    43  		err = nil
    44  		return
    45  	}
    46  }
    47  
    48  // NewLineEndSplitFunc creates a bufio.SplitFunc that splits an incoming stream into
    49  // tokens that end with a match to the regex pattern provided
    50  func NewLineEndSplitFunc(re *regexp.Regexp) bufio.SplitFunc {
    51  	return func(data []byte, atEOF bool) (advance int, token []byte, err error) {
    52  		loc := re.FindIndex(data)
    53  		if loc == nil {
    54  			return 0, nil, nil // read more data and try again
    55  		}
    56  
    57  		// If the match goes up to the end of the current buffer, do another
    58  		// read until we can capture the entire match
    59  		if loc[1] == len(data)-1 && !atEOF {
    60  			return 0, nil, nil
    61  		}
    62  
    63  		advance = loc[1]
    64  		token = data[:loc[1]]
    65  		err = nil
    66  		return
    67  	}
    68  }
    69  
    70  // NewNewlineSplitFunc splits log lines by newline, just as bufio.ScanLines, but
    71  // never returning an token using EOF as a terminator
    72  func NewNewlineSplitFunc(encoding encoding.Encoding) (bufio.SplitFunc, error) {
    73  	newline, err := encodedNewline(encoding)
    74  	if err != nil {
    75  		return nil, err
    76  	}
    77  
    78  	carriageReturn, err := encodedCarriageReturn(encoding)
    79  	if err != nil {
    80  		return nil, err
    81  	}
    82  
    83  	return func(data []byte, atEOF bool) (advance int, token []byte, err error) {
    84  		if atEOF && len(data) == 0 {
    85  			return 0, nil, nil
    86  		}
    87  
    88  		if i := bytes.Index(data, newline); i >= 0 {
    89  			// We have a full newline-terminated line.
    90  			return i + len(newline), bytes.TrimSuffix(data[:i], carriageReturn), nil
    91  		}
    92  
    93  		// Request more data.
    94  		return 0, nil, nil
    95  	}, nil
    96  }
    97  
    98  func encodedNewline(encoding encoding.Encoding) ([]byte, error) {
    99  	out := make([]byte, 10)
   100  	nDst, _, err := encoding.NewEncoder().Transform(out, []byte{'\n'}, true)
   101  	return out[:nDst], err
   102  }
   103  
   104  func encodedCarriageReturn(encoding encoding.Encoding) ([]byte, error) {
   105  	out := make([]byte, 10)
   106  	nDst, _, err := encoding.NewEncoder().Transform(out, []byte{'\r'}, true)
   107  	return out[:nDst], err
   108  }