github.com/observiq/carbon@v0.9.11-0.20200820160507-1b872e368a5e/operator/builtin/input/file/file.go (about)

     1  package file
     2  
     3  import (
     4  	"bufio"
     5  	"bytes"
     6  	"context"
     7  	"crypto/md5"
     8  	"encoding/gob"
     9  	"fmt"
    10  	"io"
    11  	"os"
    12  	"path/filepath"
    13  	"regexp"
    14  	"strings"
    15  	"sync"
    16  	"time"
    17  
    18  	"github.com/observiq/carbon/entry"
    19  	"github.com/observiq/carbon/operator"
    20  	"github.com/observiq/carbon/operator/helper"
    21  	"go.uber.org/zap"
    22  	"golang.org/x/text/encoding"
    23  	"golang.org/x/text/encoding/ianaindex"
    24  	"golang.org/x/text/encoding/unicode"
    25  )
    26  
    27  func init() {
    28  	operator.Register("file_input", func() operator.Builder { return NewInputConfig("") })
    29  }
    30  
    31  func NewInputConfig(operatorID string) *InputConfig {
    32  	return &InputConfig{
    33  		InputConfig:     helper.NewInputConfig(operatorID, "file_input"),
    34  		PollInterval:    operator.Duration{Duration: 200 * time.Millisecond},
    35  		IncludeFileName: true,
    36  		IncludeFilePath: false,
    37  		StartAt:         "end",
    38  		MaxLogSize:      1024 * 1024,
    39  		Encoding:        "nop",
    40  	}
    41  }
    42  
// InputConfig is the configuration of a file input operator.
//
// Include and Exclude are lists of glob patterns; Build rejects the
// configuration when Include is empty or when any pattern fails to parse.
// PollInterval is the period of the ticker on which the include patterns are
// re-evaluated. Multiline, when set, selects a regex-based split function
// instead of the newline one. IncludeFileName and IncludeFilePath decide
// whether the "file_name" and "file_path" label fields are handed to the
// reader. StartAt must be "beginning" or "end". MaxLogSize is passed to the
// reader unchanged (presumably an upper bound on a single entry in bytes;
// confirm against ReadToEnd). Encoding is resolved by lookupEncoding.
type InputConfig struct {
	helper.InputConfig `yaml:",inline"`

	Include []string `json:"include,omitempty" yaml:"include,omitempty"`
	Exclude []string `json:"exclude,omitempty" yaml:"exclude,omitempty"`

	PollInterval    operator.Duration `json:"poll_interval,omitempty"     yaml:"poll_interval,omitempty"`
	Multiline       *MultilineConfig  `json:"multiline,omitempty"         yaml:"multiline,omitempty"`
	IncludeFileName bool              `json:"include_file_name,omitempty" yaml:"include_file_name,omitempty"`
	IncludeFilePath bool              `json:"include_file_path,omitempty" yaml:"include_file_path,omitempty"`
	StartAt         string            `json:"start_at,omitempty"          yaml:"start_at,omitempty"`
	MaxLogSize      int               `json:"max_log_size,omitempty"      yaml:"max_log_size,omitempty"`
	Encoding        string            `json:"encoding,omitempty"          yaml:"encoding,omitempty"`
}
    58  
// MultilineConfig is the configuration of a multiline operation.
//
// Exactly one of LineStartPattern or LineEndPattern must be set; each is a
// regular expression that getSplitFunc compiles with the "(?m)" flag
// prepended.
type MultilineConfig struct {
	LineStartPattern string `json:"line_start_pattern" yaml:"line_start_pattern"`
	LineEndPattern   string `json:"line_end_pattern"   yaml:"line_end_pattern"`
}
    64  
    65  // Build will build a file input operator from the supplied configuration
    66  func (c InputConfig) Build(context operator.BuildContext) (operator.Operator, error) {
    67  	inputOperator, err := c.InputConfig.Build(context)
    68  	if err != nil {
    69  		return nil, err
    70  	}
    71  
    72  	if len(c.Include) == 0 {
    73  		return nil, fmt.Errorf("required argument `include` is empty")
    74  	}
    75  
    76  	// Ensure includes can be parsed as globs
    77  	for _, include := range c.Include {
    78  		_, err := filepath.Match(include, "matchstring")
    79  		if err != nil {
    80  			return nil, fmt.Errorf("parse include glob: %s", err)
    81  		}
    82  	}
    83  
    84  	// Ensure excludes can be parsed as globs
    85  	for _, exclude := range c.Exclude {
    86  		_, err := filepath.Match(exclude, "matchstring")
    87  		if err != nil {
    88  			return nil, fmt.Errorf("parse exclude glob: %s", err)
    89  		}
    90  	}
    91  
    92  	encoding, err := lookupEncoding(c.Encoding)
    93  	if err != nil {
    94  		return nil, err
    95  	}
    96  
    97  	splitFunc, err := c.getSplitFunc(encoding)
    98  	if err != nil {
    99  		return nil, err
   100  	}
   101  
   102  	var startAtBeginning bool
   103  	switch c.StartAt {
   104  	case "beginning":
   105  		startAtBeginning = true
   106  	case "end":
   107  		startAtBeginning = false
   108  	default:
   109  		return nil, fmt.Errorf("invalid start_at location '%s'", c.StartAt)
   110  	}
   111  
   112  	fileNameField := entry.NewNilField()
   113  	if c.IncludeFileName {
   114  		fileNameField = entry.NewLabelField("file_name")
   115  	}
   116  
   117  	filePathField := entry.NewNilField()
   118  	if c.IncludeFilePath {
   119  		filePathField = entry.NewLabelField("file_path")
   120  	}
   121  
   122  	operator := &InputOperator{
   123  		InputOperator:    inputOperator,
   124  		Include:          c.Include,
   125  		Exclude:          c.Exclude,
   126  		SplitFunc:        splitFunc,
   127  		PollInterval:     c.PollInterval.Raw(),
   128  		persist:          helper.NewScopedDBPersister(context.Database, c.ID()),
   129  		FilePathField:    filePathField,
   130  		FileNameField:    fileNameField,
   131  		runningFiles:     make(map[string]struct{}),
   132  		fileUpdateChan:   make(chan fileUpdateMessage, 10),
   133  		fingerprintBytes: 1000,
   134  		startAtBeginning: startAtBeginning,
   135  		encoding:         encoding,
   136  		MaxLogSize:       c.MaxLogSize,
   137  	}
   138  
   139  	return operator, nil
   140  }
   141  
// encodingOverrides maps lower-cased encoding names to the encoding used for
// them. lookupEncoding consults this table before falling back to the IANA
// index, so these entries take precedence. "utf-16" and "utf16" resolve to
// little-endian UTF-16 with the BOM ignored, the ASCII names resolve to UTF-8,
// and both "nop" and the empty string resolve to the no-op encoding.
var encodingOverrides = map[string]encoding.Encoding{
	"utf-16":   unicode.UTF16(unicode.LittleEndian, unicode.IgnoreBOM),
	"utf16":    unicode.UTF16(unicode.LittleEndian, unicode.IgnoreBOM),
	"utf8":     unicode.UTF8,
	"ascii":    unicode.UTF8,
	"us-ascii": unicode.UTF8,
	"nop":      encoding.Nop,
	"":         encoding.Nop,
}
   151  
   152  func lookupEncoding(enc string) (encoding.Encoding, error) {
   153  	if encoding, ok := encodingOverrides[strings.ToLower(enc)]; ok {
   154  		return encoding, nil
   155  	}
   156  	encoding, err := ianaindex.IANA.Encoding(enc)
   157  	if err != nil {
   158  		return nil, fmt.Errorf("unsupported encoding '%s'", enc)
   159  	}
   160  	if encoding == nil {
   161  		return nil, fmt.Errorf("no charmap defined for encoding '%s'", enc)
   162  	}
   163  	return encoding, nil
   164  }
   165  
   166  // getSplitFunc will return the split function associated the configured mode.
   167  func (c InputConfig) getSplitFunc(encoding encoding.Encoding) (bufio.SplitFunc, error) {
   168  	if c.Multiline == nil {
   169  		return NewNewlineSplitFunc(encoding)
   170  	}
   171  	endPattern := c.Multiline.LineEndPattern
   172  	startPattern := c.Multiline.LineStartPattern
   173  
   174  	switch {
   175  	case endPattern != "" && startPattern != "":
   176  		return nil, fmt.Errorf("only one of line_start_pattern or line_end_pattern can be set")
   177  	case endPattern == "" && startPattern == "":
   178  		return nil, fmt.Errorf("one of line_start_pattern or line_end_pattern must be set")
   179  	case endPattern != "":
   180  		re, err := regexp.Compile("(?m)" + c.Multiline.LineEndPattern)
   181  		if err != nil {
   182  			return nil, fmt.Errorf("compile line end regex: %s", err)
   183  		}
   184  		return NewLineEndSplitFunc(re), nil
   185  	case startPattern != "":
   186  		re, err := regexp.Compile("(?m)" + c.Multiline.LineStartPattern)
   187  		if err != nil {
   188  			return nil, fmt.Errorf("compile line start regex: %s", err)
   189  		}
   190  		return NewLineStartSplitFunc(re), nil
   191  	default:
   192  		return nil, fmt.Errorf("unreachable")
   193  	}
   194  }
   195  
// InputOperator is an operator that monitors files for entries.
//
// Include, Exclude, FilePathField, FileNameField, PollInterval, SplitFunc and
// MaxLogSize are populated by InputConfig.Build. persist stores the encoded
// knownFiles map. runningFiles holds the paths that currently have a reader
// goroutine and knownFiles holds per-path tracking state; both maps are meant
// to be touched only by the goroutine launched in Start, with reader
// goroutines reporting changes through fileUpdateChan. fingerprintBytes is the
// size threshold and read length used when fingerprinting files. wg tracks the
// polling goroutine, readerWg tracks reader goroutines, and cancel cancels the
// context handed to both.
type InputOperator struct {
	helper.InputOperator

	Include       []string
	Exclude       []string
	FilePathField entry.Field
	FileNameField entry.Field
	PollInterval  time.Duration
	SplitFunc     bufio.SplitFunc
	MaxLogSize    int

	persist helper.Persister

	runningFiles     map[string]struct{}
	knownFiles       map[string]*knownFileInfo
	startAtBeginning bool

	fileUpdateChan   chan fileUpdateMessage
	fingerprintBytes int64

	encoding encoding.Encoding

	wg       *sync.WaitGroup
	readerWg *sync.WaitGroup
	cancel   context.CancelFunc
}
   223  
// Start will start the file monitoring process.
//
// It loads the previously persisted known-file state and then launches a
// single polling goroutine. On every tick of PollInterval that goroutine
// resolves the include/exclude patterns, calls checkFile for each match and
// persists the known-file state; between ticks it applies update messages
// sent by reader goroutines. An error is returned only when the persisted
// state cannot be read, in which case no goroutine is started.
func (f *InputOperator) Start() error {
	ctx, cancel := context.WithCancel(context.Background())
	f.cancel = cancel
	f.wg = &sync.WaitGroup{}
	f.readerWg = &sync.WaitGroup{}

	var err error
	f.knownFiles, err = f.readKnownFiles()
	if err != nil {
		return fmt.Errorf("failed to read known files from database: %s", err)
	}

	f.wg.Add(1)
	go func() {
		// Deferred calls run in reverse order: first drain the remaining
		// reader messages, then persist the resulting state, and only then
		// signal that this goroutine is done.
		defer f.wg.Done()
		defer f.syncKnownFiles()
		defer f.drainMessages()

		globTicker := time.NewTicker(f.PollInterval)
		defer globTicker.Stop()

		// All accesses to runningFiles and knownFiles should be done from
		// this goroutine. That means that all private methods of InputOperator
		// are unsafe to call from multiple goroutines. Changes to these
		// maps should be done through the fileUpdateChan.
		firstCheck := true
		for {
			select {
			case <-ctx.Done():
				return
			case <-globTicker.C:
				matches := getMatches(f.Include, f.Exclude)
				// Warn only on the first poll so an empty match set does not
				// log on every tick.
				if firstCheck && len(matches) == 0 {
					f.Warnw("no files match the configured include patterns", "include", f.Include)
				}
				for _, match := range matches {
					f.checkFile(ctx, match, firstCheck)
				}
				f.syncKnownFiles()
				firstCheck = false
			case message, ok := <-f.fileUpdateChan:
				if ok {
					f.updateFile(message)
				}
			}
		}
	}()

	return nil
}
   275  
   276  // Stop will stop the file monitoring process
   277  func (f *InputOperator) Stop() error {
   278  	f.cancel()
   279  	f.wg.Wait()
   280  	f.fileUpdateChan = make(chan fileUpdateMessage)
   281  	f.knownFiles = nil
   282  	return nil
   283  }
   284  
   285  // checkFile is not safe to call from multiple goroutines
   286  //
   287  // firstCheck indicates whether this is the first time checkFile has been called
   288  // after startup. This is important for the start_at parameter because, after initial
   289  // startup, we don't want to start at the end of newly-created files.
   290  func (f *InputOperator) checkFile(ctx context.Context, path string, firstCheck bool) {
   291  
   292  	// Check if the file is currently being read
   293  	if _, ok := f.runningFiles[path]; ok {
   294  		return // file is already being read
   295  	}
   296  
   297  	// If the path is known, start from last offset
   298  	knownFile, isKnown := f.knownFiles[path]
   299  
   300  	// If the path is new, check if it was from a known file that was rotated
   301  	var err error
   302  	if !isKnown {
   303  		knownFile, err = newKnownFileInfo(path, f.fingerprintBytes, f.startAtBeginning || !firstCheck)
   304  		if err != nil {
   305  			f.Warnw("Failed to get info for file", zap.Error(err))
   306  			return
   307  		}
   308  
   309  		for _, knownInfo := range f.knownFiles {
   310  			if knownFile.fingerprintMatches(knownInfo) || knownFile.smallFileContentsMatches(knownInfo) {
   311  				// The file was rotated, so update the path
   312  				knownInfo.Path = path
   313  				knownFile = knownInfo
   314  				break
   315  			}
   316  		}
   317  	}
   318  
   319  	f.runningFiles[path] = struct{}{}
   320  	f.knownFiles[path] = knownFile
   321  	f.readerWg.Add(1)
   322  	go func(ctx context.Context, path string, offset, lastSeenSize int64) {
   323  		defer f.readerWg.Done()
   324  		messenger := f.newFileUpdateMessenger(path)
   325  		defer messenger.FinishedReading()
   326  		err := ReadToEnd(ctx, path, offset, lastSeenSize, messenger, f.SplitFunc, f.FilePathField, f.FileNameField, f.InputOperator, f.MaxLogSize, f.encoding)
   327  		if err != nil {
   328  			f.Warnw("Failed to read log file", zap.Error(err))
   329  		}
   330  	}(ctx, path, knownFile.Offset, knownFile.LastSeenFileSize)
   331  }
   332  
   333  func (f *InputOperator) updateFile(message fileUpdateMessage) {
   334  	if message.finished {
   335  		delete(f.runningFiles, message.path)
   336  		return
   337  	}
   338  
   339  	knownFile := f.knownFiles[message.path]
   340  
   341  	// This is a last seen size message, so just set the size and return
   342  	if message.lastSeenFileSize != -1 {
   343  		knownFile.LastSeenFileSize = message.lastSeenFileSize
   344  		return
   345  	}
   346  
   347  	if message.newOffset < knownFile.Offset {
   348  		// The file was truncated or rotated
   349  
   350  		newKnownFile, err := newKnownFileInfo(message.path, f.fingerprintBytes, true)
   351  		if err != nil {
   352  			f.Warnw("Failed to generate new file info", zap.Error(err))
   353  			return
   354  		}
   355  		f.knownFiles[message.path] = newKnownFile
   356  		return
   357  	}
   358  
   359  	if knownFile.Offset < f.fingerprintBytes && message.newOffset > f.fingerprintBytes {
   360  		// The file graduated from small file to fingerprinted file
   361  
   362  		file, err := os.Open(message.path)
   363  		if err != nil {
   364  			f.Warnw("Failed to open file for fingerprinting", zap.Error(err))
   365  			return
   366  		}
   367  		defer file.Close()
   368  		knownFile.Fingerprint, err = fingerprintFile(file, f.fingerprintBytes)
   369  		if err != nil {
   370  			f.Warnw("Failed to fingerprint file", zap.Error(err))
   371  			return
   372  		}
   373  		knownFile.IsSmallFile = false
   374  	} else if message.newOffset < f.fingerprintBytes {
   375  		// The file is a small file
   376  
   377  		file, err := os.Open(message.path)
   378  		if err != nil {
   379  			f.Warnw("Failed to open small file for content tracking", zap.Error(err))
   380  			return
   381  		}
   382  		defer file.Close()
   383  
   384  		buf := make([]byte, message.newOffset)
   385  		n, err := file.Read(buf)
   386  		if err != nil && err != io.EOF {
   387  			f.Warnw("Failed to read small file for content tracking", zap.Error(err))
   388  			return
   389  		}
   390  		knownFile.SmallFileContents = buf[:n]
   391  		knownFile.IsSmallFile = true
   392  	}
   393  
   394  	knownFile.Offset = message.newOffset
   395  }
   396  
   397  func (f *InputOperator) drainMessages() {
   398  	go func() {
   399  		f.readerWg.Wait()
   400  		close(f.fileUpdateChan)
   401  	}()
   402  
   403  	for {
   404  		message, ok := <-f.fileUpdateChan
   405  		if !ok {
   406  			return
   407  		}
   408  		f.updateFile(message)
   409  	}
   410  }
   411  
// knownFilesKey is the persister key under which the gob-encoded knownFiles
// map is stored by syncKnownFiles and loaded by readKnownFiles.
var knownFilesKey = "knownFiles"
   413  
   414  func (f *InputOperator) syncKnownFiles() {
   415  	var buf bytes.Buffer
   416  	enc := gob.NewEncoder(&buf)
   417  	err := enc.Encode(f.knownFiles)
   418  	if err != nil {
   419  		f.Errorw("Failed to encode known files", zap.Error(err))
   420  		return
   421  	}
   422  
   423  	f.persist.Set(knownFilesKey, buf.Bytes())
   424  	f.persist.Sync()
   425  }
   426  
   427  func (f *InputOperator) readKnownFiles() (map[string]*knownFileInfo, error) {
   428  	err := f.persist.Load()
   429  	if err != nil {
   430  		return nil, err
   431  	}
   432  
   433  	var knownFiles map[string]*knownFileInfo
   434  	encoded := f.persist.Get(knownFilesKey)
   435  	if encoded == nil {
   436  		knownFiles = make(map[string]*knownFileInfo)
   437  		return knownFiles, nil
   438  	}
   439  
   440  	dec := gob.NewDecoder(bytes.NewReader(encoded))
   441  	err = dec.Decode(&knownFiles)
   442  	if err != nil {
   443  		return nil, err
   444  	}
   445  
   446  	return knownFiles, nil
   447  }
   448  
   449  func (f *InputOperator) newFileUpdateMessenger(path string) fileUpdateMessenger {
   450  	return fileUpdateMessenger{
   451  		path: path,
   452  		c:    f.fileUpdateChan,
   453  	}
   454  }
   455  
   456  type knownFileInfo struct {
   457  	Path              string
   458  	IsSmallFile       bool
   459  	Fingerprint       []byte
   460  	SmallFileContents []byte
   461  	Offset            int64
   462  	LastSeenFileSize  int64
   463  }
   464  
   465  func newKnownFileInfo(path string, fingerprintBytes int64, startAtBeginning bool) (*knownFileInfo, error) {
   466  	file, err := os.Open(path)
   467  	if err != nil {
   468  		return nil, err
   469  	}
   470  	defer file.Close()
   471  
   472  	stat, err := file.Stat()
   473  	if err != nil {
   474  		return nil, err
   475  	}
   476  
   477  	var fingerprint []byte
   478  	var smallFileContents []byte
   479  	isSmallFile := false
   480  	size := stat.Size()
   481  	if size > fingerprintBytes {
   482  		fingerprint, err = fingerprintFile(file, fingerprintBytes)
   483  		if err != nil {
   484  			return nil, err
   485  		}
   486  	} else {
   487  		isSmallFile = true
   488  		buf := make([]byte, size)
   489  		n, err := file.Read(buf)
   490  		if err != nil {
   491  			return nil, err
   492  		}
   493  		smallFileContents = buf[:n]
   494  	}
   495  
   496  	var offset int64
   497  	if startAtBeginning {
   498  		offset = 0
   499  	} else {
   500  		offset = stat.Size()
   501  	}
   502  
   503  	return &knownFileInfo{
   504  		Path:              path,
   505  		Fingerprint:       fingerprint,
   506  		SmallFileContents: smallFileContents,
   507  		IsSmallFile:       isSmallFile,
   508  		Offset:            offset,
   509  	}, nil
   510  }
   511  
   512  func (i *knownFileInfo) smallFileContentsMatches(other *knownFileInfo) bool {
   513  	if !(i.IsSmallFile && other.IsSmallFile) {
   514  		return false
   515  	}
   516  
   517  	// compare the smaller of the two known files
   518  	var s int
   519  	if len(i.SmallFileContents) > len(other.SmallFileContents) {
   520  		s = len(other.SmallFileContents)
   521  	} else {
   522  		s = len(i.SmallFileContents)
   523  	}
   524  
   525  	return bytes.Equal(i.SmallFileContents[:s], other.SmallFileContents[:s])
   526  }
   527  
   528  func (i *knownFileInfo) fingerprintMatches(other *knownFileInfo) bool {
   529  	if i.IsSmallFile || other.IsSmallFile {
   530  		return false
   531  	}
   532  	return bytes.Equal(i.Fingerprint, other.Fingerprint)
   533  }
   534  
   535  func fingerprintFile(file *os.File, numBytes int64) ([]byte, error) {
   536  	_, err := file.Seek(0, io.SeekStart)
   537  	if err != nil {
   538  		return nil, err
   539  	}
   540  	hash := md5.New()
   541  
   542  	buffer := make([]byte, numBytes)
   543  	io.ReadFull(file, buffer)
   544  	hash.Write(buffer)
   545  	return hash.Sum(nil), nil
   546  }
   547  
   548  type fileUpdateMessage struct {
   549  	path             string
   550  	newOffset        int64
   551  	lastSeenFileSize int64
   552  	finished         bool
   553  }
   554  
   555  type fileUpdateMessenger struct {
   556  	c    chan fileUpdateMessage
   557  	path string
   558  }
   559  
   560  func (f *fileUpdateMessenger) SetOffset(offset int64) {
   561  	f.c <- fileUpdateMessage{
   562  		path:             f.path,
   563  		newOffset:        offset,
   564  		lastSeenFileSize: -1,
   565  	}
   566  }
   567  
   568  func (f *fileUpdateMessenger) SetLastSeenFileSize(size int64) {
   569  	f.c <- fileUpdateMessage{
   570  		path:             f.path,
   571  		lastSeenFileSize: size,
   572  	}
   573  }
   574  
   575  func (f *fileUpdateMessenger) FinishedReading() {
   576  	f.c <- fileUpdateMessage{
   577  		path:             f.path,
   578  		finished:         true,
   579  		lastSeenFileSize: -1,
   580  	}
   581  }
   582  
   583  func getMatches(includes, excludes []string) []string {
   584  	all := make([]string, 0, len(includes))
   585  	for _, include := range includes {
   586  		matches, _ := filepath.Glob(include) // compile error checked in build
   587  	INCLUDE:
   588  		for _, match := range matches {
   589  			for _, exclude := range excludes {
   590  				if itMatches, _ := filepath.Match(exclude, match); itMatches {
   591  					break INCLUDE
   592  				}
   593  			}
   594  
   595  			for _, existing := range all {
   596  				if existing == match {
   597  					break INCLUDE
   598  				}
   599  			}
   600  
   601  			all = append(all, match)
   602  		}
   603  	}
   604  
   605  	return all
   606  }