github.com/observiq/carbon@v0.9.11-0.20200820160507-1b872e368a5e/operator/builtin/input/file/file.go (about) 1 package file 2 3 import ( 4 "bufio" 5 "bytes" 6 "context" 7 "crypto/md5" 8 "encoding/gob" 9 "fmt" 10 "io" 11 "os" 12 "path/filepath" 13 "regexp" 14 "strings" 15 "sync" 16 "time" 17 18 "github.com/observiq/carbon/entry" 19 "github.com/observiq/carbon/operator" 20 "github.com/observiq/carbon/operator/helper" 21 "go.uber.org/zap" 22 "golang.org/x/text/encoding" 23 "golang.org/x/text/encoding/ianaindex" 24 "golang.org/x/text/encoding/unicode" 25 ) 26 27 func init() { 28 operator.Register("file_input", func() operator.Builder { return NewInputConfig("") }) 29 } 30 31 func NewInputConfig(operatorID string) *InputConfig { 32 return &InputConfig{ 33 InputConfig: helper.NewInputConfig(operatorID, "file_input"), 34 PollInterval: operator.Duration{Duration: 200 * time.Millisecond}, 35 IncludeFileName: true, 36 IncludeFilePath: false, 37 StartAt: "end", 38 MaxLogSize: 1024 * 1024, 39 Encoding: "nop", 40 } 41 } 42 43 // InputConfig is the configuration of a file input operator 44 type InputConfig struct { 45 helper.InputConfig `yaml:",inline"` 46 47 Include []string `json:"include,omitempty" yaml:"include,omitempty"` 48 Exclude []string `json:"exclude,omitempty" yaml:"exclude,omitempty"` 49 50 PollInterval operator.Duration `json:"poll_interval,omitempty" yaml:"poll_interval,omitempty"` 51 Multiline *MultilineConfig `json:"multiline,omitempty" yaml:"multiline,omitempty"` 52 IncludeFileName bool `json:"include_file_name,omitempty" yaml:"include_file_name,omitempty"` 53 IncludeFilePath bool `json:"include_file_path,omitempty" yaml:"include_file_path,omitempty"` 54 StartAt string `json:"start_at,omitempty" yaml:"start_at,omitempty"` 55 MaxLogSize int `json:"max_log_size,omitempty" yaml:"max_log_size,omitempty"` 56 Encoding string `json:"encoding,omitempty" yaml:"encoding,omitempty"` 57 } 58 59 // MultilineConfig is the configuration a multiline operation 60 type MultilineConfig struct { 61 LineStartPattern string `json:"line_start_pattern" yaml:"line_start_pattern"` 62 LineEndPattern string `json:"line_end_pattern" yaml:"line_end_pattern"` 63 } 64 65 // Build will build a file input operator from the supplied configuration 66 func (c InputConfig) Build(context operator.BuildContext) (operator.Operator, error) { 67 inputOperator, err := c.InputConfig.Build(context) 68 if err != nil { 69 return nil, err 70 } 71 72 if len(c.Include) == 0 { 73 return nil, fmt.Errorf("required argument `include` is empty") 74 } 75 76 // Ensure includes can be parsed as globs 77 for _, include := range c.Include { 78 _, err := filepath.Match(include, "matchstring") 79 if err != nil { 80 return nil, fmt.Errorf("parse include glob: %s", err) 81 } 82 } 83 84 // Ensure excludes can be parsed as globs 85 for _, exclude := range c.Exclude { 86 _, err := filepath.Match(exclude, "matchstring") 87 if err != nil { 88 return nil, fmt.Errorf("parse exclude glob: %s", err) 89 } 90 } 91 92 encoding, err := lookupEncoding(c.Encoding) 93 if err != nil { 94 return nil, err 95 } 96 97 splitFunc, err := c.getSplitFunc(encoding) 98 if err != nil { 99 return nil, err 100 } 101 102 var startAtBeginning bool 103 switch c.StartAt { 104 case "beginning": 105 startAtBeginning = true 106 case "end": 107 startAtBeginning = false 108 default: 109 return nil, fmt.Errorf("invalid start_at location '%s'", c.StartAt) 110 } 111 112 fileNameField := entry.NewNilField() 113 if c.IncludeFileName { 114 fileNameField = entry.NewLabelField("file_name") 115 } 116 117 filePathField := entry.NewNilField() 118 if c.IncludeFilePath { 119 filePathField = entry.NewLabelField("file_path") 120 } 121 122 operator := &InputOperator{ 123 InputOperator: inputOperator, 124 Include: c.Include, 125 Exclude: c.Exclude, 126 SplitFunc: splitFunc, 127 PollInterval: c.PollInterval.Raw(), 128 persist: helper.NewScopedDBPersister(context.Database, c.ID()), 129 FilePathField: filePathField, 130 FileNameField: fileNameField, 131 runningFiles: make(map[string]struct{}), 132 fileUpdateChan: make(chan fileUpdateMessage, 10), 133 fingerprintBytes: 1000, 134 startAtBeginning: startAtBeginning, 135 encoding: encoding, 136 MaxLogSize: c.MaxLogSize, 137 } 138 139 return operator, nil 140 } 141 142 var encodingOverrides = map[string]encoding.Encoding{ 143 "utf-16": unicode.UTF16(unicode.LittleEndian, unicode.IgnoreBOM), 144 "utf16": unicode.UTF16(unicode.LittleEndian, unicode.IgnoreBOM), 145 "utf8": unicode.UTF8, 146 "ascii": unicode.UTF8, 147 "us-ascii": unicode.UTF8, 148 "nop": encoding.Nop, 149 "": encoding.Nop, 150 } 151 152 func lookupEncoding(enc string) (encoding.Encoding, error) { 153 if encoding, ok := encodingOverrides[strings.ToLower(enc)]; ok { 154 return encoding, nil 155 } 156 encoding, err := ianaindex.IANA.Encoding(enc) 157 if err != nil { 158 return nil, fmt.Errorf("unsupported encoding '%s'", enc) 159 } 160 if encoding == nil { 161 return nil, fmt.Errorf("no charmap defined for encoding '%s'", enc) 162 } 163 return encoding, nil 164 } 165 166 // getSplitFunc will return the split function associated the configured mode. 167 func (c InputConfig) getSplitFunc(encoding encoding.Encoding) (bufio.SplitFunc, error) { 168 if c.Multiline == nil { 169 return NewNewlineSplitFunc(encoding) 170 } 171 endPattern := c.Multiline.LineEndPattern 172 startPattern := c.Multiline.LineStartPattern 173 174 switch { 175 case endPattern != "" && startPattern != "": 176 return nil, fmt.Errorf("only one of line_start_pattern or line_end_pattern can be set") 177 case endPattern == "" && startPattern == "": 178 return nil, fmt.Errorf("one of line_start_pattern or line_end_pattern must be set") 179 case endPattern != "": 180 re, err := regexp.Compile("(?m)" + c.Multiline.LineEndPattern) 181 if err != nil { 182 return nil, fmt.Errorf("compile line end regex: %s", err) 183 } 184 return NewLineEndSplitFunc(re), nil 185 case startPattern != "": 186 re, err := regexp.Compile("(?m)" + c.Multiline.LineStartPattern) 187 if err != nil { 188 return nil, fmt.Errorf("compile line start regex: %s", err) 189 } 190 return NewLineStartSplitFunc(re), nil 191 default: 192 return nil, fmt.Errorf("unreachable") 193 } 194 } 195 196 // InputOperator is an operator that monitors files for entries 197 type InputOperator struct { 198 helper.InputOperator 199 200 Include []string 201 Exclude []string 202 FilePathField entry.Field 203 FileNameField entry.Field 204 PollInterval time.Duration 205 SplitFunc bufio.SplitFunc 206 MaxLogSize int 207 208 persist helper.Persister 209 210 runningFiles map[string]struct{} 211 knownFiles map[string]*knownFileInfo 212 startAtBeginning bool 213 214 fileUpdateChan chan fileUpdateMessage 215 fingerprintBytes int64 216 217 encoding encoding.Encoding 218 219 wg *sync.WaitGroup 220 readerWg *sync.WaitGroup 221 cancel context.CancelFunc 222 } 223 224 // Start will start the file monitoring process 225 func (f *InputOperator) Start() error { 226 ctx, cancel := context.WithCancel(context.Background()) 227 f.cancel = cancel 228 f.wg = &sync.WaitGroup{} 229 f.readerWg = &sync.WaitGroup{} 230 231 var err error 232 f.knownFiles, err = f.readKnownFiles() 233 if err != nil { 234 return fmt.Errorf("failed to read known files from database: %s", err) 235 } 236 237 f.wg.Add(1) 238 go func() { 239 defer f.wg.Done() 240 defer f.syncKnownFiles() 241 defer f.drainMessages() 242 243 globTicker := time.NewTicker(f.PollInterval) 244 defer globTicker.Stop() 245 246 // All accesses to runningFiles and knownFiles should be done from 247 // this goroutine. That means that all private methods of FileInput 248 // are unsafe to call from multiple goroutines. Changes to these 249 // maps should be done through the fileUpdateChan. 250 firstCheck := true 251 for { 252 select { 253 case <-ctx.Done(): 254 return 255 case <-globTicker.C: 256 matches := getMatches(f.Include, f.Exclude) 257 if firstCheck && len(matches) == 0 { 258 f.Warnw("no files match the configured include patterns", "include", f.Include) 259 } 260 for _, match := range matches { 261 f.checkFile(ctx, match, firstCheck) 262 } 263 f.syncKnownFiles() 264 firstCheck = false 265 case message, ok := <-f.fileUpdateChan: 266 if ok { 267 f.updateFile(message) 268 } 269 } 270 } 271 }() 272 273 return nil 274 } 275 276 // Stop will stop the file monitoring process 277 func (f *InputOperator) Stop() error { 278 f.cancel() 279 f.wg.Wait() 280 f.fileUpdateChan = make(chan fileUpdateMessage) 281 f.knownFiles = nil 282 return nil 283 } 284 285 // checkFile is not safe to call from multiple goroutines 286 // 287 // firstCheck indicates whether this is the first time checkFile has been called 288 // after startup. This is important for the start_at parameter because, after initial 289 // startup, we don't want to start at the end of newly-created files. 290 func (f *InputOperator) checkFile(ctx context.Context, path string, firstCheck bool) { 291 292 // Check if the file is currently being read 293 if _, ok := f.runningFiles[path]; ok { 294 return // file is already being read 295 } 296 297 // If the path is known, start from last offset 298 knownFile, isKnown := f.knownFiles[path] 299 300 // If the path is new, check if it was from a known file that was rotated 301 var err error 302 if !isKnown { 303 knownFile, err = newKnownFileInfo(path, f.fingerprintBytes, f.startAtBeginning || !firstCheck) 304 if err != nil { 305 f.Warnw("Failed to get info for file", zap.Error(err)) 306 return 307 } 308 309 for _, knownInfo := range f.knownFiles { 310 if knownFile.fingerprintMatches(knownInfo) || knownFile.smallFileContentsMatches(knownInfo) { 311 // The file was rotated, so update the path 312 knownInfo.Path = path 313 knownFile = knownInfo 314 break 315 } 316 } 317 } 318 319 f.runningFiles[path] = struct{}{} 320 f.knownFiles[path] = knownFile 321 f.readerWg.Add(1) 322 go func(ctx context.Context, path string, offset, lastSeenSize int64) { 323 defer f.readerWg.Done() 324 messenger := f.newFileUpdateMessenger(path) 325 defer messenger.FinishedReading() 326 err := ReadToEnd(ctx, path, offset, lastSeenSize, messenger, f.SplitFunc, f.FilePathField, f.FileNameField, f.InputOperator, f.MaxLogSize, f.encoding) 327 if err != nil { 328 f.Warnw("Failed to read log file", zap.Error(err)) 329 } 330 }(ctx, path, knownFile.Offset, knownFile.LastSeenFileSize) 331 } 332 333 func (f *InputOperator) updateFile(message fileUpdateMessage) { 334 if message.finished { 335 delete(f.runningFiles, message.path) 336 return 337 } 338 339 knownFile := f.knownFiles[message.path] 340 341 // This is a last seen size message, so just set the size and return 342 if message.lastSeenFileSize != -1 { 343 knownFile.LastSeenFileSize = message.lastSeenFileSize 344 return 345 } 346 347 if message.newOffset < knownFile.Offset { 348 // The file was truncated or rotated 349 350 newKnownFile, err := newKnownFileInfo(message.path, f.fingerprintBytes, true) 351 if err != nil { 352 f.Warnw("Failed to generate new file info", zap.Error(err)) 353 return 354 } 355 f.knownFiles[message.path] = newKnownFile 356 return 357 } 358 359 if knownFile.Offset < f.fingerprintBytes && message.newOffset > f.fingerprintBytes { 360 // The file graduated from small file to fingerprinted file 361 362 file, err := os.Open(message.path) 363 if err != nil { 364 f.Warnw("Failed to open file for fingerprinting", zap.Error(err)) 365 return 366 } 367 defer file.Close() 368 knownFile.Fingerprint, err = fingerprintFile(file, f.fingerprintBytes) 369 if err != nil { 370 f.Warnw("Failed to fingerprint file", zap.Error(err)) 371 return 372 } 373 knownFile.IsSmallFile = false 374 } else if message.newOffset < f.fingerprintBytes { 375 // The file is a small file 376 377 file, err := os.Open(message.path) 378 if err != nil { 379 f.Warnw("Failed to open small file for content tracking", zap.Error(err)) 380 return 381 } 382 defer file.Close() 383 384 buf := make([]byte, message.newOffset) 385 n, err := file.Read(buf) 386 if err != nil && err != io.EOF { 387 f.Warnw("Failed to read small file for content tracking", zap.Error(err)) 388 return 389 } 390 knownFile.SmallFileContents = buf[:n] 391 knownFile.IsSmallFile = true 392 } 393 394 knownFile.Offset = message.newOffset 395 } 396 397 func (f *InputOperator) drainMessages() { 398 go func() { 399 f.readerWg.Wait() 400 close(f.fileUpdateChan) 401 }() 402 403 for { 404 message, ok := <-f.fileUpdateChan 405 if !ok { 406 return 407 } 408 f.updateFile(message) 409 } 410 } 411 412 var knownFilesKey = "knownFiles" 413 414 func (f *InputOperator) syncKnownFiles() { 415 var buf bytes.Buffer 416 enc := gob.NewEncoder(&buf) 417 err := enc.Encode(f.knownFiles) 418 if err != nil { 419 f.Errorw("Failed to encode known files", zap.Error(err)) 420 return 421 } 422 423 f.persist.Set(knownFilesKey, buf.Bytes()) 424 f.persist.Sync() 425 } 426 427 func (f *InputOperator) readKnownFiles() (map[string]*knownFileInfo, error) { 428 err := f.persist.Load() 429 if err != nil { 430 return nil, err 431 } 432 433 var knownFiles map[string]*knownFileInfo 434 encoded := f.persist.Get(knownFilesKey) 435 if encoded == nil { 436 knownFiles = make(map[string]*knownFileInfo) 437 return knownFiles, nil 438 } 439 440 dec := gob.NewDecoder(bytes.NewReader(encoded)) 441 err = dec.Decode(&knownFiles) 442 if err != nil { 443 return nil, err 444 } 445 446 return knownFiles, nil 447 } 448 449 func (f *InputOperator) newFileUpdateMessenger(path string) fileUpdateMessenger { 450 return fileUpdateMessenger{ 451 path: path, 452 c: f.fileUpdateChan, 453 } 454 } 455 456 type knownFileInfo struct { 457 Path string 458 IsSmallFile bool 459 Fingerprint []byte 460 SmallFileContents []byte 461 Offset int64 462 LastSeenFileSize int64 463 } 464 465 func newKnownFileInfo(path string, fingerprintBytes int64, startAtBeginning bool) (*knownFileInfo, error) { 466 file, err := os.Open(path) 467 if err != nil { 468 return nil, err 469 } 470 defer file.Close() 471 472 stat, err := file.Stat() 473 if err != nil { 474 return nil, err 475 } 476 477 var fingerprint []byte 478 var smallFileContents []byte 479 isSmallFile := false 480 size := stat.Size() 481 if size > fingerprintBytes { 482 fingerprint, err = fingerprintFile(file, fingerprintBytes) 483 if err != nil { 484 return nil, err 485 } 486 } else { 487 isSmallFile = true 488 buf := make([]byte, size) 489 n, err := file.Read(buf) 490 if err != nil { 491 return nil, err 492 } 493 smallFileContents = buf[:n] 494 } 495 496 var offset int64 497 if startAtBeginning { 498 offset = 0 499 } else { 500 offset = stat.Size() 501 } 502 503 return &knownFileInfo{ 504 Path: path, 505 Fingerprint: fingerprint, 506 SmallFileContents: smallFileContents, 507 IsSmallFile: isSmallFile, 508 Offset: offset, 509 }, nil 510 } 511 512 func (i *knownFileInfo) smallFileContentsMatches(other *knownFileInfo) bool { 513 if !(i.IsSmallFile && other.IsSmallFile) { 514 return false 515 } 516 517 // compare the smaller of the two known files 518 var s int 519 if len(i.SmallFileContents) > len(other.SmallFileContents) { 520 s = len(other.SmallFileContents) 521 } else { 522 s = len(i.SmallFileContents) 523 } 524 525 return bytes.Equal(i.SmallFileContents[:s], other.SmallFileContents[:s]) 526 } 527 528 func (i *knownFileInfo) fingerprintMatches(other *knownFileInfo) bool { 529 if i.IsSmallFile || other.IsSmallFile { 530 return false 531 } 532 return bytes.Equal(i.Fingerprint, other.Fingerprint) 533 } 534 535 func fingerprintFile(file *os.File, numBytes int64) ([]byte, error) { 536 _, err := file.Seek(0, io.SeekStart) 537 if err != nil { 538 return nil, err 539 } 540 hash := md5.New() 541 542 buffer := make([]byte, numBytes) 543 io.ReadFull(file, buffer) 544 hash.Write(buffer) 545 return hash.Sum(nil), nil 546 } 547 548 type fileUpdateMessage struct { 549 path string 550 newOffset int64 551 lastSeenFileSize int64 552 finished bool 553 } 554 555 type fileUpdateMessenger struct { 556 c chan fileUpdateMessage 557 path string 558 } 559 560 func (f *fileUpdateMessenger) SetOffset(offset int64) { 561 f.c <- fileUpdateMessage{ 562 path: f.path, 563 newOffset: offset, 564 lastSeenFileSize: -1, 565 } 566 } 567 568 func (f *fileUpdateMessenger) SetLastSeenFileSize(size int64) { 569 f.c <- fileUpdateMessage{ 570 path: f.path, 571 lastSeenFileSize: size, 572 } 573 } 574 575 func (f *fileUpdateMessenger) FinishedReading() { 576 f.c <- fileUpdateMessage{ 577 path: f.path, 578 finished: true, 579 lastSeenFileSize: -1, 580 } 581 } 582 583 func getMatches(includes, excludes []string) []string { 584 all := make([]string, 0, len(includes)) 585 for _, include := range includes { 586 matches, _ := filepath.Glob(include) // compile error checked in build 587 INCLUDE: 588 for _, match := range matches { 589 for _, exclude := range excludes { 590 if itMatches, _ := filepath.Match(exclude, match); itMatches { 591 break INCLUDE 592 } 593 } 594 595 for _, existing := range all { 596 if existing == match { 597 break INCLUDE 598 } 599 } 600 601 all = append(all, match) 602 } 603 } 604 605 return all 606 }