github.com/Jeffail/benthos/v3@v3.65.0/lib/input/sftp.go (about)

     1  package input
     2  
import (
	"context"
	"errors"
	"fmt"
	"io"
	"os"
	"sync"
	"time"

	"github.com/Jeffail/benthos/v3/internal/codec"
	"github.com/Jeffail/benthos/v3/internal/docs"
	sftpSetup "github.com/Jeffail/benthos/v3/internal/impl/sftp"
	"github.com/Jeffail/benthos/v3/internal/interop"
	"github.com/Jeffail/benthos/v3/lib/input/reader"
	"github.com/Jeffail/benthos/v3/lib/log"
	"github.com/Jeffail/benthos/v3/lib/message"
	"github.com/Jeffail/benthos/v3/lib/metrics"
	"github.com/Jeffail/benthos/v3/lib/types"
	"github.com/pkg/sftp"
)
    22  
    23  func init() {
    24  	watcherDocs := docs.FieldSpecs{
    25  		docs.FieldCommon(
    26  			"enabled",
    27  			"Whether file watching is enabled.",
    28  		),
    29  		docs.FieldCommon(
    30  			"minimum_age",
    31  			"The minimum period of time since a file was last updated before attempting to consume it. Increasing this period decreases the likelihood that a file will be consumed whilst it is still being written to.",
    32  			"10s", "1m", "10m",
    33  		),
    34  		docs.FieldCommon(
    35  			"poll_interval",
    36  			"The interval between each attempt to scan the target paths for new files.",
    37  			"100ms", "1s",
    38  		),
    39  		docs.FieldCommon(
    40  			"cache",
    41  			"A [cache resource](/docs/components/caches/about) for storing the paths of files already consumed.",
    42  		),
    43  	}
    44  
    45  	Constructors[TypeSFTP] = TypeSpec{
    46  		constructor: fromSimpleConstructor(func(conf Config, mgr types.Manager, log log.Modular, stats metrics.Type) (Type, error) {
    47  			r, err := newSFTPReader(conf.SFTP, mgr, log, stats)
    48  			if err != nil {
    49  				return nil, err
    50  			}
    51  			return NewAsyncReader(
    52  				TypeSFTP,
    53  				true,
    54  				reader.NewAsyncPreserver(r),
    55  				log, stats,
    56  			)
    57  		}),
    58  		Status:  docs.StatusExperimental,
    59  		Version: "3.39.0",
    60  		Summary: `Consumes files from a server over SFTP.`,
    61  		Description: `
    62  ## Metadata
    63  
    64  This input adds the following metadata fields to each message:
    65  
    66  ` + "```" + `
    67  - sftp_path
    68  ` + "```" + `
    69  
    70  You can access these metadata fields using [function interpolation](/docs/configuration/interpolation#metadata).`,
    71  		FieldSpecs: docs.FieldSpecs{
    72  			docs.FieldCommon(
    73  				"address",
    74  				"The address of the server to connect to that has the target files.",
    75  			),
    76  			docs.FieldCommon(
    77  				"credentials",
    78  				"The credentials to use to log into the server.",
    79  			).WithChildren(sftpSetup.CredentialsDocs()...),
    80  			docs.FieldString(
    81  				"paths",
    82  				"A list of paths to consume sequentially. Glob patterns are supported.",
    83  			).Array(),
    84  			codec.ReaderDocs,
    85  			docs.FieldAdvanced("delete_on_finish", "Whether to delete files from the server once they are processed."),
    86  			docs.FieldAdvanced("max_buffer", "The largest token size expected when consuming delimited files."),
    87  			docs.FieldCommon(
    88  				"watcher",
    89  				"An experimental mode whereby the input will periodically scan the target paths for new files and consume them, when all files are consumed the input will continue polling for new files.",
    90  			).WithChildren(watcherDocs...).AtVersion("3.42.0"),
    91  		},
    92  		Categories: []Category{
    93  			CategoryNetwork,
    94  		},
    95  	}
    96  }
    97  
    98  //------------------------------------------------------------------------------
    99  
// watcherConfig contains the configuration fields of the experimental watcher
// mode of the SFTP input, in which the target paths are scanned periodically
// for new files.
type watcherConfig struct {
	// Enabled toggles watcher mode.
	Enabled      bool   `json:"enabled" yaml:"enabled"`
	// MinimumAge is a duration string (parsed with time.ParseDuration); files
	// modified more recently than this are skipped during a scan.
	MinimumAge   string `json:"minimum_age" yaml:"minimum_age"`
	// PollInterval is a duration string (parsed with time.ParseDuration) that
	// is waited between scans once all known paths have been consumed.
	PollInterval string `json:"poll_interval" yaml:"poll_interval"`
	// Cache is the name of the cache resource used to record the paths of
	// files already consumed. It must be set when Enabled is true.
	Cache        string `json:"cache" yaml:"cache"`
}
   106  
// SFTPConfig contains configuration fields for the SFTP input type.
type SFTPConfig struct {
	// Address of the SFTP server, passed to Credentials.GetClient.
	Address        string                `json:"address" yaml:"address"`
	// Credentials used to obtain the SFTP client.
	Credentials    sftpSetup.Credentials `json:"credentials" yaml:"credentials"`
	// Paths is the list of paths (glob patterns) expanded on the server and
	// consumed sequentially.
	Paths          []string              `json:"paths" yaml:"paths"`
	// Codec names the reader codec used to split each file into messages.
	Codec          string                `json:"codec" yaml:"codec"`
	// DeleteOnFinish causes a file to be removed from the server when the
	// codec reports it as finished without an error.
	DeleteOnFinish bool                  `json:"delete_on_finish" yaml:"delete_on_finish"`
	// MaxBuffer is used as the maximum scan token size of the codec.
	MaxBuffer      int                   `json:"max_buffer" yaml:"max_buffer"`
	// Watcher configures the optional watcher (polling) mode.
	Watcher        watcherConfig         `json:"watcher" yaml:"watcher"`
}
   117  
   118  // NewSFTPConfig creates a new SFTPConfig with default values.
   119  func NewSFTPConfig() SFTPConfig {
   120  	return SFTPConfig{
   121  		Address:        "",
   122  		Credentials:    sftpSetup.Credentials{},
   123  		Paths:          []string{},
   124  		Codec:          "all-bytes",
   125  		DeleteOnFinish: false,
   126  		MaxBuffer:      1000000,
   127  		Watcher: watcherConfig{
   128  			Enabled:      false,
   129  			MinimumAge:   "1s",
   130  			PollInterval: "1s",
   131  			Cache:        "",
   132  		},
   133  	}
   134  }
   135  
   136  //------------------------------------------------------------------------------
   137  
// sftpReader consumes the files matching the configured paths from an SFTP
// server, one file at a time.
type sftpReader struct {
	conf SFTPConfig

	log   log.Modular
	stats metrics.Type
	mgr   types.Manager

	// client is the active SFTP client, or nil when not connected.
	client *sftp.Client

	// paths holds the files that are still waiting to be consumed.
	paths       []string
	// scannerCtor builds a codec reader for each opened file.
	scannerCtor codec.ReaderConstructor

	// scannerMut is held by the connect, read and close methods while they
	// access the client, paths, scanner and currentPath fields.
	scannerMut  sync.Mutex
	// scanner reads the file currently being consumed, or is nil when no file
	// is open.
	scanner     codec.Reader
	// currentPath is the path of the file that scanner reads from.
	currentPath string

	// Parsed watcher durations, left at zero when watcher mode is disabled.
	watcherPollInterval time.Duration
	watcherMinAge       time.Duration
}
   157  
   158  func newSFTPReader(conf SFTPConfig, mgr types.Manager, log log.Modular, stats metrics.Type) (*sftpReader, error) {
   159  	codecConf := codec.NewReaderConfig()
   160  	codecConf.MaxScanTokenSize = conf.MaxBuffer
   161  	ctor, err := codec.GetReader(conf.Codec, codecConf)
   162  	if err != nil {
   163  		return nil, err
   164  	}
   165  
   166  	var watcherPollInterval, watcherMinAge time.Duration
   167  	if conf.Watcher.Enabled {
   168  		if watcherPollInterval, err = time.ParseDuration(conf.Watcher.PollInterval); err != nil {
   169  			return nil, fmt.Errorf("failed to parse watcher poll interval: %w", err)
   170  		}
   171  
   172  		if watcherMinAge, err = time.ParseDuration(conf.Watcher.MinimumAge); err != nil {
   173  			return nil, fmt.Errorf("failed to parse watcher minimum age: %w", err)
   174  		}
   175  
   176  		if conf.Watcher.Cache == "" {
   177  			return nil, errors.New("a cache must be specified when watcher mode is enabled")
   178  		}
   179  
   180  		if err := interop.ProbeCache(context.Background(), mgr, conf.Watcher.Cache); err != nil {
   181  			return nil, err
   182  		}
   183  	}
   184  
   185  	s := &sftpReader{
   186  		conf:                conf,
   187  		log:                 log,
   188  		stats:               stats,
   189  		mgr:                 mgr,
   190  		scannerCtor:         ctor,
   191  		watcherPollInterval: watcherPollInterval,
   192  		watcherMinAge:       watcherMinAge,
   193  	}
   194  
   195  	return s, err
   196  }
   197  
   198  // ConnectWithContext attempts to establish a connection to the target SFTP server.
   199  func (s *sftpReader) ConnectWithContext(ctx context.Context) error {
   200  	var err error
   201  
   202  	s.scannerMut.Lock()
   203  	defer s.scannerMut.Unlock()
   204  
   205  	if s.scanner != nil {
   206  		return nil
   207  	}
   208  
   209  	if s.client == nil {
   210  		if s.client, err = s.conf.Credentials.GetClient(s.conf.Address); err != nil {
   211  			return err
   212  		}
   213  		s.log.Debugln("Finding more paths")
   214  		s.paths, err = s.getFilePaths()
   215  		if err != nil {
   216  			return err
   217  		}
   218  	}
   219  
   220  	if len(s.paths) == 0 {
   221  		if !s.conf.Watcher.Enabled {
   222  			s.client.Close()
   223  			s.client = nil
   224  			s.log.Debugln("Paths exhausted, closing input")
   225  			return types.ErrTypeClosed
   226  		}
   227  		select {
   228  		case <-time.After(s.watcherPollInterval):
   229  		case <-ctx.Done():
   230  			return ctx.Err()
   231  		}
   232  		s.paths, err = s.getFilePaths()
   233  		return err
   234  	}
   235  
   236  	nextPath := s.paths[0]
   237  
   238  	file, err := s.client.Open(nextPath)
   239  	if err != nil {
   240  		return err
   241  	}
   242  
   243  	if s.scanner, err = s.scannerCtor(nextPath, file, func(ctx context.Context, err error) error {
   244  		if err == nil && s.conf.DeleteOnFinish {
   245  			return s.client.Remove(nextPath)
   246  		}
   247  		return nil
   248  	}); err != nil {
   249  		file.Close()
   250  		return err
   251  	}
   252  
   253  	s.currentPath = nextPath
   254  	s.paths = s.paths[1:]
   255  
   256  	s.log.Infof("Consuming from file '%v'\n", nextPath)
   257  	return err
   258  }
   259  
   260  // ReadWithContext attempts to read a new message from the target file(s) on the server.
   261  func (s *sftpReader) ReadWithContext(ctx context.Context) (types.Message, reader.AsyncAckFn, error) {
   262  	s.scannerMut.Lock()
   263  	defer s.scannerMut.Unlock()
   264  
   265  	if s.scanner == nil || s.client == nil {
   266  		return nil, nil, types.ErrNotConnected
   267  	}
   268  
   269  	parts, codecAckFn, err := s.scanner.Next(ctx)
   270  	if err != nil {
   271  		if errors.Is(err, context.Canceled) ||
   272  			errors.Is(err, context.DeadlineExceeded) {
   273  			err = types.ErrTimeout
   274  		}
   275  		if err != types.ErrTimeout {
   276  			if s.conf.Watcher.Enabled {
   277  				var setErr error
   278  				if cerr := interop.AccessCache(ctx, s.mgr, s.conf.Watcher.Cache, func(cache types.Cache) {
   279  					setErr = cache.Set(s.currentPath, []byte("@"))
   280  				}); cerr != nil {
   281  					return nil, nil, fmt.Errorf("failed to get the cache for sftp watcher mode: %v", cerr)
   282  				}
   283  				if setErr != nil {
   284  					return nil, nil, fmt.Errorf("failed to update path in cache %s: %v", s.currentPath, err)
   285  				}
   286  			}
   287  			s.scanner.Close(ctx)
   288  			s.scanner = nil
   289  		}
   290  		if errors.Is(err, io.EOF) {
   291  			err = types.ErrTimeout
   292  		}
   293  		return nil, nil, err
   294  	}
   295  
   296  	for _, part := range parts {
   297  		part.Metadata().Set("sftp_path", s.currentPath)
   298  	}
   299  	msg := message.New(nil)
   300  	msg.Append(parts...)
   301  
   302  	return msg, func(ctx context.Context, res types.Response) error {
   303  		return codecAckFn(ctx, res.Error())
   304  	}, nil
   305  }
   306  
   307  // CloseAsync begins cleaning up resources used by this reader asynchronously.
   308  func (s *sftpReader) CloseAsync() {
   309  	go func() {
   310  		s.scannerMut.Lock()
   311  		if s.scanner != nil {
   312  			s.scanner.Close(context.Background())
   313  			s.scanner = nil
   314  			s.paths = nil
   315  		}
   316  		if s.client != nil {
   317  			s.client.Close()
   318  			s.client = nil
   319  		}
   320  		s.scannerMut.Unlock()
   321  	}()
   322  }
   323  
// WaitForClose will block until either the reader is closed or a specified
// timeout occurs.
//
// NOTE: this implementation ignores the timeout and returns nil immediately;
// it does not wait for the goroutine started by CloseAsync to finish.
func (s *sftpReader) WaitForClose(timeout time.Duration) error {
	return nil
}
   329  
   330  func (s *sftpReader) getFilePaths() ([]string, error) {
   331  	var filepaths []string
   332  	if !s.conf.Watcher.Enabled {
   333  		for _, p := range s.conf.Paths {
   334  			paths, err := s.client.Glob(p)
   335  			if err != nil {
   336  				s.log.Warnf("Failed to scan files from path %v: %v\n", p, err)
   337  				continue
   338  			}
   339  			filepaths = append(filepaths, paths...)
   340  		}
   341  		return filepaths, nil
   342  	}
   343  
   344  	if cerr := interop.AccessCache(context.Background(), s.mgr, s.conf.Watcher.Cache, func(cache types.Cache) {
   345  		for _, p := range s.conf.Paths {
   346  			paths, err := s.client.Glob(p)
   347  			if err != nil {
   348  				s.log.Warnf("Failed to scan files from path %v: %v\n", p, err)
   349  				continue
   350  			}
   351  
   352  			for _, path := range paths {
   353  				info, err := s.client.Stat(path)
   354  				if err != nil {
   355  					s.log.Warnf("Failed to stat path %v: %v\n", path, err)
   356  					continue
   357  				}
   358  				if time.Since(info.ModTime()) < s.watcherMinAge {
   359  					continue
   360  				}
   361  				if _, err := cache.Get(path); err != nil {
   362  					filepaths = append(filepaths, path)
   363  				} else if err = cache.Set(path, []byte("@")); err != nil { // Reset the TTL for the path
   364  					s.log.Warnf("Failed to set key in cache for path %v: %v\n", path, err)
   365  				}
   366  			}
   367  		}
   368  	}); cerr != nil {
   369  		return nil, fmt.Errorf("error getting cache in getFilePaths: %v", cerr)
   370  	}
   371  	return filepaths, nil
   372  }