github.com/Jeffail/benthos/v3@v3.65.0/lib/input/aws_s3.go

package input

import (
	"context"
	"errors"
	"fmt"
	"io"
	"net/url"
	"strconv"
	"strings"
	"sync"
	"sync/atomic"
	"time"

	"github.com/Jeffail/benthos/v3/internal/codec"
	"github.com/Jeffail/benthos/v3/internal/docs"
	"github.com/Jeffail/benthos/v3/lib/input/reader"
	"github.com/Jeffail/benthos/v3/lib/log"
	"github.com/Jeffail/benthos/v3/lib/message"
	"github.com/Jeffail/benthos/v3/lib/metrics"
	"github.com/Jeffail/benthos/v3/lib/types"
	sess "github.com/Jeffail/benthos/v3/lib/util/aws/session"
	"github.com/Jeffail/gabs/v2"
	"github.com/aws/aws-sdk-go/aws"
	"github.com/aws/aws-sdk-go/aws/session"
	"github.com/aws/aws-sdk-go/service/s3"
	"github.com/aws/aws-sdk-go/service/sqs"
)

func init() {
	Constructors[TypeAWSS3] = TypeSpec{
		constructor: fromSimpleConstructor(func(conf Config, mgr types.Manager, log log.Modular, stats metrics.Type) (Type, error) {
			var r reader.Async
			var err error
			if r, err = newAmazonS3(conf.AWSS3, log, stats); err != nil {
				return nil, err
			}
			// If we're not pulling events directly from an SQS queue then
			// there's no concept of propagating nacks upstream, therefore wrap
			// our reader within a preserver in order to retry indefinitely.
			if conf.AWSS3.SQS.URL == "" {
				r = reader.NewAsyncPreserver(r)
			}
			return NewAsyncReader(TypeAWSS3, false, r, log, stats)
		}),
		Status: docs.StatusStable,
		Summary: `
Downloads objects within an Amazon S3 bucket, optionally filtered by a prefix, either by walking the items in the bucket or by streaming upload notifications in real time.`,
		Description: `
## Streaming Objects on Upload with SQS

A common pattern for consuming S3 objects is to emit upload notification events from the bucket either directly to an SQS queue, or to an SNS topic that is consumed by an SQS queue, and then have your consumer listen for events which prompt it to download the newly uploaded objects. More information about this pattern and how to set it up can be found at: https://docs.aws.amazon.com/AmazonS3/latest/dev/ways-to-add-notification-config-to-bucket.html.

Benthos is able to follow this pattern when you configure an ` + "`sqs.url`" + `, where it consumes events from SQS and only downloads object keys received within those events. In order for this to work Benthos needs to know where within the event the key and bucket names can be found, specified as [dot paths](/docs/configuration/field_paths) with the fields ` + "`sqs.key_path` and `sqs.bucket_path`" + `. The default values for these fields should already be correct when following the guide above.

If your notification events are being routed to SQS via an SNS topic then the events will be enveloped by SNS, in which case you also need to specify the field ` + "`sqs.envelope_path`" + `, which in the case of SNS to SQS will usually be ` + "`Message`" + `.

When using SQS please make sure you have sensible values for ` + "`sqs.max_messages`" + ` and also the visibility timeout of the queue itself. When Benthos consumes an S3 object the SQS message that triggered it is not deleted until the S3 object has been sent onwards. This ensures at-least-once crash resiliency, but also means that if the S3 object takes longer to process than the visibility timeout of your queue then the same objects might be processed multiple times.
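
For example, assuming a queue that receives upload notification events as described in the guide above (the queue URL here is illustrative), a minimal configuration for this pattern might look like the following:

` + "```yaml" + `
input:
  aws_s3:
    sqs:
      url: https://sqs.us-east-1.amazonaws.com/1234567890/bucket-uploads
      max_messages: 10
` + "```" + `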

## Downloading Large Files

When downloading large files it's often necessary to process them in streamed parts in order to avoid loading the entire file in memory at any given time. In order to do this a ` + "[`codec`](#codec)" + ` can be specified that determines how to break the input into smaller individual messages.
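
For example, a config such as the following (the bucket name is illustrative) walks a bucket and emits each line of each object as an individual message rather than loading whole objects into memory:

` + "```yaml" + `
input:
  aws_s3:
    bucket: example-bucket
    codec: lines
` + "```" + `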

## Credentials

By default Benthos will use a shared credentials file when connecting to AWS services. It's also possible to set credentials explicitly at the component level, allowing you to transfer data across accounts. You can find out more [in this document](/docs/guides/cloud/aws).
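
For example, credentials can be set explicitly as follows (the values shown are placeholders):

` + "```yaml" + `
input:
  aws_s3:
    bucket: example-bucket
    region: us-east-1
    credentials:
      id: AKIAEXAMPLEKEY
      secret: examplesecret
` + "```" + `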

## Metadata

This input adds the following metadata fields to each message:

` + "```" + `
- s3_key
- s3_bucket
- s3_last_modified_unix
- s3_last_modified (RFC3339)
- s3_content_type
- s3_content_encoding
- All user defined metadata
` + "```" + `

You can access these metadata fields using [function interpolation](/docs/configuration/interpolation#metadata). Note that user defined metadata is case insensitive within AWS, and it is likely that the keys will be received in a capitalized form. If you wish to make them consistent you can map all metadata keys to lower or uppercase using a Bloblang mapping such as ` + "`meta = meta().map_each_key(key -> key.lowercase())`" + `.
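
For example, the key of each object could be used to build a dynamic output path elsewhere in a config (a hypothetical snippet) like so:

` + "```yaml" + `
output:
  file:
    path: /tmp/downloads/${! meta("s3_key") }
` + "```",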

		FieldSpecs: append(
			append(docs.FieldSpecs{
				docs.FieldCommon("bucket", "The bucket to consume from. If the field `sqs.url` is specified this field is optional."),
				docs.FieldCommon("prefix", "An optional path prefix; if set, only objects with the prefix are consumed when walking a bucket."),
			}, sess.FieldSpecs()...),
			docs.FieldAdvanced("force_path_style_urls", "Forces the client API to use path style URLs for downloading keys, which is often required when connecting to custom endpoints."),
			docs.FieldAdvanced("delete_objects", "Whether to delete downloaded objects from the bucket once they are processed."),
			codec.ReaderDocs,
			docs.FieldCommon("sqs", "Consume SQS messages in order to trigger key downloads.").WithChildren(
				docs.FieldCommon("url", "An optional SQS URL to connect to. When specified this queue will control which objects are downloaded."),
				docs.FieldAdvanced("endpoint", "A custom endpoint to use when connecting to SQS."),
				docs.FieldCommon("key_path", "A [dot path](/docs/configuration/field_paths) whereby object keys are found in SQS messages."),
				docs.FieldCommon("bucket_path", "A [dot path](/docs/configuration/field_paths) whereby the bucket name can be found in SQS messages."),
				docs.FieldCommon("envelope_path", "A [dot path](/docs/configuration/field_paths) of a field to extract an enveloped JSON payload for further extracting the key and bucket from SQS messages. This is specifically useful when subscribing an SQS queue to an SNS topic that receives bucket events.", "Message"),
				docs.FieldAdvanced(
					"delay_period",
					"An optional period of time to wait from when a notification was originally sent to when the target key download is attempted.",
					"10s", "5m",
				),
				docs.FieldAdvanced("max_messages", "The maximum number of SQS messages to consume from each request."),
			),
		),
		Categories: []Category{
			CategoryServices,
			CategoryAWS,
		},
	}
}

//------------------------------------------------------------------------------

// AWSS3SQSConfig contains configuration for hooking up the S3 input with an SQS queue.
type AWSS3SQSConfig struct {
	URL          string `json:"url" yaml:"url"`
	Endpoint     string `json:"endpoint" yaml:"endpoint"`
	EnvelopePath string `json:"envelope_path" yaml:"envelope_path"`
	KeyPath      string `json:"key_path" yaml:"key_path"`
	BucketPath   string `json:"bucket_path" yaml:"bucket_path"`
	DelayPeriod  string `json:"delay_period" yaml:"delay_period"`
	MaxMessages  int64  `json:"max_messages" yaml:"max_messages"`
}

// NewAWSS3SQSConfig creates a new AWSS3SQSConfig with default values.
func NewAWSS3SQSConfig() AWSS3SQSConfig {
	return AWSS3SQSConfig{
		URL:          "",
		Endpoint:     "",
		EnvelopePath: "",
		KeyPath:      "Records.*.s3.object.key",
		BucketPath:   "Records.*.s3.bucket.name",
		DelayPeriod:  "",
		MaxMessages:  10,
	}
}

// AWSS3Config contains configuration values for the aws_s3 input type.
type AWSS3Config struct {
	sess.Config        `json:",inline" yaml:",inline"`
	Bucket             string         `json:"bucket" yaml:"bucket"`
	Codec              string         `json:"codec" yaml:"codec"`
	Prefix             string         `json:"prefix" yaml:"prefix"`
	ForcePathStyleURLs bool           `json:"force_path_style_urls" yaml:"force_path_style_urls"`
	DeleteObjects      bool           `json:"delete_objects" yaml:"delete_objects"`
	SQS                AWSS3SQSConfig `json:"sqs" yaml:"sqs"`
}

// NewAWSS3Config creates a new AWSS3Config with default values.
func NewAWSS3Config() AWSS3Config {
	return AWSS3Config{
		Config:             sess.NewConfig(),
		Bucket:             "",
		Prefix:             "",
		Codec:              "all-bytes",
		ForcePathStyleURLs: false,
		DeleteObjects:      false,
		SQS:                NewAWSS3SQSConfig(),
	}
}

//------------------------------------------------------------------------------

type s3ObjectTarget struct {
	key            string
	bucket         string
	notificationAt time.Time

	ackFn func(context.Context, error) error
}

func newS3ObjectTarget(key, bucket string, notificationAt time.Time, ackFn codec.ReaderAckFn) *s3ObjectTarget {
	if ackFn == nil {
		ackFn = func(context.Context, error) error {
			return nil
		}
	}
	return &s3ObjectTarget{key, bucket, notificationAt, ackFn}
}

type s3ObjectTargetReader interface {
	Pop(ctx context.Context) (*s3ObjectTarget, error)
	Close(ctx context.Context) error
}

//------------------------------------------------------------------------------

// deleteS3ObjectAckFn wraps a provided ack function so that, once the object
// has been acked without error and deletion is enabled, the object is also
// removed from the bucket.
func deleteS3ObjectAckFn(
	s3Client *s3.S3,
	bucket, key string,
	del bool,
	prev codec.ReaderAckFn,
) codec.ReaderAckFn {
	return func(ctx context.Context, err error) error {
		if prev != nil {
			if aerr := prev(ctx, err); aerr != nil {
				return aerr
			}
		}
		if !del || err != nil {
			return nil
		}
		_, aerr := s3Client.DeleteObjectWithContext(ctx, &s3.DeleteObjectInput{
			Bucket: aws.String(bucket),
			Key:    aws.String(key),
		})
		return aerr
	}
}

//------------------------------------------------------------------------------

type staticTargetReader struct {
	pending    []*s3ObjectTarget
	s3         *s3.S3
	conf       AWSS3Config
	startAfter *string
}

func newStaticTargetReader(
	ctx context.Context,
	conf AWSS3Config,
	log log.Modular,
	s3Client *s3.S3,
) (*staticTargetReader, error) {
	listInput := &s3.ListObjectsV2Input{
		Bucket:  aws.String(conf.Bucket),
		MaxKeys: aws.Int64(100),
	}
	if len(conf.Prefix) > 0 {
		listInput.Prefix = aws.String(conf.Prefix)
	}
	output, err := s3Client.ListObjectsV2WithContext(ctx, listInput)
	if err != nil {
		return nil, fmt.Errorf("failed to list objects: %v", err)
	}
	staticKeys := staticTargetReader{
		s3:   s3Client,
		conf: conf,
	}
	for _, obj := range output.Contents {
		ackFn := deleteS3ObjectAckFn(s3Client, conf.Bucket, *obj.Key, conf.DeleteObjects, nil)
		staticKeys.pending = append(staticKeys.pending, newS3ObjectTarget(*obj.Key, conf.Bucket, time.Time{}, ackFn))
	}
	if len(output.Contents) > 0 {
		staticKeys.startAfter = output.Contents[len(output.Contents)-1].Key
	}
	return &staticKeys, nil
}

func (s *staticTargetReader) Pop(ctx context.Context) (*s3ObjectTarget, error) {
	if len(s.pending) == 0 && s.startAfter != nil {
		s.pending = nil
		listInput := &s3.ListObjectsV2Input{
			Bucket:     aws.String(s.conf.Bucket),
			MaxKeys:    aws.Int64(100),
			StartAfter: s.startAfter,
		}
		if len(s.conf.Prefix) > 0 {
			listInput.Prefix = aws.String(s.conf.Prefix)
		}
		output, err := s.s3.ListObjectsV2WithContext(ctx, listInput)
		if err != nil {
			return nil, fmt.Errorf("failed to list objects: %v", err)
		}
		for _, obj := range output.Contents {
			ackFn := deleteS3ObjectAckFn(s.s3, s.conf.Bucket, *obj.Key, s.conf.DeleteObjects, nil)
			s.pending = append(s.pending, newS3ObjectTarget(*obj.Key, s.conf.Bucket, time.Time{}, ackFn))
		}
		if len(output.Contents) > 0 {
			s.startAfter = output.Contents[len(output.Contents)-1].Key
		}
	}
	if len(s.pending) == 0 {
		return nil, io.EOF
	}
	obj := s.pending[0]
	s.pending = s.pending[1:]
	return obj, nil
}

func (s staticTargetReader) Close(context.Context) error {
	return nil
}

//------------------------------------------------------------------------------

type sqsTargetReader struct {
	conf AWSS3Config
	log  log.Modular
	sqs  *sqs.SQS
	s3   *s3.S3

	nextRequest time.Time

	pending []*s3ObjectTarget
}

func newSQSTargetReader(
	conf AWSS3Config,
	log log.Modular,
	s3 *s3.S3,
	sqs *sqs.SQS,
) *sqsTargetReader {
	return &sqsTargetReader{conf, log, sqs, s3, time.Time{}, nil}
}

func (s *sqsTargetReader) Pop(ctx context.Context) (*s3ObjectTarget, error) {
	if len(s.pending) > 0 {
		t := s.pending[0]
		s.pending = s.pending[1:]
		return t, nil
	}

	if !s.nextRequest.IsZero() {
		if until := time.Until(s.nextRequest); until > 0 {
			select {
			case <-time.After(until):
			case <-ctx.Done():
				return nil, ctx.Err()
			}
		}
	}

	var err error
	if s.pending, err = s.readSQSEvents(ctx); err != nil {
		return nil, err
	}
	if len(s.pending) == 0 {
		s.nextRequest = time.Now().Add(time.Millisecond * 500)
		return nil, types.ErrTimeout
	}
	s.nextRequest = time.Time{}
	t := s.pending[0]
	s.pending = s.pending[1:]
	return t, nil
}

func (s *sqsTargetReader) Close(ctx context.Context) error {
	var err error
	for _, p := range s.pending {
		if aerr := p.ackFn(ctx, errors.New("service shutting down")); aerr != nil {
			err = aerr
		}
	}
	return err
}

// digStrsFromSlices recursively walks arbitrarily nested slices and returns a
// flattened list of all strings found within them.
func digStrsFromSlices(slice []interface{}) []string {
	var strs []string
	for _, v := range slice {
		switch t := v.(type) {
		case []interface{}:
			strs = append(strs, digStrsFromSlices(t)...)
		case string:
			strs = append(strs, t)
		}
	}
	return strs
}

// parseObjectPaths extracts the object keys and bucket names from the body of
// an SQS notification message, optionally unwrapping an SNS envelope first.
func (s *sqsTargetReader) parseObjectPaths(sqsMsg *string) ([]s3ObjectTarget, error) {
	gObj, err := gabs.ParseJSON([]byte(*sqsMsg))
	if err != nil {
		return nil, fmt.Errorf("failed to parse SQS message: %v", err)
	}

	if len(s.conf.SQS.EnvelopePath) > 0 {
		d := gObj.Path(s.conf.SQS.EnvelopePath).Data()
		if str, ok := d.(string); ok {
			if gObj, err = gabs.ParseJSON([]byte(str)); err != nil {
				return nil, fmt.Errorf("failed to parse enveloped message: %v", err)
			}
		} else {
			return nil, fmt.Errorf("expected string at envelope path, found %T", d)
		}
	}

	var keys []string
	var buckets []string

	switch t := gObj.Path(s.conf.SQS.KeyPath).Data().(type) {
	case string:
		keys = []string{t}
	case []interface{}:
		keys = digStrsFromSlices(t)
	}
	if len(s.conf.SQS.BucketPath) > 0 {
		switch t := gObj.Path(s.conf.SQS.BucketPath).Data().(type) {
		case string:
			buckets = []string{t}
		case []interface{}:
			buckets = digStrsFromSlices(t)
		}
	}

	objects := make([]s3ObjectTarget, 0, len(keys))
	for i, key := range keys {
		if key, err = url.QueryUnescape(key); err != nil {
			return nil, fmt.Errorf("failed to parse key from SQS message: %v", err)
		}
		bucket := s.conf.Bucket
		if len(buckets) > i {
			bucket = buckets[i]
		}
		if bucket == "" {
			return nil, errors.New("required bucket was not found in SQS message")
		}
		objects = append(objects, s3ObjectTarget{
			key:    key,
			bucket: bucket,
		})
	}

	return objects, nil
}

// readSQSEvents receives a batch of SQS messages, parses the object targets
// described by each notification, and returns them with ack functions that
// delete the originating SQS message only once every object extracted from it
// has been processed successfully.
func (s *sqsTargetReader) readSQSEvents(ctx context.Context) ([]*s3ObjectTarget, error) {
	var dudMessageHandles []*sqs.ChangeMessageVisibilityBatchRequestEntry
	addDudFn := func(m *sqs.Message) {
		dudMessageHandles = append(dudMessageHandles, &sqs.ChangeMessageVisibilityBatchRequestEntry{
			Id:                m.MessageId,
			ReceiptHandle:     m.ReceiptHandle,
			VisibilityTimeout: aws.Int64(0),
		})
	}

	output, err := s.sqs.ReceiveMessageWithContext(ctx, &sqs.ReceiveMessageInput{
		QueueUrl:            aws.String(s.conf.SQS.URL),
		MaxNumberOfMessages: aws.Int64(s.conf.SQS.MaxMessages),
		AttributeNames: []*string{
			aws.String("SentTimestamp"),
		},
	})
	if err != nil {
		return nil, err
	}

	var pendingObjects []*s3ObjectTarget

	for _, sqsMsg := range output.Messages {
		// Take a copy of the loop variable for capture by the closures below.
		sqsMsg := sqsMsg

		var notificationAt time.Time
		if rcvd, ok := sqsMsg.Attributes["SentTimestamp"]; ok && rcvd != nil {
			if millis, _ := strconv.Atoi(*rcvd); millis > 0 {
				notificationAt = time.Unix(0, int64(millis*1e6))
			}
		}

		if sqsMsg.Body == nil {
			addDudFn(sqsMsg)
			s.log.Errorln("Received empty SQS message")
			continue
		}

		objects, err := s.parseObjectPaths(sqsMsg.Body)
		if err != nil {
			addDudFn(sqsMsg)
			s.log.Errorf("SQS extract key error: %v\n", err)
			continue
		}
		if len(objects) == 0 {
			addDudFn(sqsMsg)
			s.log.Debugln("Extracted zero target keys from SQS message")
			continue
		}

		pendingAcks := int32(len(objects))
		var nackOnce sync.Once
		for _, object := range objects {
			ackOnce := sync.Once{}
			pendingObjects = append(pendingObjects, newS3ObjectTarget(
				object.key, object.bucket, notificationAt,
				deleteS3ObjectAckFn(
					s.s3, object.bucket, object.key, s.conf.DeleteObjects,
					func(ctx context.Context, err error) (aerr error) {
						if err != nil {
							nackOnce.Do(func() {
								// Prevent future acks from triggering a delete.
								atomic.StoreInt32(&pendingAcks, -1)

								s.log.Debugf("Pushing SQS notification back into the queue due to error: %v\n", err)

								// It's possible that this is called for one message
								// at the _exact_ same time as another is acked, but
								// if the acked message triggers a full ack of the
								// origin message then even though it shouldn't be
								// possible, it's also harmless.
								aerr = s.nackSQSMessage(ctx, sqsMsg)
							})
						} else {
							ackOnce.Do(func() {
								if atomic.AddInt32(&pendingAcks, -1) == 0 {
									aerr = s.ackSQSMessage(ctx, sqsMsg)
								}
							})
						}
						return
					},
				),
			))
		}
	}

	// Return any SQS messages not associated with a target file back to the
	// queue by resetting their visibility timeout.
	for len(dudMessageHandles) > 0 {
		input := sqs.ChangeMessageVisibilityBatchInput{
			QueueUrl: aws.String(s.conf.SQS.URL),
			Entries:  dudMessageHandles,
		}

		// Trim input entries to the API's maximum batch size of 10.
		if len(dudMessageHandles) > 10 {
			input.Entries, dudMessageHandles = dudMessageHandles[:10], dudMessageHandles[10:]
		} else {
			dudMessageHandles = nil
		}
		// Errors are ignored here; entries that fail to reset simply reappear
		// once the queue's visibility timeout expires.
		s.sqs.ChangeMessageVisibilityBatch(&input)
	}

	return pendingObjects, nil
}

// nackSQSMessage makes an SQS message immediately visible to other consumers
// by resetting its visibility timeout to zero.
func (s *sqsTargetReader) nackSQSMessage(ctx context.Context, msg *sqs.Message) error {
	_, err := s.sqs.ChangeMessageVisibilityWithContext(ctx, &sqs.ChangeMessageVisibilityInput{
		QueueUrl:          aws.String(s.conf.SQS.URL),
		ReceiptHandle:     msg.ReceiptHandle,
		VisibilityTimeout: aws.Int64(0),
	})
	return err
}

// ackSQSMessage deletes a fully processed SQS message from the queue.
func (s *sqsTargetReader) ackSQSMessage(ctx context.Context, msg *sqs.Message) error {
	_, err := s.sqs.DeleteMessageWithContext(ctx, &sqs.DeleteMessageInput{
		QueueUrl:      aws.String(s.conf.SQS.URL),
		ReceiptHandle: msg.ReceiptHandle,
	})
	return err
}

//------------------------------------------------------------------------------

// awsS3 is a benthos reader.Async implementation that reads messages from an
// Amazon S3 bucket.
type awsS3 struct {
	conf AWSS3Config

	objectScannerCtor codec.ReaderConstructor
	keyReader         s3ObjectTargetReader

	session *session.Session
	s3      *s3.S3
	sqs     *sqs.SQS

	gracePeriod time.Duration

	objectMut sync.Mutex
	object    *s3PendingObject

	log   log.Modular
	stats metrics.Type
}

type s3PendingObject struct {
	target    *s3ObjectTarget
	obj       *s3.GetObjectOutput
	extracted int
	scanner   codec.Reader
}

// newAmazonS3 creates a new AWS S3 bucket reader.
func newAmazonS3(
	conf AWSS3Config,
	log log.Modular,
	stats metrics.Type,
) (*awsS3, error) {
	if conf.Bucket == "" && conf.SQS.URL == "" {
		return nil, errors.New("either a bucket or an sqs.url must be specified")
	}
	if conf.Prefix != "" && conf.SQS.URL != "" {
		return nil, errors.New("cannot specify both a prefix and sqs.url")
	}
	s := &awsS3{
		conf:  conf,
		log:   log,
		stats: stats,
	}
	var err error
	if s.objectScannerCtor, err = codec.GetReader(conf.Codec, codec.NewReaderConfig()); err != nil {
		return nil, err
	}
	if len(conf.SQS.DelayPeriod) > 0 {
		if s.gracePeriod, err = time.ParseDuration(conf.SQS.DelayPeriod); err != nil {
			return nil, fmt.Errorf("failed to parse grace period: %w", err)
		}
	}
	return s, nil
}

func (a *awsS3) getTargetReader(ctx context.Context) (s3ObjectTargetReader, error) {
	if a.sqs != nil {
		return newSQSTargetReader(a.conf, a.log, a.s3, a.sqs), nil
	}
	return newStaticTargetReader(ctx, a.conf, a.log, a.s3)
}

// ConnectWithContext attempts to establish a connection to the target S3 bucket
// and any relevant queues used to traverse the objects (SQS, etc).
func (a *awsS3) ConnectWithContext(ctx context.Context) error {
	if a.session != nil {
		return nil
	}

	sess, err := a.conf.GetSession(func(c *aws.Config) {
		c.S3ForcePathStyle = aws.Bool(a.conf.ForcePathStyleURLs)
	})
	if err != nil {
		return err
	}

	a.session = sess
	a.s3 = s3.New(sess)
	if a.conf.SQS.URL != "" {
		sqsSess := sess.Copy()
		if len(a.conf.SQS.Endpoint) > 0 {
			sqsSess.Config.Endpoint = &a.conf.SQS.Endpoint
		}
		a.sqs = sqs.New(sqsSess)
	}

	if a.keyReader, err = a.getTargetReader(ctx); err != nil {
		a.session = nil
		a.s3 = nil
		a.sqs = nil
		return err
	}

	if a.conf.SQS.URL == "" {
		a.log.Infof("Downloading S3 objects from bucket: %s\n", a.conf.Bucket)
	} else {
		a.log.Infof("Downloading S3 objects found in messages from SQS: %s\n", a.conf.SQS.URL)
	}
	return nil
}

func s3MsgFromParts(p *s3PendingObject, parts []types.Part) types.Message {
	msg := message.New(nil)
	msg.Append(parts...)
	msg.Iter(func(_ int, part types.Part) error {
		meta := part.Metadata()
		meta.Set("s3_key", p.target.key)
		meta.Set("s3_bucket", p.target.bucket)
		if p.obj.LastModified != nil {
			meta.Set("s3_last_modified", p.obj.LastModified.Format(time.RFC3339))
			meta.Set("s3_last_modified_unix", strconv.FormatInt(p.obj.LastModified.Unix(), 10))
		}
		if p.obj.ContentType != nil {
			meta.Set("s3_content_type", *p.obj.ContentType)
		}
		if p.obj.ContentEncoding != nil {
			meta.Set("s3_content_encoding", *p.obj.ContentEncoding)
		}
		for k, v := range p.obj.Metadata {
			if v != nil {
				meta.Set(k, *v)
			}
		}
		return nil
	})
	return msg
}

func (a *awsS3) getObjectTarget(ctx context.Context) (*s3PendingObject, error) {
	if a.object != nil {
		return a.object, nil
	}

	target, err := a.keyReader.Pop(ctx)
	if err != nil {
		return nil, err
	}

	if a.gracePeriod > 0 && !target.notificationAt.IsZero() {
		waitFor := a.gracePeriod - time.Since(target.notificationAt)
		if waitFor > 0 && waitFor < a.gracePeriod {
			select {
			case <-time.After(waitFor):
			case <-ctx.Done():
				return nil, ctx.Err()
			}
		}
	}

	obj, err := a.s3.GetObject(&s3.GetObjectInput{
		Bucket: aws.String(target.bucket),
		Key:    aws.String(target.key),
	})
	if err != nil {
		_ = target.ackFn(ctx, err)
		return nil, err
	}

	object := &s3PendingObject{
		target: target,
		obj:    obj,
	}
	if object.scanner, err = a.objectScannerCtor(target.key, obj.Body, target.ackFn); err != nil {
		_ = target.ackFn(ctx, err)
		return nil, err
	}

	a.object = object
	return object, nil
}

// ReadWithContext attempts to read a new message from the target S3 bucket.
func (a *awsS3) ReadWithContext(ctx context.Context) (msg types.Message, ackFn reader.AsyncAckFn, err error) {
	a.objectMut.Lock()
	defer a.objectMut.Unlock()
	if a.session == nil {
		return nil, nil, types.ErrNotConnected
	}

	defer func() {
		if errors.Is(err, io.EOF) {
			err = types.ErrTypeClosed
		} else if errors.Is(err, context.Canceled) ||
			errors.Is(err, context.DeadlineExceeded) ||
			(err != nil && strings.HasSuffix(err.Error(), "context canceled")) {
			err = types.ErrTimeout
		}
	}()

	var object *s3PendingObject
	if object, err = a.getObjectTarget(ctx); err != nil {
		return
	}

	var parts []types.Part
	var scnAckFn codec.ReaderAckFn

	for {
		if parts, scnAckFn, err = object.scanner.Next(ctx); err == nil {
			object.extracted++
			break
		}
		a.object = nil
		if err != io.EOF {
			return
		}
		if err = object.scanner.Close(ctx); err != nil {
			a.log.Warnf("Failed to close bucket object scanner cleanly: %v\n", err)
		}
		if object.extracted == 0 {
			a.log.Debugf("Extracted zero messages from key %v\n", object.target.key)
		}
		if object, err = a.getObjectTarget(ctx); err != nil {
			return
		}
	}

	return s3MsgFromParts(object, parts), func(rctx context.Context, res types.Response) error {
		return scnAckFn(rctx, res.Error())
	}, nil
}

// CloseAsync begins cleaning up resources used by this reader asynchronously.
func (a *awsS3) CloseAsync() {
	go func() {
		a.objectMut.Lock()
		if a.object != nil {
			a.object.scanner.Close(context.Background())
			a.object = nil
		}
		a.objectMut.Unlock()
	}()
}

// WaitForClose will block until either the reader is closed or a specified
// timeout occurs.
func (a *awsS3) WaitForClose(time.Duration) error {
	return nil
}

//------------------------------------------------------------------------------