github.com/pachyderm/pachyderm@v1.13.4/examples/spouts/SQS-S3/sqs-spout.py (about)

     1  import os
     2  import json
     3  import logging
     4  import boto3
     5  import tarfile
     6  import gzip
     7  import stat
     8  import io
     9  import time
    10  import argparse
    11  from datetime import datetime
    12  from botocore.exceptions import ClientError
    13  from pprint import pprint
    14  
    15  
    16  aws_access_key_id = os.getenv('AWS_ACCESS_KEY_ID', 'user-id')
    17  aws_secret_access_key = os.getenv('AWS_SECRET_ACCESS_KEY', 'secret-key')
    18  aws_region = os.getenv('AWS_REGION', 'us-east-1')
    19  output_pipe = os.getenv('OUTPUT_PIPE', '/pfs/out')
    20  sqs_queue_url = os.getenv('SQS_QUEUE_URL', 'https://sqs.us-west-1.amazonaws.com/ID/Name')
    21  s3_bucket = os.getenv('S3_BUCKET', 's3://bucket-name/')
    22  logging_verbosity = os.getenv('LOGGING_VERBOSITY', "critical")
    23  logging_boto = os.getenv('LOGGING_BOTO', "critical")
    24  timeout = os.getenv('TIMEOUT', 5)
    25  logging_format = '%(levelname)s: %(asctime)s: %(message)s'
    26  
    27  def retrieve_sqs_messages(sqs_client, sqs_queue_url,  num_msgs=1, wait_time=0, visibility_time=5):
    28      """Retrieve messages from an SQS queue
    29  
    30      The retrieved messages are not deleted from the queue.
    31  
    32      :param sqs_queue_url: String URL of existing SQS queue
    33      :param aws_access_key_id: AWS user with access to queue
    34      :param aws_secret_access_key: AWS user secret key / password to access queue
    35      :param aws_region: region in which the queue resides
    36      :param num_msgs: Number of messages to retrieve (1-10)
    37      :param wait_time: Number of seconds to wait if no messages in queue
    38      :param visibility_time: Number of seconds to make retrieved messages
    39          hidden from subsequent retrieval requests
    40      :return: List of retrieved messages. If no messages are available, returned
    41          list is empty. If error, returns None.
    42      """
    43  
    44      # Validate number of messages to retrieve
    45      if num_msgs < 1:
    46          num_msgs = 1
    47      elif num_msgs > 10:
    48          num_msgs = 10
    49  
    50      # Retrieve messages from an SQS queue
    51      try:
    52          msgs = sqs_client.receive_message(QueueUrl=sqs_queue_url,
    53                                            MaxNumberOfMessages=num_msgs,
    54                                            WaitTimeSeconds=wait_time,
    55                                            VisibilityTimeout=visibility_time)
    56      except ClientError as e:
    57          logging.error(e)
    58          return None
    59  
    60      logging.debug("returning msgs: {0}".format(msgs))
    61      # Return the list of retrieved messages
    62      if msgs and 'Messages' in msgs.keys():
    63          return msgs['Messages']
    64  
    65      return None
    66  
    67  
    68  def delete_sqs_message(sqs_client, sqs_queue_url, msg_receipt_handle):
    69      """Delete a message from an SQS queue
    70  
    71      :param sqs_queue_url: String URL of existing SQS queue
    72      :param msg_receipt_handle: Receipt handle value of retrieved message
    73      """
    74  
    75      # Delete the message from the SQS queue
    76      sqs_client.delete_message(QueueUrl=sqs_queue_url,
    77                                ReceiptHandle=msg_receipt_handle)
    78  
    79  def open_pipe(path_to_file, attempts=0, timeout=2, sleep_int=5):
    80      if attempts < timeout : 
    81          flags = os.O_WRONLY  # Refer to "man 2 open".
    82          mode = stat.S_IWUSR  # This is 0o400.
    83          umask = 0o777 ^ mode  # Prevents always downgrading umask to 0.
    84          umask_original = os.umask(umask)
    85          try:
    86              file = os.open(path_to_file, flags, mode)
    87              # you must open the pipe as binary to prevent line-buffering problems.
    88              return os.fdopen(file, "wb")
    89          except OSError as oe:
    90              print ('{0} attempt of {1}; error opening file: {2}'.format(attempts + 1, timeout, oe))
    91              os.umask(umask_original)
    92              time.sleep(sleep_int)
    93              return open_pipe(path_to_file, attempts + 1)
    94          finally:
    95              os.umask(umask_original)
    96      return None
    97  
    98  def main():
    99      parser = argparse.ArgumentParser(description='Listen on an SQS queue for notifications of added files to an S3 bucket and then ingress the files.')
   100  
   101      parser.add_argument('-i', '--aws_access_key_id', required=False,
   102                          help="AWS Access Key ID for accessing the SQS queue and the bucket. Overrides env var AWS_ACCESS_KEY_ID. Default: '{0}'.".format(aws_access_key_id),
   103                          default=aws_access_key_id)
   104      parser.add_argument('-k', '--aws_secret_access_key', required=False,
   105                          help="AWS secret key for accessing the SQS queue and the bucket. Overrides env var AWS_SECRET_ACCESS_KEY. Default: '{0}'.".format(aws_secret_access_key),
   106                          default=aws_secret_access_key)
   107      parser.add_argument('-r', '--aws_region', required=False,
   108                          help="AWS region.  Overrides env var AWS_REGION. Default: '{0}'".format(aws_region),
   109                          default=aws_region)
   110      parser.add_argument('-o', '--output_pipe', required=False,
   111                          help="The named pipe that the tarstream containing the files will be written to.  Overrides env var OUTPUT_PIPE. Default: '{0}'.".format(output_pipe),
   112                          default=output_pipe)
   113      parser.add_argument('-b', '--s3_bucket', required=False,
   114                          help="The url to the SQS queue for bucket notifications.  Overrides env var S3_BUCKET. Default: '{0}'.".format(s3_bucket),
   115                          default=s3_bucket)
   116      parser.add_argument('-q', '--sqs_queue_url', required=False,
   117                          help="The url to the SQS queue for bucket notifications.  Overrides env var SQS_QUEUE_URL. Default: '{0}'.".format(sqs_queue_url),
   118                          default=sqs_queue_url)
   119      parser.add_argument('-v', '--logging_verbosity', required=False,
   120                          help="Verbosity for logging: debug, info, warning, error, critical.  Overrides env var LOGGING_VERBOSITY. Default: '{0}'.".format(logging_verbosity),
   121                          default=logging_verbosity)
   122      parser.add_argument('-l', '--logging_boto', required=False,
   123                          help="Verbosity for logging boto3: debug, info, warning, error, critical.  Overrides env var LOGGING_BOTO. Default: '{0}'.".format(logging_boto),
   124                          default=logging_boto)
   125      parser.add_argument('-t', '--timeout', required=False, type=int,
   126                          help="Timeout for polling the SQS topic.  Overrides env var TIMEOUT. Default: '{0}'.".format(timeout),
   127                          default=timeout)
   128  
   129      
   130      args = parser.parse_args()
   131  
   132      num_messages = 1
   133      sqs_client = boto3.client('sqs', aws_access_key_id=args.aws_access_key_id, aws_secret_access_key=args.aws_secret_access_key, region_name=args.aws_region)
   134      s3_client = boto3.client('s3', aws_access_key_id=args.aws_access_key_id, aws_secret_access_key=args.aws_secret_access_key, region_name=args.aws_region)
   135  
   136      # Set up logging
   137      if args.logging_verbosity == "debug":
   138          logging.basicConfig(level=logging.DEBUG,
   139                              format=logging_verbosity)
   140      elif args.logging_verbosity == "info":
   141          logging.basicConfig(level=logging.INFO,
   142                              format=logging_format)
   143      elif args.logging_verbosity == "warning":
   144          logging.basicConfig(level=logging.WARNING,
   145                              format=logging_format)
   146      elif args.logging_verbosity == "error":
   147          logging.basicConfig(level=logging.ERROR,
   148                              format=logging_format)
   149      else:
   150          logging.basicConfig(level=logging.CRITICAL,
   151                              format=logging_format)
   152  
   153      # Set up logging for boto
   154      if args.logging_boto == "debug":
   155          logging.getLogger('boto3').setLevel(logging.DEBUG)
   156          logging.getLogger('botocore').setLevel(logging.DEBUG)
   157          logging.getLogger('nose').setLevel(logging.DEBUG)
   158      elif args.logging_boto == "info":
   159          logging.getLogger('boto3').setLevel(logging.INFO)
   160          logging.getLogger('botocore').setLevel(logging.INFO)
   161          logging.getLogger('nose').setLevel(logging.INFO)
   162      elif args.logging_boto == "warning":
   163          logging.getLogger('boto3').setLevel(logging.WARNING)
   164          logging.getLogger('botocore').setLevel(logging.WARNING)
   165          logging.getLogger('nose').setLevel(logging.WARNING)
   166      elif args.logging_boto == "error":
   167          logging.getLogger('boto3').setLevel(logging.ERROR)
   168          logging.getLogger('botocore').setLevel(logging.ERROR)
   169          logging.getLogger('nose').setLevel(logging.ERROR)
   170      else:
   171          logging.getLogger('boto3').setLevel(logging.CRITICAL)
   172          logging.getLogger('botocore').setLevel(logging.CRITICAL)
   173          logging.getLogger('nose').setLevel(logging.CRITICAL) 
   174  
   175      console = logging.StreamHandler()
   176      logging.getLogger('').addHandler(console)
   177      
   178  
   179      # Retrieve SQS messages
   180      logging.debug("starting sqs retrieval loop")
   181      while True: 
   182          logging.debug("retrieving {4} messages from {0} using access key starting with {1} and secret starting with {2} in region {3}.".format(
   183            args.sqs_queue_url, args.aws_access_key_id[0:5], args.aws_secret_access_key[0:5], args.aws_region, num_messages))
   184          msgs = retrieve_sqs_messages(sqs_client, args.sqs_queue_url, num_messages, wait_time=args.timeout)
   185          logging.debug("message: {0}.".format(msgs))
   186          if msgs is not None:
   187              for msg in msgs:
   188                  logging.debug(f'SQS: Message ID: {msg["MessageId"]}, '
   189                               f'Contents: {msg["Body"]}')
   190                  # Iterate over the JSON to pull out full file name 
   191              data = json.loads(msg['Body'])['Records'][0]
   192              bucket = data['s3']['bucket']['name']
   193              file = data['s3']['object']['key']
   194  
   195              # Get the file from S3 and extract it.
   196              file_path = os.path.join(args.s3_bucket,file)
   197              logging.debug("fetching url from s3: {0}".format(file_path))
   198              s3_client.download_file(bucket, file, file)
   199              
   200              
   201              # Start Spout portion
   202              logging.debug("opening pipe {0}".format(args.output_pipe))
   203              mySpout = open_pipe(args.output_pipe)
   204  
   205              # To use a tarfile object with a named pipe, you must use the "w|" mode
   206              # which makes it not seekable
   207              logging.debug("creating tarstream")
   208              try: 
   209                  tarStream = tarfile.open(fileobj=mySpout,mode="w|", encoding='utf-8')
   210              except tarfile.TarError as te:
   211                  logging.critical('error creating tarstream: {0}'.format(te))
   212                  exit(-2)
   213  
   214              size = os.stat(file).st_size 
   215              logging.debug("Creating tar archive entry for file {0} of size {1}...".format(file, size))
   216              
   217              tarHeader = tarfile.TarInfo(file)
   218              tarHeader.size = size #Size of the file itself
   219              tarHeader.mode = 0o600
   220              tarHeader.name = file
   221  
   222              logging.debug("Writing tarfile to spout for file {0}...".format(file))
   223              try:
   224                  with open(file, mode="rb") as file:
   225                      tarStream.addfile(tarinfo=tarHeader, fileobj=file)
   226              except tarfile.TarError as te:
   227                  logging.critical('error writing message {0} to tarstream: {1}'.format(file, te))
   228                  tarStream.close()
   229                  mySpout.close()
   230                  exit(-2)
   231                  
   232              tarStream.close()
   233              delete_sqs_message(sqs_client, args.sqs_queue_url, msg['ReceiptHandle'])
   234      mySpout.close()
   235  
# Script entry point: run the polling loop only when executed directly.
if __name__ == '__main__':
    main()