import os
import json
import logging
import boto3
import tarfile
import gzip
import stat
import io
import time
import argparse
from datetime import datetime
from botocore.exceptions import ClientError
from pprint import pprint


# Configuration defaults; each may be overridden by an environment variable,
# and each environment variable may in turn be overridden by a CLI flag.
aws_access_key_id = os.getenv('AWS_ACCESS_KEY_ID', 'user-id')
aws_secret_access_key = os.getenv('AWS_SECRET_ACCESS_KEY', 'secret-key')
aws_region = os.getenv('AWS_REGION', 'us-east-1')
output_pipe = os.getenv('OUTPUT_PIPE', '/pfs/out')
sqs_queue_url = os.getenv('SQS_QUEUE_URL', 'https://sqs.us-west-1.amazonaws.com/ID/Name')
s3_bucket = os.getenv('S3_BUCKET', 's3://bucket-name/')
logging_verbosity = os.getenv('LOGGING_VERBOSITY', "critical")
logging_boto = os.getenv('LOGGING_BOTO', "critical")
# Cast here so the value is an int whether it comes from the env or the default
# (os.getenv returns a string whenever the variable is set).
timeout = int(os.getenv('TIMEOUT', '5'))
logging_format = '%(levelname)s: %(asctime)s: %(message)s'


def retrieve_sqs_messages(sqs_client, sqs_queue_url, num_msgs=1, wait_time=0, visibility_time=5):
    """Retrieve messages from an SQS queue.

    The retrieved messages are not deleted from the queue.

    :param sqs_client: boto3 SQS client used to poll the queue
    :param sqs_queue_url: String URL of existing SQS queue
    :param num_msgs: Number of messages to retrieve (clamped to 1-10,
        the range SQS accepts)
    :param wait_time: Number of seconds to wait if no messages in queue
    :param visibility_time: Number of seconds to make retrieved messages
        hidden from subsequent retrieval requests
    :return: List of retrieved messages. If no messages are available or
        the request failed, returns None.
    """
    # SQS only accepts 1-10 messages per receive call; clamp to that range.
    num_msgs = max(1, min(num_msgs, 10))

    try:
        msgs = sqs_client.receive_message(QueueUrl=sqs_queue_url,
                                          MaxNumberOfMessages=num_msgs,
                                          WaitTimeSeconds=wait_time,
                                          VisibilityTimeout=visibility_time)
    except ClientError as e:
        logging.error(e)
        return None

    logging.debug("returning msgs: {0}".format(msgs))
    # 'Messages' is absent from the response when the queue was empty.
    if msgs and 'Messages' in msgs.keys():
        return msgs['Messages']

    return None


def delete_sqs_message(sqs_client, sqs_queue_url, msg_receipt_handle):
    """Delete a message from an SQS queue.

    :param sqs_client: boto3 SQS client used to delete the message
    :param sqs_queue_url: String URL of existing SQS queue
    :param msg_receipt_handle: Receipt handle value of retrieved message
    """
    sqs_client.delete_message(QueueUrl=sqs_queue_url,
                              ReceiptHandle=msg_receipt_handle)


def open_pipe(path_to_file, attempts=0, timeout=2, sleep_int=5):
    """Open a named pipe for binary writing, retrying on failure.

    :param path_to_file: path to the named pipe (e.g. /pfs/out)
    :param attempts: number of attempts already made (used by retries)
    :param timeout: maximum number of attempts before giving up
    :param sleep_int: seconds to sleep between attempts
    :return: a writable binary file object, or None after `timeout` attempts
    """
    if attempts >= timeout:
        return None
    flags = os.O_WRONLY  # Refer to "man 2 open".
    mode = stat.S_IWUSR  # Owner-write permission bit (0o200).
    umask = 0o777 ^ mode  # Prevents always downgrading umask to 0.
    umask_original = os.umask(umask)
    try:
        fd = os.open(path_to_file, flags, mode)
        # You must open the pipe as binary to prevent line-buffering problems.
        return os.fdopen(fd, "wb")
    except OSError as oe:
        print('{0} attempt of {1}; error opening file: {2}'.format(attempts + 1, timeout, oe))
        time.sleep(sleep_int)
        # BUG FIX: propagate timeout/sleep_int so callers that pass
        # non-default values keep them across retries (the original
        # silently fell back to the defaults on every retry).
        return open_pipe(path_to_file, attempts + 1, timeout, sleep_int)
    finally:
        # Restore the process umask on every exit path.
        os.umask(umask_original)


def main():
    """Poll SQS for S3 bucket notifications and spout each new file into Pachyderm."""
    parser = argparse.ArgumentParser(description='Listen on an SQS queue for notifications of added files to an S3 bucket and then ingress the files.')

    parser.add_argument('-i', '--aws_access_key_id', required=False,
                        help="AWS Access Key ID for accessing the SQS queue and the bucket. Overrides env var AWS_ACCESS_KEY_ID. Default: '{0}'.".format(aws_access_key_id),
                        default=aws_access_key_id)
    parser.add_argument('-k', '--aws_secret_access_key', required=False,
                        help="AWS secret key for accessing the SQS queue and the bucket. Overrides env var AWS_SECRET_ACCESS_KEY. Default: '{0}'.".format(aws_secret_access_key),
                        default=aws_secret_access_key)
    parser.add_argument('-r', '--aws_region', required=False,
                        help="AWS region. Overrides env var AWS_REGION. Default: '{0}'".format(aws_region),
                        default=aws_region)
    parser.add_argument('-o', '--output_pipe', required=False,
                        help="The named pipe that the tarstream containing the files will be written to. Overrides env var OUTPUT_PIPE. Default: '{0}'.".format(output_pipe),
                        default=output_pipe)
    # BUG FIX: the original help text for -b was copy-pasted from -q and
    # described the SQS queue instead of the S3 bucket.
    parser.add_argument('-b', '--s3_bucket', required=False,
                        help="The S3 bucket from which the notified files are ingressed. Overrides env var S3_BUCKET. Default: '{0}'.".format(s3_bucket),
                        default=s3_bucket)
    parser.add_argument('-q', '--sqs_queue_url', required=False,
                        help="The url to the SQS queue for bucket notifications. Overrides env var SQS_QUEUE_URL. Default: '{0}'.".format(sqs_queue_url),
                        default=sqs_queue_url)
    parser.add_argument('-v', '--logging_verbosity', required=False,
                        help="Verbosity for logging: debug, info, warning, error, critical. Overrides env var LOGGING_VERBOSITY. Default: '{0}'.".format(logging_verbosity),
                        default=logging_verbosity)
    parser.add_argument('-l', '--logging_boto', required=False,
                        help="Verbosity for logging boto3: debug, info, warning, error, critical. Overrides env var LOGGING_BOTO. Default: '{0}'.".format(logging_boto),
                        default=logging_boto)
    parser.add_argument('-t', '--timeout', required=False, type=int,
                        help="Timeout for polling the SQS topic. Overrides env var TIMEOUT. Default: '{0}'.".format(timeout),
                        default=timeout)

    args = parser.parse_args()

    num_messages = 1
    sqs_client = boto3.client('sqs', aws_access_key_id=args.aws_access_key_id, aws_secret_access_key=args.aws_secret_access_key, region_name=args.aws_region)
    s3_client = boto3.client('s3', aws_access_key_id=args.aws_access_key_id, aws_secret_access_key=args.aws_secret_access_key, region_name=args.aws_region)

    # Map verbosity names to logging levels; anything unrecognized falls back
    # to CRITICAL, matching the original else branches.
    levels = {
        "debug": logging.DEBUG,
        "info": logging.INFO,
        "warning": logging.WARNING,
        "error": logging.ERROR,
    }

    # BUG FIX: the original debug branch passed logging_verbosity as the
    # format string (format=logging_verbosity); every level now uses
    # logging_format.
    logging.basicConfig(level=levels.get(args.logging_verbosity, logging.CRITICAL),
                        format=logging_format)

    # Set up logging for boto and friends at their own verbosity.
    boto_level = levels.get(args.logging_boto, logging.CRITICAL)
    for noisy_logger in ('boto3', 'botocore', 'nose'):
        logging.getLogger(noisy_logger).setLevel(boto_level)

    console = logging.StreamHandler()
    logging.getLogger('').addHandler(console)

    # Retrieve SQS messages forever; each message is expected to be an S3
    # event notification describing one newly-added object.
    logging.debug("starting sqs retrieval loop")
    while True:
        logging.debug("retrieving {4} messages from {0} using access key starting with {1} and secret starting with {2} in region {3}.".format(
            args.sqs_queue_url, args.aws_access_key_id[0:5], args.aws_secret_access_key[0:5], args.aws_region, num_messages))
        msgs = retrieve_sqs_messages(sqs_client, args.sqs_queue_url, num_messages, wait_time=args.timeout)
        logging.debug("message: {0}.".format(msgs))
        if msgs is not None:
            for msg in msgs:
                logging.debug(f'SQS: Message ID: {msg["MessageId"]}, '
                              f'Contents: {msg["Body"]}')
                # Pull the bucket name and object key out of the first S3
                # event record in the message body.
                data = json.loads(msg['Body'])['Records'][0]
                bucket = data['s3']['bucket']['name']
                file = data['s3']['object']['key']
                # NOTE(review): S3 event keys are URL-encoded; keys containing
                # spaces or special characters likely need
                # urllib.parse.unquote_plus before use — confirm against a
                # real notification payload.

                # Download the object to the local working directory under its
                # key name. NOTE(review): keys containing '/' would need the
                # local directory to exist — verify for prefixed buckets.
                file_path = os.path.join(args.s3_bucket, file)
                logging.debug("fetching url from s3: {0}".format(file_path))
                s3_client.download_file(bucket, file, file)

                # Start Spout portion: stream the downloaded file into the
                # named pipe as a tar archive for Pachyderm to commit.
                logging.debug("opening pipe {0}".format(args.output_pipe))
                mySpout = open_pipe(args.output_pipe)

                # To use a tarfile object with a named pipe, you must use the
                # "w|" mode which makes it not seekable.
                logging.debug("creating tarstream")
                try:
                    tarStream = tarfile.open(fileobj=mySpout, mode="w|", encoding='utf-8')
                except tarfile.TarError as te:
                    logging.critical('error creating tarstream: {0}'.format(te))
                    exit(-2)

                size = os.stat(file).st_size
                logging.debug("Creating tar archive entry for file {0} of size {1}...".format(file, size))

                tarHeader = tarfile.TarInfo(file)
                tarHeader.size = size  # Size of the file itself
                tarHeader.mode = 0o600
                tarHeader.name = file

                logging.debug("Writing tarfile to spout for file {0}...".format(file))
                try:
                    # BUG FIX: the original reused the name "file" for the
                    # open file object, so the error log below printed the
                    # object repr instead of the filename.
                    with open(file, mode="rb") as data_file:
                        tarStream.addfile(tarinfo=tarHeader, fileobj=data_file)
                except tarfile.TarError as te:
                    logging.critical('error writing message {0} to tarstream: {1}'.format(file, te))
                    tarStream.close()
                    mySpout.close()
                    exit(-2)

                # Closing the pipe completes the commit; only then is it safe
                # to delete the message from the queue.
                tarStream.close()
                delete_sqs_message(sqs_client, args.sqs_queue_url, msg['ReceiptHandle'])
                mySpout.close()


if __name__ == '__main__':
    main()