# Origin: github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/io/aws/clients/s3/boto3_client.py
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# pytype: skip-file

from apache_beam.io.aws.clients.s3 import messages
from apache_beam.options import pipeline_options
from apache_beam.utils import retry

try:
  # pylint: disable=wrong-import-order, wrong-import-position
  # pylint: disable=ungrouped-imports
  import boto3

except ImportError:
  boto3 = None


def get_http_error_code(exc):
  """Returns the HTTP status code attached to an exception, if any.

  The code is looked up at ``exc.response['ResponseMetadata']
  ['HTTPStatusCode']``.

  Args:
    exc: (Exception) the exception to inspect

  Returns:
    The status code, or None when the exception has no ``response``
    attribute, the response is not dict-like, or the code is absent.
  """
  response = getattr(exc, 'response', None)
  try:
    return response.get('ResponseMetadata', {}).get('HTTPStatusCode')
  except AttributeError:
    # This helper runs inside ``except`` handlers; a malformed ``response``
    # must not raise here and hide the error that is being reported.
    return None


class Client(object):
  """
  Wrapper for boto3 library
  """
  def __init__(self, options):
    """Builds the underlying boto3 S3 client.

    Args:
      options: either a PipelineOptions instance (read through its S3Options
        view) or an object with a dict-style ``get`` holding the same
        ``s3_*`` keys.
    """
    assert boto3 is not None, 'Missing boto3 requirement'
    if isinstance(options, pipeline_options.PipelineOptions):
      s3_options = options.view_as(pipeline_options.S3Options)
      access_key_id = s3_options.s3_access_key_id
      secret_access_key = s3_options.s3_secret_access_key
      session_token = s3_options.s3_session_token
      endpoint_url = s3_options.s3_endpoint_url
      use_ssl = not s3_options.s3_disable_ssl
      region_name = s3_options.s3_region_name
      api_version = s3_options.s3_api_version
      verify = s3_options.s3_verify
    else:
      access_key_id = options.get('s3_access_key_id')
      secret_access_key = options.get('s3_secret_access_key')
      session_token = options.get('s3_session_token')
      endpoint_url = options.get('s3_endpoint_url')
      use_ssl = not options.get('s3_disable_ssl', False)
      region_name = options.get('s3_region_name')
      api_version = options.get('s3_api_version')
      verify = options.get('s3_verify')

    session = boto3.session.Session()
    self.client = session.client(
        service_name='s3',
        region_name=region_name,
        api_version=api_version,
        use_ssl=use_ssl,
        verify=verify,
        endpoint_url=endpoint_url,
        aws_access_key_id=access_key_id,
        aws_secret_access_key=secret_access_key,
        aws_session_token=session_token)

    # State of the single cached download stream reused by get_range():
    # the request it was opened for, the stream itself, and the offset of
    # the next byte it will yield.
    self._download_request = None
    self._download_stream = None
    self._download_pos = 0

  def get_object_metadata(self, request):
    """Retrieves an object's metadata.

    Args:
      request: (GetRequest) input message

    Returns:
      (Object) The response message.

    Raises:
      messages.S3ClientError: if the head_object call fails.
    """
    kwargs = {'Bucket': request.bucket, 'Key': request.object}

    try:
      boto_response = self.client.head_object(**kwargs)
    except Exception as e:
      raise messages.S3ClientError(str(e), get_http_error_code(e)) from e

    item = messages.Item(
        boto_response['ETag'],
        request.object,
        boto_response['LastModified'],
        boto_response['ContentLength'],
        boto_response['ContentType'])

    return item

  def _abandon_download_stream(self):
    """Forgets the cached download stream, closing it on a best-effort basis."""
    stream, self._download_stream = self._download_stream, None
    if stream is None:
      return
    try:
      stream.close()
    except Exception:  # pylint: disable=broad-except
      # Called while another error is already being handled; a failure to
      # close a broken stream must not replace that error.
      pass

  def get_stream(self, request, start):
    """Opens a stream object starting at the given position.

    The previously opened stream is reused when it belongs to the same
    bucket/object and is already positioned at ``start``; otherwise it is
    closed and a new ranged get_object request is issued.

    Args:
      request: (GetRequest) request
      start: (int) start offset
    Returns:
      (Stream) Boto3 stream object.

    Raises:
      messages.S3ClientError: if the get_object call fails.
    """
    cached_request = self._download_request
    if cached_request and (start != self._download_pos or
                           request.bucket != cached_request.bucket or
                           request.object != cached_request.object):
      # The stream can already be None here when an earlier re-open failed
      # after the stale stream had been dropped.
      if self._download_stream is not None:
        self._download_stream.close()
      self._download_stream = None

    # noinspection PyProtectedMember
    if not self._download_stream or self._download_stream._raw_stream.closed:
      try:
        self._download_stream = self.client.get_object(
            Bucket=request.bucket,
            Key=request.object,
            Range='bytes={}-'.format(start))['Body']
        self._download_request = request
        self._download_pos = start
      except Exception as e:
        raise messages.S3ClientError(str(e), get_http_error_code(e)) from e

    return self._download_stream

  @retry.with_exponential_backoff()
  def get_range(self, request, start, end):
    """Retrieves an object's contents.

    Args:
      request: (GetRequest) request
      start: (int) start offset
      end: (int) end offset (exclusive)
    Returns:
      (bytes) The response message.

    Raises:
      messages.S3ClientError: if both read attempts fail.
    """
    for attempt in range(2):
      try:
        stream = self.get_stream(request, start)
        data = stream.read(end - start)
        self._download_pos += len(data)
        return data
      except Exception as e:  # pylint: disable=broad-except
        # Drop the cached stream so the next attempt opens a fresh one.
        self._abandon_download_stream()
        self._download_request = None
        if attempt == 0:
          # Read errors are likely with long-lived connections, retry
          # immediately if a read fails once
          continue
        if isinstance(e, messages.S3ClientError):
          raise
        raise messages.S3ClientError(str(e), get_http_error_code(e)) from e

  def list(self, request):
    """Retrieves a list of objects matching the criteria.

    Args:
      request: (ListRequest) input message
    Returns:
      (ListResponse) The response message.

    Raises:
      messages.S3ClientError: if the list_objects_v2 call fails, or with
        code 404 when the response reports a KeyCount of 0.
    """
    kwargs = {'Bucket': request.bucket, 'Prefix': request.prefix}

    if request.continuation_token is not None:
      kwargs['ContinuationToken'] = request.continuation_token

    try:
      boto_response = self.client.list_objects_v2(**kwargs)
    except Exception as e:
      raise messages.S3ClientError(str(e), get_http_error_code(e)) from e

    if boto_response['KeyCount'] == 0:
      message = 'Tried to list nonexistent S3 path: s3://%s/%s' % (
          request.bucket, request.prefix)
      raise messages.S3ClientError(message, 404)

    items = [
        messages.Item(
            etag=content['ETag'],
            key=content['Key'],
            last_modified=content['LastModified'],
            size=content['Size']) for content in boto_response['Contents']
    ]

    # The token is absent from the response on the last page.
    next_token = boto_response.get('NextContinuationToken')

    response = messages.ListResponse(items, next_token)
    return response

  def create_multipart_upload(self, request):
    """Initiates a multipart upload to S3 for a given object

    Args:
      request: (UploadRequest) input message
    Returns:
      (UploadResponse) The response message.

    Raises:
      messages.S3ClientError: if the request fails or the response carries
        no UploadId.
    """
    try:
      boto_response = self.client.create_multipart_upload(
          Bucket=request.bucket,
          Key=request.object,
          ContentType=request.mime_type)
      response = messages.UploadResponse(boto_response['UploadId'])
    except Exception as e:
      raise messages.S3ClientError(str(e), get_http_error_code(e)) from e
    return response

  def upload_part(self, request):
    """Uploads part of a file to S3 during a multipart upload

    Args:
      request: (UploadPartRequest) input message
    Returns:
      (UploadPartResponse) The response message.

    Raises:
      messages.S3ClientError: if the request fails or the response carries
        no ETag.
    """
    try:
      boto_response = self.client.upload_part(
          Body=request.bytes,
          Bucket=request.bucket,
          Key=request.object,
          PartNumber=request.part_number,
          UploadId=request.upload_id)
      response = messages.UploadPartResponse(
          boto_response['ETag'], request.part_number)
      return response
    except Exception as e:
      raise messages.S3ClientError(str(e), get_http_error_code(e)) from e

  def complete_multipart_upload(self, request):
    """Completes a multipart upload to S3

    Args:
      request: input message providing bucket, object, upload_id and parts
    Returns:
      (Void) The response message.

    Raises:
      messages.S3ClientError: if the request fails.
    """
    parts = {'Parts': request.parts}
    try:
      self.client.complete_multipart_upload(
          Bucket=request.bucket,
          Key=request.object,
          UploadId=request.upload_id,
          MultipartUpload=parts)
    except Exception as e:
      raise messages.S3ClientError(str(e), get_http_error_code(e)) from e

  def delete(self, request):
    """Deletes given object from bucket

    Args:
      request: (DeleteRequest) input message
    Returns:
      (void) Void, otherwise will raise if an error occurs

    Raises:
      messages.S3ClientError: if the request fails.
    """
    try:
      self.client.delete_object(Bucket=request.bucket, Key=request.object)
    except Exception as e:
      raise messages.S3ClientError(str(e), get_http_error_code(e)) from e

  def delete_batch(self, request):
    """Deletes several objects from one bucket in a single request.

    Args:
      request: input message providing ``bucket`` and ``objects`` (the keys
        to delete)
    Returns:
      (DeleteBatchResponse) built from the deleted keys, the failed keys and
      one S3ClientError per failed key.

    Raises:
      messages.S3ClientError: if the delete_objects call itself fails.
    """
    aws_request = {
        'Bucket': request.bucket,
        'Delete': {
            'Objects': [{
                'Key': key
            } for key in request.objects]
        }
    }

    try:
      aws_response = self.client.delete_objects(**aws_request)
    except Exception as e:
      raise messages.S3ClientError(str(e), get_http_error_code(e)) from e

    deleted = [obj['Key'] for obj in aws_response.get('Deleted', [])]

    error_entries = aws_response.get('Errors', [])
    failed = [obj['Key'] for obj in error_entries]
    errors = [
        messages.S3ClientError(obj['Message'], obj['Code'])
        for obj in error_entries
    ]

    return messages.DeleteBatchResponse(deleted, failed, errors)

  def copy(self, request):
    """Copies one object to another bucket/key.

    Args:
      request: input message providing ``src_bucket``, ``src_key``,
        ``dest_bucket`` and ``dest_key``

    Raises:
      messages.S3ClientError: if the copy fails.
    """
    try:
      copy_src = {'Bucket': request.src_bucket, 'Key': request.src_key}
      self.client.copy(copy_src, request.dest_bucket, request.dest_key)
    except Exception as e:
      raise messages.S3ClientError(str(e), get_http_error_code(e)) from e