github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/io/aws/clients/s3/boto3_client.py (about)

     1  #
     2  # Licensed to the Apache Software Foundation (ASF) under one or more
     3  # contributor license agreements.  See the NOTICE file distributed with
     4  # this work for additional information regarding copyright ownership.
     5  # The ASF licenses this file to You under the Apache License, Version 2.0
     6  # (the "License"); you may not use this file except in compliance with
     7  # the License.  You may obtain a copy of the License at
     8  #
     9  #    http://www.apache.org/licenses/LICENSE-2.0
    10  #
    11  # Unless required by applicable law or agreed to in writing, software
    12  # distributed under the License is distributed on an "AS IS" BASIS,
    13  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  # See the License for the specific language governing permissions and
    15  # limitations under the License.
    16  #
    17  
    18  # pytype: skip-file
    19  
    20  from apache_beam.io.aws.clients.s3 import messages
    21  from apache_beam.options import pipeline_options
    22  from apache_beam.utils import retry
    23  
    24  try:
    25    # pylint: disable=wrong-import-order, wrong-import-position
    26    # pylint: disable=ungrouped-imports
    27    import boto3
    28  
    29  except ImportError:
    30    boto3 = None
    31  
    32  
    33  def get_http_error_code(exc):
    34    if hasattr(exc, 'response'):
    35      return exc.response.get('ResponseMetadata', {}).get('HTTPStatusCode')
    36    return None
    37  
    38  
    39  class Client(object):
    40    """
    41    Wrapper for boto3 library
    42    """
    43    def __init__(self, options):
    44      assert boto3 is not None, 'Missing boto3 requirement'
    45      if isinstance(options, pipeline_options.PipelineOptions):
    46        s3_options = options.view_as(pipeline_options.S3Options)
    47        access_key_id = s3_options.s3_access_key_id
    48        secret_access_key = s3_options.s3_secret_access_key
    49        session_token = s3_options.s3_session_token
    50        endpoint_url = s3_options.s3_endpoint_url
    51        use_ssl = not s3_options.s3_disable_ssl
    52        region_name = s3_options.s3_region_name
    53        api_version = s3_options.s3_api_version
    54        verify = s3_options.s3_verify
    55      else:
    56        access_key_id = options.get('s3_access_key_id')
    57        secret_access_key = options.get('s3_secret_access_key')
    58        session_token = options.get('s3_session_token')
    59        endpoint_url = options.get('s3_endpoint_url')
    60        use_ssl = not options.get('s3_disable_ssl', False)
    61        region_name = options.get('s3_region_name')
    62        api_version = options.get('s3_api_version')
    63        verify = options.get('s3_verify')
    64  
    65      session = boto3.session.Session()
    66      self.client = session.client(
    67          service_name='s3',
    68          region_name=region_name,
    69          api_version=api_version,
    70          use_ssl=use_ssl,
    71          verify=verify,
    72          endpoint_url=endpoint_url,
    73          aws_access_key_id=access_key_id,
    74          aws_secret_access_key=secret_access_key,
    75          aws_session_token=session_token)
    76  
    77      self._download_request = None
    78      self._download_stream = None
    79      self._download_pos = 0
    80  
    81    def get_object_metadata(self, request):
    82      """Retrieves an object's metadata.
    83  
    84      Args:
    85        request: (GetRequest) input message
    86  
    87      Returns:
    88        (Object) The response message.
    89      """
    90      kwargs = {'Bucket': request.bucket, 'Key': request.object}
    91  
    92      try:
    93        boto_response = self.client.head_object(**kwargs)
    94      except Exception as e:
    95        raise messages.S3ClientError(str(e), get_http_error_code(e))
    96  
    97      item = messages.Item(
    98          boto_response['ETag'],
    99          request.object,
   100          boto_response['LastModified'],
   101          boto_response['ContentLength'],
   102          boto_response['ContentType'])
   103  
   104      return item
   105  
   106    def get_stream(self, request, start):
   107      """Opens a stream object starting at the given position.
   108  
   109      Args:
   110        request: (GetRequest) request
   111        start: (int) start offset
   112      Returns:
   113        (Stream) Boto3 stream object.
   114      """
   115  
   116      if self._download_request and (
   117          start != self._download_pos or
   118          request.bucket != self._download_request.bucket or
   119          request.object != self._download_request.object):
   120        self._download_stream.close()
   121        self._download_stream = None
   122  
   123      # noinspection PyProtectedMember
   124      if not self._download_stream or self._download_stream._raw_stream.closed:
   125        try:
   126          self._download_stream = self.client.get_object(
   127              Bucket=request.bucket,
   128              Key=request.object,
   129              Range='bytes={}-'.format(start))['Body']
   130          self._download_request = request
   131          self._download_pos = start
   132        except Exception as e:
   133          raise messages.S3ClientError(str(e), get_http_error_code(e))
   134  
   135      return self._download_stream
   136  
   137    @retry.with_exponential_backoff()
   138    def get_range(self, request, start, end):
   139      r"""Retrieves an object's contents.
   140  
   141        Args:
   142          request: (GetRequest) request
   143          start: (int) start offset
   144          end: (int) end offset (exclusive)
   145        Returns:
   146          (bytes) The response message.
   147        """
   148      for i in range(2):
   149        try:
   150          stream = self.get_stream(request, start)
   151          data = stream.read(end - start)
   152          self._download_pos += len(data)
   153          return data
   154        except Exception as e:
   155          self._download_stream = None
   156          self._download_request = None
   157          if i == 0:
   158            # Read errors are likely with long-lived connections, retry immediately if a read fails once
   159            continue
   160          if isinstance(e, messages.S3ClientError):
   161            raise e
   162          raise messages.S3ClientError(str(e), get_http_error_code(e))
   163  
   164    def list(self, request):
   165      r"""Retrieves a list of objects matching the criteria.
   166  
   167      Args:
   168        request: (ListRequest) input message
   169      Returns:
   170        (ListResponse) The response message.
   171      """
   172      kwargs = {'Bucket': request.bucket, 'Prefix': request.prefix}
   173  
   174      if request.continuation_token is not None:
   175        kwargs['ContinuationToken'] = request.continuation_token
   176  
   177      try:
   178        boto_response = self.client.list_objects_v2(**kwargs)
   179      except Exception as e:
   180        raise messages.S3ClientError(str(e), get_http_error_code(e))
   181  
   182      if boto_response['KeyCount'] == 0:
   183        message = 'Tried to list nonexistent S3 path: s3://%s/%s' % (
   184            request.bucket, request.prefix)
   185        raise messages.S3ClientError(message, 404)
   186  
   187      items = [
   188          messages.Item(
   189              etag=content['ETag'],
   190              key=content['Key'],
   191              last_modified=content['LastModified'],
   192              size=content['Size']) for content in boto_response['Contents']
   193      ]
   194  
   195      try:
   196        next_token = boto_response['NextContinuationToken']
   197      except KeyError:
   198        next_token = None
   199  
   200      response = messages.ListResponse(items, next_token)
   201      return response
   202  
   203    def create_multipart_upload(self, request):
   204      r"""Initates a multipart upload to S3 for a given object
   205  
   206      Args:
   207        request: (UploadRequest) input message
   208      Returns:
   209        (UploadResponse) The response message.
   210      """
   211      try:
   212        boto_response = self.client.create_multipart_upload(
   213            Bucket=request.bucket,
   214            Key=request.object,
   215            ContentType=request.mime_type)
   216        response = messages.UploadResponse(boto_response['UploadId'])
   217      except Exception as e:
   218        raise messages.S3ClientError(str(e), get_http_error_code(e))
   219      return response
   220  
   221    def upload_part(self, request):
   222      r"""Uploads part of a file to S3 during a multipart upload
   223  
   224      Args:
   225        request: (UploadPartRequest) input message
   226      Returns:
   227        (UploadPartResponse) The response message.
   228      """
   229      try:
   230        boto_response = self.client.upload_part(
   231            Body=request.bytes,
   232            Bucket=request.bucket,
   233            Key=request.object,
   234            PartNumber=request.part_number,
   235            UploadId=request.upload_id)
   236        response = messages.UploadPartResponse(
   237            boto_response['ETag'], request.part_number)
   238        return response
   239      except Exception as e:
   240        raise messages.S3ClientError(str(e), get_http_error_code(e))
   241  
   242    def complete_multipart_upload(self, request):
   243      r"""Completes a multipart upload to S3
   244  
   245      Args:
   246        request: (UploadPartRequest) input message
   247      Returns:
   248        (Void) The response message.
   249      """
   250      parts = {'Parts': request.parts}
   251      try:
   252        self.client.complete_multipart_upload(
   253            Bucket=request.bucket,
   254            Key=request.object,
   255            UploadId=request.upload_id,
   256            MultipartUpload=parts)
   257      except Exception as e:
   258        raise messages.S3ClientError(str(e), get_http_error_code(e))
   259  
   260    def delete(self, request):
   261      r"""Deletes given object from bucket
   262      Args:
   263          request: (DeleteRequest) input message
   264        Returns:
   265          (void) Void, otherwise will raise if an error occurs
   266      """
   267      try:
   268        self.client.delete_object(Bucket=request.bucket, Key=request.object)
   269      except Exception as e:
   270        raise messages.S3ClientError(str(e), get_http_error_code(e))
   271  
   272    def delete_batch(self, request):
   273  
   274      aws_request = {
   275          'Bucket': request.bucket,
   276          'Delete': {
   277              'Objects': [{
   278                  'Key': object
   279              } for object in request.objects]
   280          }
   281      }
   282  
   283      try:
   284        aws_response = self.client.delete_objects(**aws_request)
   285      except Exception as e:
   286        raise messages.S3ClientError(str(e), get_http_error_code(e))
   287  
   288      deleted = [obj['Key'] for obj in aws_response.get('Deleted', [])]
   289  
   290      failed = [obj['Key'] for obj in aws_response.get('Errors', [])]
   291  
   292      errors = [
   293          messages.S3ClientError(obj['Message'], obj['Code'])
   294          for obj in aws_response.get('Errors', [])
   295      ]
   296  
   297      return messages.DeleteBatchResponse(deleted, failed, errors)
   298  
   299    def copy(self, request):
   300      try:
   301        copy_src = {'Bucket': request.src_bucket, 'Key': request.src_key}
   302        self.client.copy(copy_src, request.dest_bucket, request.dest_key)
   303      except Exception as e:
   304        raise messages.S3ClientError(str(e), get_http_error_code(e))