github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/io/gcp/gcsio.py

     1  #
     2  # Licensed to the Apache Software Foundation (ASF) under one or more
     3  # contributor license agreements.  See the NOTICE file distributed with
     4  # this work for additional information regarding copyright ownership.
     5  # The ASF licenses this file to You under the Apache License, Version 2.0
     6  # (the "License"); you may not use this file except in compliance with
     7  # the License.  You may obtain a copy of the License at
     8  #
     9  #    http://www.apache.org/licenses/LICENSE-2.0
    10  #
    11  # Unless required by applicable law or agreed to in writing, software
    12  # distributed under the License is distributed on an "AS IS" BASIS,
    13  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  # See the License for the specific language governing permissions and
    15  # limitations under the License.
    16  #
    17  
    18  """Google Cloud Storage client.
    19  
    20  This library evolved from the Google App Engine GCS client available at
    21  https://github.com/GoogleCloudPlatform/appengine-gcs-client.
    22  
    23  **Updates to the I/O connector code**
    24  
    25  For any significant updates to this I/O connector, please consider involving
    26  corresponding code reviewers mentioned in
    27  https://github.com/apache/beam/blob/master/sdks/python/OWNERS
    28  """
    29  
    30  # pytype: skip-file
    31  
    32  import errno
    33  import io
    34  import logging
    35  import multiprocessing
    36  import re
    37  import threading
    38  import time
    39  import traceback
    40  from itertools import islice
    41  from typing import Optional
    42  from typing import Union
    43  
    44  import apache_beam
    45  from apache_beam.internal.http_client import get_new_http
    46  from apache_beam.internal.metrics.metric import ServiceCallMetric
    47  from apache_beam.io.filesystemio import Downloader
    48  from apache_beam.io.filesystemio import DownloaderStream
    49  from apache_beam.io.filesystemio import PipeStream
    50  from apache_beam.io.filesystemio import Uploader
    51  from apache_beam.io.filesystemio import UploaderStream
    52  from apache_beam.io.gcp import resource_identifiers
    53  from apache_beam.metrics import monitoring_infos
    54  from apache_beam.options.pipeline_options import PipelineOptions
    55  from apache_beam.utils import retry
    56  from apache_beam.utils.annotations import deprecated
    57  
    58  __all__ = ['GcsIO']
    59  
    60  _LOGGER = logging.getLogger(__name__)
    61  
    62  # Issue a friendlier error message if the storage library is not available.
    63  # TODO(silviuc): Remove this guard when storage is available everywhere.
    64  try:
    65    # pylint: disable=wrong-import-order, wrong-import-position
    66    # pylint: disable=ungrouped-imports
    67    from apitools.base.py.batch import BatchApiRequest
    68    from apitools.base.py.exceptions import HttpError
    69    from apitools.base.py import transfer
    70    from apache_beam.internal.gcp import auth
    71    from apache_beam.io.gcp.internal.clients import storage
    72  except ImportError:
    73    raise ImportError(
    74        'Google Cloud Storage I/O not supported for this execution environment '
    75        '(could not import storage API client).')
    76  
    77  # This is the size of each partial-file read operation from GCS.  This
    78  # parameter was chosen to give good throughput while keeping memory usage at
    79  # a reasonable level; the following table shows the throughput reached when
    80  # reading files of a given size with a given buffer size, and it informed
    81  # the choice of the default value (measured as of 11/2016):
    82  #
    83  # +---------------+------------+-------------+-------------+-------------+
    84  # |               | 50 MB file | 100 MB file | 200 MB file | 400 MB file |
    85  # +---------------+------------+-------------+-------------+-------------+
    86  # | 8 MB buffer   | 17.12 MB/s | 22.67 MB/s  | 23.81 MB/s  | 26.05 MB/s  |
    87  # | 16 MB buffer  | 24.21 MB/s | 42.70 MB/s  | 42.89 MB/s  | 46.92 MB/s  |
    88  # | 32 MB buffer  | 28.53 MB/s | 48.08 MB/s  | 54.30 MB/s  | 54.65 MB/s  |
    89  # | 400 MB buffer | 34.72 MB/s | 71.13 MB/s  | 79.13 MB/s  | 85.39 MB/s  |
    90  # +---------------+------------+-------------+-------------+-------------+
    91  DEFAULT_READ_BUFFER_SIZE = 16 * 1024 * 1024
    92  
    93  # This is the number of seconds the library will wait for a partial-file read
    94  # operation from GCS to complete before retrying.
    95  DEFAULT_READ_SEGMENT_TIMEOUT_SECONDS = 60
    96  
    97  # This is the size of chunks used when writing to GCS.
    98  WRITE_CHUNK_SIZE = 8 * 1024 * 1024
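
        # A quick sketch of how these defaults surface through GcsIO.open(); the
        # bucket and object names are placeholders and default credentials are
        # assumed:
        #
        #   gcs = GcsIO()
        #   # Larger read buffers trade memory for throughput (see table above).
        #   with gcs.open('gs://my-bucket/big-file', 'rb',
        #                 read_buffer_size=32 * 1024 * 1024) as f:
        #     header = f.read(4096)
        #   # Writes are streamed to GCS in WRITE_CHUNK_SIZE-sized chunks.
        #   with gcs.open('gs://my-bucket/output.bin', 'wb') as f:
        #     f.write(b'payload')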
    99  
   100  # Maximum number of operations permitted in GcsIO.copy_batch() and
   101  # GcsIO.delete_batch().
   102  MAX_BATCH_OPERATION_SIZE = 100
   103  
   104  # Batch endpoint URL for GCS.
   105  # We have to specify an API-specific endpoint here since Google APIs' global
   106  # batch endpoints were deprecated on 03/25/2019.
   107  # See https://developers.googleblog.com/2018/03/discontinuing-support-for-json-rpc-and.html.  # pylint: disable=line-too-long
   108  # Currently apitools library uses a global batch endpoint by default:
   109  # https://github.com/google/apitools/blob/master/apitools/base/py/batch.py#L152
   110  # TODO: remove this constant and its usage once apitools moves to an
   111  # API-specific batch endpoint, or once the Beam gcsio module starts using a
   112  # GCS client library that does not use global batch endpoints.
   113  GCS_BATCH_ENDPOINT = 'https://www.googleapis.com/batch/storage/v1'
   114  
   115  
   116  def parse_gcs_path(gcs_path, object_optional=False):
   117    """Return the bucket and object names of the given gs:// path."""
   118    match = re.match('^gs://([^/]+)/(.*)$', gcs_path)
   119    if match is None or (match.group(2) == '' and not object_optional):
   120      raise ValueError(
   121          'GCS path must be in the form gs://<bucket>/<object>. '
   122          f'Encountered {gcs_path!r}')
   123    return match.group(1), match.group(2)
   124  
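        # A quick sketch of parse_gcs_path behavior (paths are placeholders):
        #
        #   parse_gcs_path('gs://my-bucket/path/to/obj')
        #   # -> ('my-bucket', 'path/to/obj')
        #   parse_gcs_path('gs://my-bucket/', object_optional=True)
        #   # -> ('my-bucket', '')
        #   parse_gcs_path('gs://my-bucket/')  # raises ValueError (empty object)
        #   parse_gcs_path('/local/path')      # raises ValueError (not a gs:// path)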
   125  
   126  def default_gcs_bucket_name(project, region):
   127    from hashlib import md5
   128    return 'dataflow-staging-%s-%s' % (
   129        region, md5(project.encode('utf8')).hexdigest())
   130  
   131  
   132  def get_or_create_default_gcs_bucket(options):
   133    """Gets or creates the default GCS bucket for this project."""
   134    if getattr(options, 'dataflow_kms_key', None):
   135      _LOGGER.warning(
   136          'Cannot create a default bucket when --dataflow_kms_key is set.')
   137      return None
   138  
   139    project = getattr(options, 'project', None)
   140    region = getattr(options, 'region', None)
   141    if not project or not region:
   142      return None
   143  
   144    bucket_name = default_gcs_bucket_name(project, region)
   145    bucket = GcsIO(pipeline_options=options).get_bucket(bucket_name)
   146    if bucket:
   147      return bucket
   148    else:
   149      _LOGGER.warning(
   150          'Creating default GCS bucket for project %s: gs://%s',
   151          project,
   152          bucket_name)
   153      return GcsIO(pipeline_options=options).create_bucket(
   154          bucket_name, project, location=region)
   155  
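        # A minimal sketch of resolving the default staging bucket from pipeline
        # options; the project and region values are placeholders:
        #
        #   options = PipelineOptions(project='my-project', region='us-central1')
        #   bucket = get_or_create_default_gcs_bucket(options)
        #   if bucket is not None:
        #     print(bucket.name)  # dataflow-staging-us-central1-<md5 of project>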
   156  
   157  class GcsIOError(IOError, retry.PermanentException):
   158    """GCS IO error that should not be retried."""
   159    pass
   160  
   161  
   162  class GcsIO(object):
   163    """Google Cloud Storage I/O client."""
   164    def __init__(self, storage_client=None, pipeline_options=None):
   165      # type: (Optional[storage.StorageV1], Optional[Union[dict, PipelineOptions]]) -> None
   166      if storage_client is None:
   167        if not pipeline_options:
   168          pipeline_options = PipelineOptions()
   169        elif isinstance(pipeline_options, dict):
   170          pipeline_options = PipelineOptions.from_dictionary(pipeline_options)
   171        storage_client = storage.StorageV1(
   172            credentials=auth.get_service_credentials(pipeline_options),
   173            get_credentials=False,
   174            http=get_new_http(),
   175            response_encoding='utf8',
   176            additional_http_headers={
   177                "User-Agent": "apache-beam-%s" % apache_beam.__version__
   178            })
   179      self.client = storage_client
   180      self._rewrite_cb = None
   181      self.bucket_to_project_number = {}
   182  
   183    def get_project_number(self, bucket):
   184      if bucket not in self.bucket_to_project_number:
   185        bucket_metadata = self.get_bucket(bucket_name=bucket)
   186        if bucket_metadata:
   187          self.bucket_to_project_number[bucket] = bucket_metadata.projectNumber
   188        # else: we failed to load the bucket metadata due to an HttpError.
   189  
   190      return self.bucket_to_project_number.get(bucket, None)
   191  
   192    def _set_rewrite_response_callback(self, callback):
   193      """For testing purposes only. No backward compatibility guarantees.
   194  
   195      Args:
   196        callback: A function that receives ``storage.RewriteResponse``.
   197      """
   198      self._rewrite_cb = callback
   199  
   200    def get_bucket(self, bucket_name):
   201      """Returns a bucket object from its name, or None if it does not exist."""
   202      try:
   203        request = storage.StorageBucketsGetRequest(bucket=bucket_name)
   204        return self.client.buckets.Get(request)
   205      except HttpError:
   206        return None
   207  
   208    def create_bucket(self, bucket_name, project, kms_key=None, location=None):
   209      """Create and return a GCS bucket in a specific project."""
   210      encryption = None
   211      if kms_key:
   212        encryption = storage.Bucket.EncryptionValue(kms_key)
   213  
   214      request = storage.StorageBucketsInsertRequest(
   215          bucket=storage.Bucket(
   216              name=bucket_name, location=location, encryption=encryption),
   217          project=project,
   218      )
   219      try:
   220        return self.client.buckets.Insert(request)
   221      except HttpError:
   222        return None
   223  
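          # A minimal get-or-create sketch for buckets; names are placeholders and
          # the caller needs storage.buckets.get/create permissions:
          #
          #   gcs = GcsIO()
          #   bucket = gcs.get_bucket('my-bucket')
          #   if bucket is None:
          #     bucket = gcs.create_bucket('my-bucket', 'my-project', location='US')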
   224    def open(
   225        self,
   226        filename,
   227        mode='r',
   228        read_buffer_size=DEFAULT_READ_BUFFER_SIZE,
   229        mime_type='application/octet-stream'):
   230      """Open a GCS file path for reading or writing.
   231  
   232      Args:
   233        filename (str): GCS file path in the form ``gs://<bucket>/<object>``.
   234        mode (str): ``'r'`` for reading or ``'w'`` for writing.
   235        read_buffer_size (int): Buffer size to use during read operations.
   236        mime_type (str): Mime type to set for write operations.
   237  
   238      Returns:
   239        GCS file object.
   240  
   241      Raises:
   242        ValueError: Invalid open file mode.
   243      """
   244      if mode == 'r' or mode == 'rb':
   245        downloader = GcsDownloader(
   246            self.client,
   247            filename,
   248            buffer_size=read_buffer_size,
   249            get_project_number=self.get_project_number)
   250        return io.BufferedReader(
   251            DownloaderStream(
   252                downloader, read_buffer_size=read_buffer_size, mode=mode),
   253            buffer_size=read_buffer_size)
   254      elif mode == 'w' or mode == 'wb':
   255        uploader = GcsUploader(
   256            self.client,
   257            filename,
   258            mime_type,
   259            get_project_number=self.get_project_number)
   260        return io.BufferedWriter(
   261            UploaderStream(uploader, mode=mode), buffer_size=128 * 1024)
   262      else:
   263        raise ValueError('Invalid file open mode: %s.' % mode)
   264  
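          # A minimal round-trip sketch for open(); the path is a placeholder and
          # both modes yield binary file objects:
          #
          #   gcs = GcsIO()
          #   with gcs.open('gs://my-bucket/notes.txt', 'w') as f:
          #     f.write(b'hello world')
          #   with gcs.open('gs://my-bucket/notes.txt', 'r') as f:
          #     assert f.read() == b'hello world'
          #   gcs.open('gs://my-bucket/notes.txt', 'a')  # raises ValueError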
   265    @retry.with_exponential_backoff(
   266        retry_filter=retry.retry_on_server_errors_and_timeout_filter)
   267    def delete(self, path):
   268      """Deletes the object at the given GCS path.
   269  
   270      Args:
   271        path: GCS file path pattern in the form gs://<bucket>/<name>.
   272      """
   273      bucket, object_path = parse_gcs_path(path)
   274      request = storage.StorageObjectsDeleteRequest(
   275          bucket=bucket, object=object_path)
   276      try:
   277        self.client.objects.Delete(request)
   278      except HttpError as http_error:
   279        if http_error.status_code == 404:
   280          # Return success when the file doesn't exist anymore for idempotency.
   281          return
   282        raise
   283  
   284    # We intentionally do not decorate this method with a retry, as retrying is
   285    # handled in BatchApiRequest.Execute().
   286    def delete_batch(self, paths):
   287      """Deletes the objects at the given GCS paths.
   288  
   289      Args:
   290        paths: List of GCS file path patterns in the form gs://<bucket>/<name>,
   291               not to exceed MAX_BATCH_OPERATION_SIZE in length.
   292  
   293      Returns: List of tuples of (path, exception) in the same order as the paths
   294               argument, where exception is None if the operation succeeded or
   295               the relevant exception if the operation failed.
   296      """
   297      if not paths:
   298        return []
   299  
   300      paths = iter(paths)
   301      result_statuses = []
   302      while True:
   303        paths_chunk = list(islice(paths, MAX_BATCH_OPERATION_SIZE))
   304        if not paths_chunk:
   305          break
   306        batch_request = BatchApiRequest(
   307            batch_url=GCS_BATCH_ENDPOINT,
   308            retryable_codes=retry.SERVER_ERROR_OR_TIMEOUT_CODES,
   309            response_encoding='utf-8')
   310        for path in paths_chunk:
   311          bucket, object_path = parse_gcs_path(path)
   312          request = storage.StorageObjectsDeleteRequest(
   313              bucket=bucket, object=object_path)
   314          batch_request.Add(self.client.objects, 'Delete', request)
   315        api_calls = batch_request.Execute(self.client._http)  # pylint: disable=protected-access
   316        for i, api_call in enumerate(api_calls):
   317          path = paths_chunk[i]
   318          exception = None
   319          if api_call.is_error:
   320            exception = api_call.exception
   321            # Return success when the file doesn't exist anymore for idempotency.
   322            if isinstance(exception, HttpError) and exception.status_code == 404:
   323              exception = None
   324          result_statuses.append((path, exception))
   325      return result_statuses
   326  
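          # A minimal sketch of inspecting delete_batch() results; paths are
          # placeholders:
          #
          #   gcs = GcsIO()
          #   results = gcs.delete_batch(
          #       ['gs://my-bucket/a', 'gs://my-bucket/b', 'gs://my-bucket/c'])
          #   failures = [(path, exc) for path, exc in results if exc is not None]
          #   # Missing objects (404) count as success, so failures holds only
          #   # genuine errors.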
   327    @retry.with_exponential_backoff(
   328        retry_filter=retry.retry_on_server_errors_and_timeout_filter)
   329    def copy(
   330        self,
   331        src,
   332        dest,
   333        dest_kms_key_name=None,
   334        max_bytes_rewritten_per_call=None):
   335      """Copies the given GCS object from src to dest.
   336  
   337      Args:
   338        src: GCS file path pattern in the form gs://<bucket>/<name>.
   339        dest: GCS file path pattern in the form gs://<bucket>/<name>.
   340        dest_kms_key_name: Experimental. No backwards compatibility guarantees.
   341          Encrypt dest with this Cloud KMS key. If None, will use dest bucket
   342          encryption defaults.
   343        max_bytes_rewritten_per_call: Experimental. No backwards compatibility
   344          guarantees. Each rewrite API call will return after this many bytes.
   345          Used for testing.
   346  
   347      Raises:
   348        TimeoutError: on timeout.
   349      """
   350      src_bucket, src_path = parse_gcs_path(src)
   351      dest_bucket, dest_path = parse_gcs_path(dest)
   352      request = storage.StorageObjectsRewriteRequest(
   353          sourceBucket=src_bucket,
   354          sourceObject=src_path,
   355          destinationBucket=dest_bucket,
   356          destinationObject=dest_path,
   357          destinationKmsKeyName=dest_kms_key_name,
   358          maxBytesRewrittenPerCall=max_bytes_rewritten_per_call)
   359      response = self.client.objects.Rewrite(request)
   360      while not response.done:
   361        _LOGGER.debug(
   362            'Rewrite progress: %d of %d bytes, %s to %s',
   363            response.totalBytesRewritten,
   364            response.objectSize,
   365            src,
   366            dest)
   367        request.rewriteToken = response.rewriteToken
   368        response = self.client.objects.Rewrite(request)
   369        if self._rewrite_cb is not None:
   370          self._rewrite_cb(response)
   371  
   372      _LOGGER.debug('Rewrite done: %s to %s', src, dest)
   373  
   374    # We intentionally do not decorate this method with a retry, as retrying is
   375    # handled in BatchApiRequest.Execute().
   376    def copy_batch(
   377        self,
   378        src_dest_pairs,
   379        dest_kms_key_name=None,
   380        max_bytes_rewritten_per_call=None):
   381      """Copies the given GCS object from src to dest.
   382  
   383      Args:
   384        src_dest_pairs: list of (src, dest) tuples of gs://<bucket>/<name> files
   385                        paths to copy from src to dest, not to exceed
   386                        MAX_BATCH_OPERATION_SIZE in length.
   387        dest_kms_key_name: Experimental. No backwards compatibility guarantees.
   388          Encrypt dest with this Cloud KMS key. If None, will use dest bucket
   389          encryption defaults.
   390        max_bytes_rewritten_per_call: Experimental. No backwards compatibility
   391          guarantees. Each rewrite call will return after this many bytes. Used
   392          primarily for testing.
   393  
   394      Returns: List of tuples of (src, dest, exception) in the same order as the
   395               src_dest_pairs argument, where exception is None if the operation
   396               succeeded or the relevant exception if the operation failed.
   397      """
   398      if not src_dest_pairs:
   399        return []
   400      pair_to_request = {}
   401      for pair in src_dest_pairs:
   402        src_bucket, src_path = parse_gcs_path(pair[0])
   403        dest_bucket, dest_path = parse_gcs_path(pair[1])
   404        request = storage.StorageObjectsRewriteRequest(
   405            sourceBucket=src_bucket,
   406            sourceObject=src_path,
   407            destinationBucket=dest_bucket,
   408            destinationObject=dest_path,
   409            destinationKmsKeyName=dest_kms_key_name,
   410            maxBytesRewrittenPerCall=max_bytes_rewritten_per_call)
   411        pair_to_request[pair] = request
   412      pair_to_status = {}
   413      while True:
   414        pairs_in_batch = list(set(src_dest_pairs) - set(pair_to_status))
   415        if not pairs_in_batch:
   416          break
   417        batch_request = BatchApiRequest(
   418            batch_url=GCS_BATCH_ENDPOINT,
   419            retryable_codes=retry.SERVER_ERROR_OR_TIMEOUT_CODES,
   420            response_encoding='utf-8')
   421        for pair in pairs_in_batch:
   422          batch_request.Add(self.client.objects, 'Rewrite', pair_to_request[pair])
   423        api_calls = batch_request.Execute(self.client._http)  # pylint: disable=protected-access
   424        for pair, api_call in zip(pairs_in_batch, api_calls):
   425          src, dest = pair
   426          response = api_call.response
   427          if self._rewrite_cb is not None:
   428            self._rewrite_cb(response)
   429          if api_call.is_error:
   430            exception = api_call.exception
   431            # Translate 404 to the appropriate not found exception.
   432            if isinstance(exception, HttpError) and exception.status_code == 404:
   433              exception = (
   434                  GcsIOError(errno.ENOENT, 'Source file not found: %s' % src))
   435            pair_to_status[pair] = exception
   436          elif not response.done:
   437            _LOGGER.debug(
   438                'Rewrite progress: %d of %d bytes, %s to %s',
   439                response.totalBytesRewritten,
   440                response.objectSize,
   441                src,
   442                dest)
   443            pair_to_request[pair].rewriteToken = response.rewriteToken
   444          else:
   445            _LOGGER.debug('Rewrite done: %s to %s', src, dest)
   446            pair_to_status[pair] = None
   447  
   448      return [(pair[0], pair[1], pair_to_status[pair]) for pair in src_dest_pairs]
   449  
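          # A minimal sketch of inspecting copy_batch() results; paths are
          # placeholders:
          #
          #   gcs = GcsIO()
          #   pairs = [('gs://my-bucket/src1', 'gs://my-bucket/dst1'),
          #            ('gs://my-bucket/src2', 'gs://my-bucket/dst2')]
          #   for src, dest, exc in gcs.copy_batch(pairs):
          #     if exc is not None:
          #       # A missing source surfaces as GcsIOError(errno.ENOENT, ...).
          #       _LOGGER.warning('Copy failed %s -> %s: %s', src, dest, exc)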
   450    # We intentionally do not decorate this method with a retry, since the
   451    # underlying copy operations are already idempotent and protected by
   452    # retry decorators.
   453    def copytree(self, src, dest):
   454      """Copies the given GCS "directory" recursively from src to dest.
   455  
   456      Args:
   457        src: GCS file path pattern in the form gs://<bucket>/<name>/.
   458        dest: GCS file path pattern in the form gs://<bucket>/<name>/.
   459      """
   460      assert src.endswith('/')
   461      assert dest.endswith('/')
   462      for entry in self.list_prefix(src):
   463        rel_path = entry[len(src):]
   464        self.copy(entry, dest + rel_path)
   465  
   466    # We intentionally do not decorate this method with a retry, since the
   467    # underlying copy and delete operations are already idempotent operations
   468    # protected by retry decorators.
   469    def rename(self, src, dest):
   470      """Renames the given GCS object from src to dest.
   471  
   472      Args:
   473        src: GCS file path pattern in the form gs://<bucket>/<name>.
   474        dest: GCS file path pattern in the form gs://<bucket>/<name>.
   475      """
   476      self.copy(src, dest)
   477      self.delete(src)
   478  
   479    def exists(self, path):
   480      """Returns whether the given GCS object exists.
   481  
   482      Args:
   483        path: GCS file path pattern in the form gs://<bucket>/<name>.
   484      """
   485      try:
   486        self._gcs_object(path)  # Raises HttpError if the object is missing.
   487        return True
   488      except HttpError as http_error:
   489        if http_error.status_code == 404:
   490          # HTTP 404 indicates that the file does not exist.
   491          return False
   492        else:
   493          # We re-raise all other exceptions
   494          raise
   495  
   496    def checksum(self, path):
   497      """Looks up the checksum of a GCS object.
   498  
   499      Args:
   500        path: GCS file path pattern in the form gs://<bucket>/<name>.
   501      """
   502      return self._gcs_object(path).crc32c
   503  
   504    def size(self, path):
   505      """Returns the size of a single GCS object.
   506  
   507      This method does not perform glob expansion. Hence the given path must be
   508      for a single GCS object.
   509  
   510      Returns: size of the GCS object in bytes.
   511      """
   512      return self._gcs_object(path).size
   513  
   514    def kms_key(self, path):
   515      """Returns the KMS key of a single GCS object.
   516  
   517      This method does not perform glob expansion. Hence the given path must be
   518      for a single GCS object.
   519  
   520      Returns: KMS key name of the GCS object as a string, or None if it doesn't
   521        have one.
   522      """
   523      return self._gcs_object(path).kmsKeyName
   524  
   525    def last_updated(self, path):
   526      """Returns the last updated epoch time of a single GCS object.
   527  
   528      This method does not perform glob expansion. Hence the given path must be
   529      for a single GCS object.
   530  
   531      Returns: last updated time of the GCS object in seconds.
   532      """
   533      return self._updated_to_seconds(self._gcs_object(path).updated)
   534  
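          # A minimal sketch of the per-object metadata helpers; the path is a
          # placeholder and each helper issues a metadata GET:
          #
          #   gcs = GcsIO()
          #   path = 'gs://my-bucket/notes.txt'
          #   if gcs.exists(path):
          #     gcs.size(path)          # size in bytes
          #     gcs.checksum(path)      # CRC32C checksum reported by GCS
          #     gcs.last_updated(path)  # last update time as epoch seconds (float)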
   535    def _status(self, path):
   536      """For internal use only; no backwards-compatibility guarantees.
   537  
   538      Returns supported fields (checksum, kms_key, last_updated, size) of a
   539      single object as a dict at once.
   540  
   541      This method does not perform glob expansion. Hence the given path must be
   542      for a single GCS object.
   543  
   544      Returns: dict of fields of the GCS object.
   545      """
   546      gcs_object = self._gcs_object(path)
   547      file_status = {}
   548      if hasattr(gcs_object, 'crc32c'):
   549        file_status['checksum'] = gcs_object.crc32c
   550      if hasattr(gcs_object, 'kmsKeyName'):
   551        file_status['kms_key'] = gcs_object.kmsKeyName
   552      if hasattr(gcs_object, 'updated'):
   553        file_status['last_updated'] = self._updated_to_seconds(gcs_object.updated)
   554      if hasattr(gcs_object, 'size'):
   555        file_status['size'] = gcs_object.size
   556      return file_status
   557  
   558    @retry.with_exponential_backoff(
   559        retry_filter=retry.retry_on_server_errors_and_timeout_filter)
   560    def _gcs_object(self, path):
   561      """Returns the GCS object metadata for the given path.
   562  
   563      This method does not perform glob expansion. Hence the given path must be
   564      for a single GCS object.
   565  
   566      Returns: GCS object.
   567      """
   568      bucket, object_path = parse_gcs_path(path)
   569      request = storage.StorageObjectsGetRequest(
   570          bucket=bucket, object=object_path)
   571      return self.client.objects.Get(request)
   572  
   573    @deprecated(since='2.45.0', current='list_files')
   574    def list_prefix(self, path, with_metadata=False):
   575      """Lists files matching the prefix.
   576  
   577      ``list_prefix`` has been deprecated. Use ``list_files`` instead, which
   578      returns a generator of file information instead of a dict.
   579  
   580      Args:
   581        path: GCS file path pattern in the form gs://<bucket>/[name].
   582        with_metadata: Experimental. Specify whether to return file metadata.
   583  
   584      Returns:
   585        If ``with_metadata`` is False: dict of file name -> size; if
   586          ``with_metadata`` is True: dict of file name -> tuple(size, timestamp).
   587      """
   588      file_info = {}
   589      for file_metadata in self.list_files(path, with_metadata):
   590        file_info[file_metadata[0]] = file_metadata[1]
   591  
   592      return file_info
   593  
   594    def list_files(self, path, with_metadata=False):
   595      """Lists files matching the prefix.
   596  
   597      Args:
   598        path: GCS file path pattern in the form gs://<bucket>/[name].
   599        with_metadata: Experimental. Specify whether to return file metadata.
   600  
   601      Returns:
   602        If ``with_metadata`` is False: generator of tuple(file name, size); if
   603        ``with_metadata`` is True: generator of
   604        tuple(file name, tuple(size, timestamp)).
   605      """
   606      bucket, prefix = parse_gcs_path(path, object_optional=True)
   607      request = storage.StorageObjectsListRequest(bucket=bucket, prefix=prefix)
   608      file_info = set()
   609      counter = 0
   610      start_time = time.time()
   611      if with_metadata:
   612        _LOGGER.debug("Starting to collect file information for the input")
   613      else:
   614        _LOGGER.debug("Starting the size estimation of the input")
   615      while True:
   616        response = retry.with_exponential_backoff(
   617            retry_filter=retry.retry_on_server_errors_and_timeout_filter)(
   618                self.client.objects.List)(
   619                    request)
   620  
   621        for item in response.items:
   622          file_name = 'gs://%s/%s' % (item.bucket, item.name)
   623          if file_name not in file_info:
   624            file_info.add(file_name)
   625            counter += 1
   626            if counter % 10000 == 0:
   627              if with_metadata:
   628                _LOGGER.info(
   629                    "Finished computing file information for: %s files",
   630                    len(file_info))
   631              else:
   632                _LOGGER.info(
   633                    "Finished computing size of: %s files", len(file_info))
   634  
   635            if with_metadata:
   636              yield file_name, (item.size, self._updated_to_seconds(item.updated))
   637            else:
   638              yield file_name, item.size
   639  
   640        if response.nextPageToken:
   641          request.pageToken = response.nextPageToken
   642        else:
   643          break
   644      _LOGGER.log(
   645          # do not spam logs when list_prefix is likely used to check an empty folder
   646          logging.INFO if counter > 0 else logging.DEBUG,
   647          "Finished listing %s files in %s seconds.",
   648          counter,
   649          time.time() - start_time)
   650  
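          # A minimal sketch of consuming the list_files() generator; the prefix
          # is a placeholder:
          #
          #   gcs = GcsIO()
          #   for file_name, size in gcs.list_files('gs://my-bucket/logs/'):
          #     print(file_name, size)
          #   # With metadata, each value is a (size, updated_epoch_seconds) pair:
          #   for file_name, (size, updated) in gcs.list_files(
          #       'gs://my-bucket/logs/', with_metadata=True):
          #     print(file_name, size, updated)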
   651    @staticmethod
   652    def _updated_to_seconds(updated):
   653      """Helper to transform the updated field of a response to seconds."""
   654      return (
   655          time.mktime(updated.timetuple()) - time.timezone +
   656          updated.microsecond / 1000000.0)
   657  
   658  
   659  class GcsDownloader(Downloader):
   660    def __init__(self, client, path, buffer_size, get_project_number):
   661      self._client = client
   662      self._path = path
   663      self._bucket, self._name = parse_gcs_path(path)
   664      self._buffer_size = buffer_size
   665      self._get_project_number = get_project_number
   666  
   667      # Create a request count metric
   668      resource = resource_identifiers.GoogleCloudStorageBucket(self._bucket)
   669      labels = {
   670          monitoring_infos.SERVICE_LABEL: 'Storage',
   671          monitoring_infos.METHOD_LABEL: 'Objects.get',
   672          monitoring_infos.RESOURCE_LABEL: resource,
   673          monitoring_infos.GCS_BUCKET_LABEL: self._bucket
   674      }
   675      project_number = self._get_project_number(self._bucket)
   676      if project_number:
   677        labels[monitoring_infos.GCS_PROJECT_ID_LABEL] = str(project_number)
   678      else:
   679        _LOGGER.debug(
   680            'Possibly missing storage.buckets.get permission to '
   681            'bucket %s. Label %s is not added to the counter because it '
   682            'cannot be identified.',
   683            self._bucket,
   684            monitoring_infos.GCS_PROJECT_ID_LABEL)
   685  
   686      service_call_metric = ServiceCallMetric(
   687          request_count_urn=monitoring_infos.API_REQUEST_COUNT_URN,
   688          base_labels=labels)
   689  
   690      # Get object state.
   691      self._get_request = (
   692          storage.StorageObjectsGetRequest(
   693              bucket=self._bucket, object=self._name))
   694      try:
   695        metadata = self._get_object_metadata(self._get_request)
   696      except HttpError as http_error:
   697        service_call_metric.call(http_error)
   698        if http_error.status_code == 404:
   699          raise IOError(errno.ENOENT, 'Not found: %s' % self._path)
   700        else:
   701          _LOGGER.error(
   702              'HTTP error while requesting file %s: %s', self._path, http_error)
   703          raise
   704      else:
   705        service_call_metric.call('ok')
   706  
   707      self._size = metadata.size
   708  
   709      # Ensure read is from file of the correct generation.
   710      self._get_request.generation = metadata.generation
   711  
   712      # Initialize read buffer state.
   713      self._download_stream = io.BytesIO()
   714      self._downloader = transfer.Download(
   715          self._download_stream,
   716          auto_transfer=False,
   717          chunksize=self._buffer_size,
   718          num_retries=20)
   719  
   720      try:
   721        self._client.objects.Get(self._get_request, download=self._downloader)
   722        service_call_metric.call('ok')
   723      except HttpError as e:
   724        service_call_metric.call(e)
   725        raise
   726  
   727    @retry.with_exponential_backoff(
   728        retry_filter=retry.retry_on_server_errors_and_timeout_filter)
   729    def _get_object_metadata(self, get_request):
   730      return self._client.objects.Get(get_request)
   731  
   732    @property
   733    def size(self):
   734      return self._size
   735  
   736    def get_range(self, start, end):
   737      self._download_stream.seek(0)
   738      self._download_stream.truncate(0)
   739      self._downloader.GetRange(start, end - 1)
   740      return self._download_stream.getvalue()
   741  
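        # GcsDownloader is internal plumbing used by GcsIO.open(..., 'r'):
        # get_range(start, end) fetches the half-open byte range [start, end) via
        # the apitools transfer.Download GetRange call. A rough sketch of how
        # open() assembles the read path (arguments are placeholders):
        #
        #   downloader = GcsDownloader(client, 'gs://my-bucket/obj',
        #                              buffer_size=DEFAULT_READ_BUFFER_SIZE,
        #                              get_project_number=lambda bucket: None)
        #   stream = DownloaderStream(downloader,
        #                             read_buffer_size=DEFAULT_READ_BUFFER_SIZE,
        #                             mode='rb')
        #   reader = io.BufferedReader(stream, buffer_size=DEFAULT_READ_BUFFER_SIZE)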
   742  
   743  class GcsUploader(Uploader):
   744    def __init__(self, client, path, mime_type, get_project_number):
   745      self._client = client
   746      self._path = path
   747      self._bucket, self._name = parse_gcs_path(path)
   748      self._mime_type = mime_type
   749      self._get_project_number = get_project_number
   750  
   751      # Set up communication with child thread.
   752      parent_conn, child_conn = multiprocessing.Pipe()
   753      self._child_conn = child_conn
   754      self._conn = parent_conn
   755  
   756      # Set up uploader.
   757      self._insert_request = (
   758          storage.StorageObjectsInsertRequest(
   759              bucket=self._bucket, name=self._name))
   760      self._upload = transfer.Upload(
   761          PipeStream(self._child_conn),
   762          self._mime_type,
   763          chunksize=WRITE_CHUNK_SIZE)
   764      self._upload.strategy = transfer.RESUMABLE_UPLOAD
   765  
   766      # Start uploading thread.
   767      self._upload_thread = threading.Thread(target=self._start_upload)
   768      self._upload_thread.daemon = True
   769      self._upload_thread.last_error = None
   770      self._upload_thread.start()
   771  
   772    # TODO(silviuc): Refactor so that retry logic can be applied.
   773    # There is retry logic in the underlying transfer library but we should make
   774    # it more explicit so we can control the retry parameters.
   775    @retry.no_retries  # Using no_retries marks this as an integration point.
   776    def _start_upload(self):
   777      # This runs in the uploader thread.  We are forced to run the uploader in
   778      # another thread because the apitools uploader insists on taking a stream
   779      # as input. Happily, this also means we get asynchronous I/O to GCS.
   780      #
   781      # The uploader by default transfers data in chunks of 1024 * 1024 bytes at
   782      # a time, buffering writes until that size is reached.
   783  
   784      project_number = self._get_project_number(self._bucket)
   785  
   786      # Create a request count metric
   787      resource = resource_identifiers.GoogleCloudStorageBucket(self._bucket)
   788      labels = {
   789          monitoring_infos.SERVICE_LABEL: 'Storage',
   790          monitoring_infos.METHOD_LABEL: 'Objects.insert',
   791          monitoring_infos.RESOURCE_LABEL: resource,
   792          monitoring_infos.GCS_BUCKET_LABEL: self._bucket,
   793          monitoring_infos.GCS_PROJECT_ID_LABEL: str(project_number)
   794      }
   795      service_call_metric = ServiceCallMetric(
   796          request_count_urn=monitoring_infos.API_REQUEST_COUNT_URN,
   797          base_labels=labels)
   798      try:
   799        self._client.objects.Insert(self._insert_request, upload=self._upload)
   800        service_call_metric.call('ok')
   801      except Exception as e:  # pylint: disable=broad-except
   802        service_call_metric.call(e)
   803        _LOGGER.error(
   804            'Error in _start_upload while inserting file %s: %s',
   805            self._path,
   806            traceback.format_exc())
   807        self._upload_thread.last_error = e
   808      finally:
   809        self._child_conn.close()
   810  
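          # The pipe/thread pattern used above, sketched with stand-ins for the
          # apitools upload (runnable on its own with just the stdlib):
          #
          #   import multiprocessing
          #   import threading
          #
          #   parent_conn, child_conn = multiprocessing.Pipe()
          #
          #   def drain():  # plays the role of _start_upload
          #     try:
          #       while True:
          #         chunk = child_conn.recv_bytes()  # blocks like PipeStream.read
          #     except EOFError:
          #       pass  # writer closed the pipe; the "upload" is complete
          #
          #   worker = threading.Thread(target=drain, daemon=True)
          #   worker.start()
          #   parent_conn.send_bytes(b'chunk-1')  # put() does this with data bytes
          #   parent_conn.close()                 # finish() closes and joins
          #   worker.join()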
   811    def put(self, data):
   812      try:
   813        self._conn.send_bytes(data.tobytes())
   814      except EOFError:
   815        if self._upload_thread.last_error is not None:
   816          raise self._upload_thread.last_error  # pylint: disable=raising-bad-type
   817        raise
   818  
   819    def finish(self):
   820      self._conn.close()
   821      # TODO(udim): Add timeout=DEFAULT_HTTP_TIMEOUT_SECONDS * 2 and raise if
   822      # is_alive() is True.
   823      self._upload_thread.join()
   824      # Check for exception since the last put() call.
   825      if self._upload_thread.last_error is not None:
   826        raise type(self._upload_thread.last_error)(
   827            "Error while uploading file %s: %s",
   828            self._path,
   829            self._upload_thread.last_error.message)  # pylint: disable=raising-bad-type