github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/io/gcp/gcsio_test.py (about)

     1  #
     2  # Licensed to the Apache Software Foundation (ASF) under one or more
     3  # contributor license agreements.  See the NOTICE file distributed with
     4  # this work for additional information regarding copyright ownership.
     5  # The ASF licenses this file to You under the Apache License, Version 2.0
     6  # (the "License"); you may not use this file except in compliance with
     7  # the License.  You may obtain a copy of the License at
     8  #
     9  #    http://www.apache.org/licenses/LICENSE-2.0
    10  #
    11  # Unless required by applicable law or agreed to in writing, software
    12  # distributed under the License is distributed on an "AS IS" BASIS,
    13  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  # See the License for the specific language governing permissions and
    15  # limitations under the License.
    16  #
    17  
    18  """Tests for Google Cloud Storage client."""
    19  # pytype: skip-file
    20  
    21  import datetime
    22  import errno
    23  import io
    24  import logging
    25  import os
    26  import random
    27  import time
    28  import unittest
    29  from email.message import Message
    30  
    31  import httplib2
    32  import mock
    33  
    34  # Protect against environments where apitools library is not available.
    35  # pylint: disable=wrong-import-order, wrong-import-position
    36  from apache_beam.metrics import monitoring_infos
    37  from apache_beam.metrics.execution import MetricsEnvironment
    38  from apache_beam.metrics.metricbase import MetricName
    39  
    40  try:
    41    from apache_beam.io.gcp import gcsio, resource_identifiers
    42    from apache_beam.io.gcp.internal.clients import storage
    43    from apitools.base.py.exceptions import HttpError
    44  except ImportError:
    45    HttpError = None
    46  # pylint: enable=wrong-import-order, wrong-import-position
    47  
    48  DEFAULT_GCP_PROJECT = 'apache-beam-testing'
    49  DEFAULT_PROJECT_NUMBER = 1
    50  
    51  
    52  class FakeGcsClient(object):
    53    # Fake storage client.  Usage in gcsio.py is client.objects.Get(...) and
    54    # client.objects.Insert(...).
    55  
    56    def __init__(self):
    57      self.objects = FakeGcsObjects()
    58      self.buckets = FakeGcsBuckets()
    59      # Referenced in GcsIO.copy_batch() and GcsIO.delete_batch().
    60      self._http = object()
    61  
    62  
    63  class FakeFile(object):
    64    def __init__(
    65        self, bucket, obj, contents, generation, crc32c=None, last_updated=None):
    66      self.bucket = bucket
    67      self.object = obj
    68      self.contents = contents
    69      self.generation = generation
    70      self.crc32c = crc32c
    71      self.last_updated = last_updated
    72  
    73    def get_metadata(self):
    74      last_updated_datetime = None
    75      if self.last_updated:
    76        last_updated_datetime = datetime.datetime.utcfromtimestamp(
    77            self.last_updated)
    78  
    79      return storage.Object(
    80          bucket=self.bucket,
    81          name=self.object,
    82          generation=self.generation,
    83          size=len(self.contents),
    84          crc32c=self.crc32c,
    85          updated=last_updated_datetime)
    86  
    87  
    88  class FakeGcsBuckets(object):
    89    def __init__(self):
    90      pass
    91  
    92    def get_bucket(self, bucket):
    93      return storage.Bucket(name=bucket, projectNumber=DEFAULT_PROJECT_NUMBER)
    94  
    95    def Get(self, get_request):
    96      return self.get_bucket(get_request.bucket)
    97  
    98  
class FakeGcsObjects(object):
  """Fake GCS object service backed by an in-memory dict of FakeFiles."""

  def __init__(self):
    # Maps (bucket, object name) -> FakeFile.
    self.files = {}
    # Store the last generation used for a given object name.  Note that this
    # has to persist even past the deletion of the object.
    self.last_generation = {}
    # Maps page token -> start index for paginated List() calls.
    self.list_page_tokens = {}
    # Files whose metadata fetch / media read should fail with HTTP 429.
    self._fail_when_getting_metadata = []
    self._fail_when_reading = []

  def add_file(
      self, f, fail_when_getting_metadata=False, fail_when_reading=False):
    """Registers FakeFile f, optionally marking it to fail on access."""
    self.files[(f.bucket, f.object)] = f
    self.last_generation[(f.bucket, f.object)] = f.generation
    if fail_when_getting_metadata:
      self._fail_when_getting_metadata.append(f)
    if fail_when_reading:
      self._fail_when_reading.append(f)

  def get_file(self, bucket, obj):
    """Returns the FakeFile for (bucket, obj), or None if absent."""
    return self.files.get((bucket, obj), None)

  def delete_file(self, bucket, obj):
    # Raises KeyError if the file does not exist.
    del self.files[(bucket, obj)]

  def get_last_generation(self, bucket, obj):
    """Returns the last generation used for (bucket, obj), or 0 if unused."""
    return self.last_generation.get((bucket, obj), 0)

  def Get(self, get_request, download=None):  # pylint: disable=invalid-name
    """Metadata Get (download=None) or media download (download given)."""
    f = self.get_file(get_request.bucket, get_request.object)
    if f is None:
      # Failing with an HTTP 404 if file does not exist.
      raise HttpError({'status': 404}, None, None)
    if download is None:
      if f in self._fail_when_getting_metadata:
        raise HttpError({'status': 429}, None, None)
      return f.get_metadata()
    else:
      if f in self._fail_when_reading:
        raise HttpError({'status': 429}, None, None)
      stream = download.stream

      def get_range_callback(start, end):
        # Serves the inclusive byte range [start, end] from file contents.
        if not 0 <= start <= end < len(f.contents):
          raise ValueError(
              'start=%d end=%d len=%s' % (start, end, len(f.contents)))
        stream.write(f.contents[start:end + 1])

      download.GetRange = get_range_callback

  def Insert(self, insert_request, upload=None):  # pylint: disable=invalid-name
    """Upload: drains the stream into a new FakeFile at the next generation."""
    assert upload is not None
    generation = self.get_last_generation(
        insert_request.bucket, insert_request.name) + 1
    f = FakeFile(insert_request.bucket, insert_request.name, b'', generation)

    # Stream data into file.
    stream = upload.stream
    data_list = []
    while True:
      data = stream.read(1024 * 1024)
      if not data:
        break
      data_list.append(data)
    f.contents = b''.join(data_list)

    self.add_file(f)

  # Token returned by the first Rewrite() call; a follow-up call presenting
  # it completes the rewrite.
  REWRITE_TOKEN = 'test_token'

  def Rewrite(self, rewrite_request):  # pylint: disable=invalid-name
    """Two-step rewrite: the first call copies the object and returns a
    token; the second call (with the token) reports completion."""
    if rewrite_request.rewriteToken == self.REWRITE_TOKEN:
      dest_object = storage.Object()
      return storage.RewriteResponse(
          done=True,
          objectSize=100,
          resource=dest_object,
          totalBytesRewritten=100)

    src_file = self.get_file(
        rewrite_request.sourceBucket, rewrite_request.sourceObject)
    if not src_file:
      raise HttpError(
          httplib2.Response({'status': '404'}),
          '404 Not Found',
          'https://fake/url')
    generation = self.get_last_generation(
        rewrite_request.destinationBucket,
        rewrite_request.destinationObject) + 1
    dest_file = FakeFile(
        rewrite_request.destinationBucket,
        rewrite_request.destinationObject,
        src_file.contents,
        generation)
    self.add_file(dest_file)
    time.sleep(10)  # time.sleep and time.time are mocked below.
    return storage.RewriteResponse(
        done=False,
        objectSize=100,
        rewriteToken=self.REWRITE_TOKEN,
        totalBytesRewritten=5)

  def Delete(self, delete_request):  # pylint: disable=invalid-name
    # Here, we emulate the behavior of the GCS service in raising a 404 error
    # if this object does not exist.
    if self.get_file(delete_request.bucket, delete_request.object):
      self.delete_file(delete_request.bucket, delete_request.object)
    else:
      raise HttpError(
          httplib2.Response({'status': '404'}),
          '404 Not Found',
          'https://fake/url')

  def List(self, list_request):  # pylint: disable=invalid-name
    """Lists objects under a prefix, paginated five items per page."""
    bucket = list_request.bucket
    prefix = list_request.prefix or ''
    matching_files = []
    for file_bucket, file_name in sorted(iter(self.files)):
      if bucket == file_bucket and file_name.startswith(prefix):
        file_object = self.files[(file_bucket, file_name)].get_metadata()
        matching_files.append(file_object)

    # Handle pagination.
    items_per_page = 5
    if not list_request.pageToken:
      range_start = 0
    else:
      # Page tokens are single-use: consumed (deleted) on lookup.
      if list_request.pageToken not in self.list_page_tokens:
        raise ValueError('Invalid page token.')
      range_start = self.list_page_tokens[list_request.pageToken]
      del self.list_page_tokens[list_request.pageToken]

    result = storage.Objects(
        items=matching_files[range_start:range_start + items_per_page])
    if range_start + items_per_page < len(matching_files):
      next_range_start = range_start + items_per_page
      next_page_token = '_page_token_%s_%s_%d' % (
          bucket, prefix, next_range_start)
      self.list_page_tokens[next_page_token] = next_range_start
      result.nextPageToken = next_page_token
    return result
   240  
   241  
   242  class FakeApiCall(object):
   243    def __init__(self, exception, response):
   244      self.exception = exception
   245      self.is_error = exception is not None
   246      # Response for Rewrite:
   247      self.response = response
   248  
   249  
   250  class FakeBatchApiRequest(object):
   251    def __init__(self, **unused_kwargs):
   252      self.operations = []
   253  
   254    def Add(self, service, method, request):  # pylint: disable=invalid-name
   255      self.operations.append((service, method, request))
   256  
   257    def Execute(self, unused_http, **unused_kwargs):  # pylint: disable=invalid-name
   258      api_calls = []
   259      for service, method, request in self.operations:
   260        exception = None
   261        response = None
   262        try:
   263          response = getattr(service, method)(request)
   264        except Exception as e:  # pylint: disable=broad-except
   265          exception = e
   266        api_calls.append(FakeApiCall(exception, response))
   267      return api_calls
   268  
   269  
   270  @unittest.skipIf(HttpError is None, 'GCP dependencies are not installed')
   271  class TestGCSPathParser(unittest.TestCase):
   272  
   273    BAD_GCS_PATHS = [
   274        'gs://',
   275        'gs://bucket',
   276        'gs:///name',
   277        'gs:///',
   278        'gs:/blah/bucket/name',
   279    ]
   280  
   281    def test_gcs_path(self):
   282      self.assertEqual(
   283          gcsio.parse_gcs_path('gs://bucket/name'), ('bucket', 'name'))
   284      self.assertEqual(
   285          gcsio.parse_gcs_path('gs://bucket/name/sub'), ('bucket', 'name/sub'))
   286  
   287    def test_bad_gcs_path(self):
   288      for path in self.BAD_GCS_PATHS:
   289        self.assertRaises(ValueError, gcsio.parse_gcs_path, path)
   290      self.assertRaises(ValueError, gcsio.parse_gcs_path, 'gs://bucket/')
   291  
   292    def test_gcs_path_object_optional(self):
   293      self.assertEqual(
   294          gcsio.parse_gcs_path('gs://bucket/name', object_optional=True),
   295          ('bucket', 'name'))
   296      self.assertEqual(
   297          gcsio.parse_gcs_path('gs://bucket/', object_optional=True),
   298          ('bucket', ''))
   299  
   300    def test_bad_gcs_path_object_optional(self):
   301      for path in self.BAD_GCS_PATHS:
   302        self.assertRaises(ValueError, gcsio.parse_gcs_path, path, True)
   303  
   304  
   305  class SampleOptions(object):
   306    def __init__(self, project, region, kms_key=None):
   307      self.project = DEFAULT_GCP_PROJECT
   308      self.region = region
   309      self.dataflow_kms_key = kms_key
   310  
   311  
   312  @unittest.skipIf(HttpError is None, 'GCP dependencies are not installed')
   313  @mock.patch.multiple(
   314      'time', time=mock.MagicMock(side_effect=range(100)), sleep=mock.MagicMock())
   315  class TestGCSIO(unittest.TestCase):
   316    def _insert_random_file(
   317        self,
   318        client,
   319        path,
   320        size,
   321        generation=1,
   322        crc32c=None,
   323        last_updated=None,
   324        fail_when_getting_metadata=False,
   325        fail_when_reading=False):
   326      bucket, name = gcsio.parse_gcs_path(path)
   327      f = FakeFile(
   328          bucket,
   329          name,
   330          os.urandom(size),
   331          generation,
   332          crc32c=crc32c,
   333          last_updated=last_updated)
   334      client.objects.add_file(f, fail_when_getting_metadata, fail_when_reading)
   335      return f
   336  
   337    def setUp(self):
   338      self.client = FakeGcsClient()
   339      self.gcs = gcsio.GcsIO(self.client)
   340  
   341    def test_default_bucket_name(self):
   342      self.assertEqual(
   343          gcsio.default_gcs_bucket_name(DEFAULT_GCP_PROJECT, "us-central1"),
   344          'dataflow-staging-us-central1-77b801c0838aee13391c0d1885860494')
   345  
   346    def test_default_bucket_name_failure(self):
   347      self.assertEqual(
   348          gcsio.get_or_create_default_gcs_bucket(
   349              SampleOptions(
   350                  DEFAULT_GCP_PROJECT, "us-central1", kms_key="kmskey!")),
   351          None)
   352  
   353    def test_num_retries(self):
   354      # BEAM-7424: update num_retries accordingly if storage_client is
   355      # regenerated.
   356      self.assertEqual(gcsio.GcsIO().client.num_retries, 20)
   357  
   358    def test_retry_func(self):
   359      # BEAM-7667: update retry_func accordingly if storage_client is
   360      # regenerated.
   361      self.assertIsNotNone(gcsio.GcsIO().client.retry_func)
   362  
   363    def test_exists(self):
   364      file_name = 'gs://gcsio-test/dummy_file'
   365      file_size = 1234
   366      self._insert_random_file(self.client, file_name, file_size)
   367      self.assertFalse(self.gcs.exists(file_name + 'xyz'))
   368      self.assertTrue(self.gcs.exists(file_name))
   369  
   370    @mock.patch.object(FakeGcsObjects, 'Get')
   371    def test_exists_failure(self, mock_get):
   372      # Raising an error other than 404. Raising 404 is a valid failure for
   373      # exists() call.
   374      mock_get.side_effect = HttpError({'status': 400}, None, None)
   375      file_name = 'gs://gcsio-test/dummy_file'
   376      file_size = 1234
   377      self._insert_random_file(self.client, file_name, file_size)
   378      with self.assertRaises(HttpError) as cm:
   379        self.gcs.exists(file_name)
   380      self.assertEqual(400, cm.exception.status_code)
   381  
   382    def test_checksum(self):
   383      file_name = 'gs://gcsio-test/dummy_file'
   384      file_size = 1234
   385      checksum = 'deadbeef'
   386      self._insert_random_file(self.client, file_name, file_size, crc32c=checksum)
   387      self.assertTrue(self.gcs.exists(file_name))
   388      self.assertEqual(checksum, self.gcs.checksum(file_name))
   389  
   390    def test_size(self):
   391      file_name = 'gs://gcsio-test/dummy_file'
   392      file_size = 1234
   393  
   394      self._insert_random_file(self.client, file_name, file_size)
   395      self.assertTrue(self.gcs.exists(file_name))
   396      self.assertEqual(1234, self.gcs.size(file_name))
   397  
   398    def test_last_updated(self):
   399      file_name = 'gs://gcsio-test/dummy_file'
   400      file_size = 1234
   401      last_updated = 123456.78
   402  
   403      self._insert_random_file(
   404          self.client, file_name, file_size, last_updated=last_updated)
   405      self.assertTrue(self.gcs.exists(file_name))
   406      self.assertEqual(last_updated, self.gcs.last_updated(file_name))
   407  
   408    def test_file_status(self):
   409      file_name = 'gs://gcsio-test/dummy_file'
   410      file_size = 1234
   411      last_updated = 123456.78
   412      checksum = 'deadbeef'
   413  
   414      self._insert_random_file(
   415          self.client,
   416          file_name,
   417          file_size,
   418          last_updated=last_updated,
   419          crc32c=checksum)
   420      file_checksum = self.gcs.checksum(file_name)
   421  
   422      file_status = self.gcs._status(file_name)
   423  
   424      self.assertEqual(file_status['size'], file_size)
   425      self.assertEqual(file_status['checksum'], file_checksum)
   426      self.assertEqual(file_status['last_updated'], last_updated)
   427  
   428    def test_file_mode(self):
   429      file_name = 'gs://gcsio-test/dummy_mode_file'
   430      with self.gcs.open(file_name, 'wb') as f:
   431        assert f.mode == 'wb'
   432      with self.gcs.open(file_name, 'rb') as f:
   433        assert f.mode == 'rb'
   434  
   435    def test_bad_file_modes(self):
   436      file_name = 'gs://gcsio-test/dummy_mode_file'
   437      with self.assertRaises(ValueError):
   438        self.gcs.open(file_name, 'w+')
   439      with self.assertRaises(ValueError):
   440        self.gcs.open(file_name, 'r+b')
   441  
   442    def test_empty_batches(self):
   443      self.assertEqual([], self.gcs.copy_batch([]))
   444      self.assertEqual([], self.gcs.delete_batch([]))
   445  
   446    def test_delete(self):
   447      file_name = 'gs://gcsio-test/delete_me'
   448      file_size = 1024
   449  
   450      # Test deletion of non-existent file.
   451      self.gcs.delete(file_name)
   452  
   453      self._insert_random_file(self.client, file_name, file_size)
   454      self.assertTrue(
   455          gcsio.parse_gcs_path(file_name) in self.client.objects.files)
   456  
   457      self.gcs.delete(file_name)
   458  
   459      self.assertFalse(
   460          gcsio.parse_gcs_path(file_name) in self.client.objects.files)
   461  
  @mock.patch(
      'apache_beam.io.gcp.gcsio.auth.get_service_credentials',
      wraps=lambda pipeline_options: None)
  @mock.patch('apache_beam.io.gcp.gcsio.get_new_http')
  def test_user_agent_passed(self, get_new_http_mock, get_service_creds_mock):
    """Verifies that HTTP requests issued by GcsIO carry a Beam User-Agent."""
    client = gcsio.GcsIO()
    try:
      client.get_bucket('mabucket')
    except:  # pylint: disable=bare-except
      # Ignore errors. The errors come from the fact that we did not mock
      # the response from the API, so the overall get_bucket call fails
      # soon after the GCS API is called.
      pass
    # Inspect the second-to-last recorded request; call[2] is its kwargs dict.
    call = get_new_http_mock.return_value.request.mock_calls[-2]
    self.assertIn('apache-beam-', call[2]['headers']['User-Agent'])
   477  
   478    @mock.patch('apache_beam.io.gcp.gcsio.BatchApiRequest')
   479    def test_delete_batch(self, *unused_args):
   480      gcsio.BatchApiRequest = FakeBatchApiRequest
   481      file_name_pattern = 'gs://gcsio-test/delete_me_%d'
   482      file_size = 1024
   483      num_files = 10
   484  
   485      # Test deletion of non-existent files.
   486      result = self.gcs.delete_batch(
   487          [file_name_pattern % i for i in range(num_files)])
   488      self.assertTrue(result)
   489      for i, (file_name, exception) in enumerate(result):
   490        self.assertEqual(file_name, file_name_pattern % i)
   491        self.assertEqual(exception, None)
   492        self.assertFalse(self.gcs.exists(file_name_pattern % i))
   493  
   494      # Insert some files.
   495      for i in range(num_files):
   496        self._insert_random_file(self.client, file_name_pattern % i, file_size)
   497  
   498      # Check files inserted properly.
   499      for i in range(num_files):
   500        self.assertTrue(self.gcs.exists(file_name_pattern % i))
   501  
   502      # Execute batch delete.
   503      self.gcs.delete_batch([file_name_pattern % i for i in range(num_files)])
   504  
   505      # Check files deleted properly.
   506      for i in range(num_files):
   507        self.assertFalse(self.gcs.exists(file_name_pattern % i))
   508  
   509    def test_copy(self):
   510      src_file_name = 'gs://gcsio-test/source'
   511      dest_file_name = 'gs://gcsio-test/dest'
   512      file_size = 1024
   513      self._insert_random_file(self.client, src_file_name, file_size)
   514      self.assertTrue(
   515          gcsio.parse_gcs_path(src_file_name) in self.client.objects.files)
   516      self.assertFalse(
   517          gcsio.parse_gcs_path(dest_file_name) in self.client.objects.files)
   518  
   519      self.gcs.copy(src_file_name, dest_file_name, dest_kms_key_name='kms_key')
   520  
   521      self.assertTrue(
   522          gcsio.parse_gcs_path(src_file_name) in self.client.objects.files)
   523      self.assertTrue(
   524          gcsio.parse_gcs_path(dest_file_name) in self.client.objects.files)
   525  
   526      # Test copy of non-existent files.
   527      with self.assertRaisesRegex(HttpError, r'Not Found'):
   528        self.gcs.copy(
   529            'gs://gcsio-test/non-existent',
   530            'gs://gcsio-test/non-existent-destination')
   531  
   532    @mock.patch('apache_beam.io.gcp.gcsio.BatchApiRequest')
   533    def test_copy_batch(self, *unused_args):
   534      gcsio.BatchApiRequest = FakeBatchApiRequest
   535      from_name_pattern = 'gs://gcsio-test/copy_me_%d'
   536      to_name_pattern = 'gs://gcsio-test/destination_%d'
   537      file_size = 1024
   538      num_files = 10
   539  
   540      result = self.gcs.copy_batch([(from_name_pattern % i, to_name_pattern % i)
   541                                    for i in range(num_files)],
   542                                   dest_kms_key_name='kms_key')
   543      self.assertTrue(result)
   544      for i, (src, dest, exception) in enumerate(result):
   545        self.assertEqual(src, from_name_pattern % i)
   546        self.assertEqual(dest, to_name_pattern % i)
   547        self.assertTrue(isinstance(exception, IOError))
   548        self.assertEqual(exception.errno, errno.ENOENT)
   549        self.assertFalse(self.gcs.exists(from_name_pattern % i))
   550        self.assertFalse(self.gcs.exists(to_name_pattern % i))
   551  
   552      # Insert some files.
   553      for i in range(num_files):
   554        self._insert_random_file(self.client, from_name_pattern % i, file_size)
   555  
   556      # Check files inserted properly.
   557      for i in range(num_files):
   558        self.assertTrue(self.gcs.exists(from_name_pattern % i))
   559  
   560      # Execute batch copy.
   561      self.gcs.copy_batch([(from_name_pattern % i, to_name_pattern % i)
   562                           for i in range(num_files)])
   563  
   564      # Check files copied properly.
   565      for i in range(num_files):
   566        self.assertTrue(self.gcs.exists(from_name_pattern % i))
   567        self.assertTrue(self.gcs.exists(to_name_pattern % i))
   568  
   569    def test_copytree(self):
   570      src_dir_name = 'gs://gcsio-test/source/'
   571      dest_dir_name = 'gs://gcsio-test/dest/'
   572      file_size = 1024
   573      paths = ['a', 'b/c', 'b/d']
   574      for path in paths:
   575        src_file_name = src_dir_name + path
   576        dest_file_name = dest_dir_name + path
   577        self._insert_random_file(self.client, src_file_name, file_size)
   578        self.assertTrue(
   579            gcsio.parse_gcs_path(src_file_name) in self.client.objects.files)
   580        self.assertFalse(
   581            gcsio.parse_gcs_path(dest_file_name) in self.client.objects.files)
   582  
   583      self.gcs.copytree(src_dir_name, dest_dir_name)
   584  
   585      for path in paths:
   586        src_file_name = src_dir_name + path
   587        dest_file_name = dest_dir_name + path
   588        self.assertTrue(
   589            gcsio.parse_gcs_path(src_file_name) in self.client.objects.files)
   590        self.assertTrue(
   591            gcsio.parse_gcs_path(dest_file_name) in self.client.objects.files)
   592  
   593    def test_rename(self):
   594      src_file_name = 'gs://gcsio-test/source'
   595      dest_file_name = 'gs://gcsio-test/dest'
   596      file_size = 1024
   597      self._insert_random_file(self.client, src_file_name, file_size)
   598      self.assertTrue(
   599          gcsio.parse_gcs_path(src_file_name) in self.client.objects.files)
   600      self.assertFalse(
   601          gcsio.parse_gcs_path(dest_file_name) in self.client.objects.files)
   602  
   603      self.gcs.rename(src_file_name, dest_file_name)
   604  
   605      self.assertFalse(
   606          gcsio.parse_gcs_path(src_file_name) in self.client.objects.files)
   607      self.assertTrue(
   608          gcsio.parse_gcs_path(dest_file_name) in self.client.objects.files)
   609  
   610    def test_full_file_read(self):
   611      file_name = 'gs://gcsio-test/full_file'
   612      file_size = 5 * 1024 * 1024 + 100
   613      random_file = self._insert_random_file(self.client, file_name, file_size)
   614      f = self.gcs.open(file_name)
   615      self.assertEqual(f.mode, 'r')
   616      f.seek(0, os.SEEK_END)
   617      self.assertEqual(f.tell(), file_size)
   618      self.assertEqual(f.read(), b'')
   619      f.seek(0)
   620      self.assertEqual(f.read(), random_file.contents)
   621  
   622    def test_file_random_seek(self):
   623      file_name = 'gs://gcsio-test/seek_file'
   624      file_size = 5 * 1024 * 1024 - 100
   625      random_file = self._insert_random_file(self.client, file_name, file_size)
   626  
   627      f = self.gcs.open(file_name)
   628      random.seed(0)
   629      for _ in range(0, 10):
   630        a = random.randint(0, file_size - 1)
   631        b = random.randint(0, file_size - 1)
   632        start, end = min(a, b), max(a, b)
   633        f.seek(start)
   634        self.assertEqual(f.tell(), start)
   635        self.assertEqual(
   636            f.read(end - start + 1), random_file.contents[start:end + 1])
   637        self.assertEqual(f.tell(), end + 1)
   638  
   639    def test_file_iterator(self):
   640      file_name = 'gs://gcsio-test/iterating_file'
   641      lines = []
   642      line_count = 10
   643      for _ in range(line_count):
   644        line_length = random.randint(100, 500)
   645        line = os.urandom(line_length).replace(b'\n', b' ') + b'\n'
   646        lines.append(line)
   647  
   648      contents = b''.join(lines)
   649      bucket, name = gcsio.parse_gcs_path(file_name)
   650      self.client.objects.add_file(FakeFile(bucket, name, contents, 1))
   651  
   652      f = self.gcs.open(file_name)
   653  
   654      read_lines = 0
   655      for line in f:
   656        read_lines += 1
   657  
   658      self.assertEqual(read_lines, line_count)
   659  
  def test_file_read_line(self):
    """readline() is correct across buffer refills and at boundaries."""
    file_name = 'gs://gcsio-test/read_line_file'
    lines = []

    # Set a small buffer size to exercise refilling the buffer.
    # First line is carefully crafted so the newline falls as the last character
    # of the buffer to exercise this code path.
    read_buffer_size = 1024
    lines.append(b'x' * 1023 + b'\n')

    for _ in range(1, 1000):
      line_length = random.randint(100, 500)
      line = os.urandom(line_length).replace(b'\n', b' ') + b'\n'
      lines.append(line)
    contents = b''.join(lines)

    file_size = len(contents)
    bucket, name = gcsio.parse_gcs_path(file_name)
    self.client.objects.add_file(FakeFile(bucket, name, contents, 1))

    f = self.gcs.open(file_name, read_buffer_size=read_buffer_size)

    # Test read of first two lines.
    f.seek(0)
    self.assertEqual(f.readline(), lines[0])
    self.assertEqual(f.tell(), len(lines[0]))
    self.assertEqual(f.readline(), lines[1])

    # Test read at line boundary.
    f.seek(file_size - len(lines[-1]) - 1)
    self.assertEqual(f.readline(), b'\n')

    # Test read at end of file.
    f.seek(file_size)
    self.assertEqual(f.readline(), b'')

    # Test reads at random positions.
    random.seed(0)
    for _ in range(0, 10):
      start = random.randint(0, file_size - 1)
      line_index = 0
      # Find line corresponding to start index.
      chars_left = start
      while True:
        next_line_length = len(lines[line_index])
        if chars_left - next_line_length < 0:
          break
        chars_left -= next_line_length
        line_index += 1
      f.seek(start)
      # readline() from mid-line returns the remainder of that line.
      self.assertEqual(f.readline(), lines[line_index][chars_left:])
   711  
   712    def test_file_write(self):
   713      file_name = 'gs://gcsio-test/write_file'
   714      file_size = 5 * 1024 * 1024 + 2000
   715      contents = os.urandom(file_size)
   716      f = self.gcs.open(file_name, 'w')
   717      self.assertEqual(f.mode, 'w')
   718      f.write(contents[0:1000])
   719      f.write(contents[1000:1024 * 1024])
   720      f.write(contents[1024 * 1024:])
   721      f.close()
   722      bucket, name = gcsio.parse_gcs_path(file_name)
   723      self.assertEqual(
   724          self.client.objects.get_file(bucket, name).contents, contents)
   725  
   726    def test_file_close(self):
   727      file_name = 'gs://gcsio-test/close_file'
   728      file_size = 5 * 1024 * 1024 + 2000
   729      contents = os.urandom(file_size)
   730      f = self.gcs.open(file_name, 'w')
   731      self.assertEqual(f.mode, 'w')
   732      f.write(contents)
   733      f.close()
   734      f.close()  # This should not crash.
   735      bucket, name = gcsio.parse_gcs_path(file_name)
   736      self.assertEqual(
   737          self.client.objects.get_file(bucket, name).contents, contents)
   738  
   739    def test_file_flush(self):
   740      file_name = 'gs://gcsio-test/flush_file'
   741      file_size = 5 * 1024 * 1024 + 2000
   742      contents = os.urandom(file_size)
   743      bucket, name = gcsio.parse_gcs_path(file_name)
   744      f = self.gcs.open(file_name, 'w')
   745      self.assertEqual(f.mode, 'w')
   746      f.write(contents[0:1000])
   747      f.flush()
   748      f.write(contents[1000:1024 * 1024])
   749      f.flush()
   750      f.flush()  # Should be a NOOP.
   751      f.write(contents[1024 * 1024:])
   752      f.close()  # This should already call the equivalent of flush() in its body.
   753      self.assertEqual(
   754          self.client.objects.get_file(bucket, name).contents, contents)
   755  
   756    def test_context_manager(self):
   757      # Test writing with a context manager.
   758      file_name = 'gs://gcsio-test/context_manager_file'
   759      file_size = 1024
   760      contents = os.urandom(file_size)
   761      with self.gcs.open(file_name, 'w') as f:
   762        f.write(contents)
   763      bucket, name = gcsio.parse_gcs_path(file_name)
   764      self.assertEqual(
   765          self.client.objects.get_file(bucket, name).contents, contents)
   766  
   767      # Test reading with a context manager.
   768      with self.gcs.open(file_name) as f:
   769        self.assertEqual(f.read(), contents)
   770  
   771      # Test that exceptions are not swallowed by the context manager.
   772      with self.assertRaises(ZeroDivisionError):
   773        with self.gcs.open(file_name) as f:
   774          f.read(0 // 0)
   775  
   776    def test_list_prefix(self):
   777      bucket_name = 'gcsio-test'
   778      objects = [
   779          ('cow/cat/fish', 2),
   780          ('cow/cat/blubber', 3),
   781          ('cow/dog/blubber', 4),
   782      ]
   783      for (object_name, size) in objects:
   784        file_name = 'gs://%s/%s' % (bucket_name, object_name)
   785        self._insert_random_file(self.client, file_name, size)
   786      test_cases = [
   787          (
   788              'gs://gcsio-test/c',
   789              [
   790                  ('cow/cat/fish', 2),
   791                  ('cow/cat/blubber', 3),
   792                  ('cow/dog/blubber', 4),
   793              ]),
   794          (
   795              'gs://gcsio-test/cow/',
   796              [
   797                  ('cow/cat/fish', 2),
   798                  ('cow/cat/blubber', 3),
   799                  ('cow/dog/blubber', 4),
   800              ]),
   801          ('gs://gcsio-test/cow/cat/fish', [
   802              ('cow/cat/fish', 2),
   803          ]),
   804      ]
   805      for file_pattern, expected_object_names in test_cases:
   806        expected_file_names = [('gs://%s/%s' % (bucket_name, object_name), size)
   807                               for (object_name, size) in expected_object_names]
   808        self.assertEqual(
   809            set(self.gcs.list_prefix(file_pattern).items()),
   810            set(expected_file_names))
   811  
   812    def test_mime_binary_encoding(self):
   813      # This test verifies that the MIME email_generator library works properly
   814      # and does not corrupt '\r\n' during uploads (the patch to apitools in
   815      # Python 3 is applied in io/gcp/__init__.py).
   816      from apitools.base.py.transfer import email_generator
   817      generator_cls = email_generator.BytesGenerator
   818      output_buffer = io.BytesIO()
   819      generator = generator_cls(output_buffer)
   820      test_msg = 'a\nb\r\nc\n\r\n\n\nd'
   821      message = Message()
   822      message.set_payload(test_msg)
   823      generator._handle_text(message)
   824      self.assertEqual(test_msg.encode('ascii'), output_buffer.getvalue())
   825  
   826    def test_downloader_monitoring_info(self):
   827      # Clear the process wide metric container.
   828      MetricsEnvironment.process_wide_container().reset()
   829  
   830      file_name = 'gs://gcsio-metrics-test/dummy_mode_file'
   831      file_size = 5 * 1024 * 1024 + 100
   832      random_file = self._insert_random_file(self.client, file_name, file_size)
   833      self.gcs.open(file_name, 'r')
   834  
   835      resource = resource_identifiers.GoogleCloudStorageBucket(random_file.bucket)
   836      labels = {
   837          monitoring_infos.SERVICE_LABEL: 'Storage',
   838          monitoring_infos.METHOD_LABEL: 'Objects.get',
   839          monitoring_infos.RESOURCE_LABEL: resource,
   840          monitoring_infos.GCS_BUCKET_LABEL: random_file.bucket,
   841          monitoring_infos.GCS_PROJECT_ID_LABEL: str(DEFAULT_PROJECT_NUMBER),
   842          monitoring_infos.STATUS_LABEL: 'ok'
   843      }
   844  
   845      metric_name = MetricName(
   846          None, None, urn=monitoring_infos.API_REQUEST_COUNT_URN, labels=labels)
   847      metric_value = MetricsEnvironment.process_wide_container().get_counter(
   848          metric_name).get_cumulative()
   849  
   850      self.assertEqual(metric_value, 2)
   851  
   852    @mock.patch.object(FakeGcsBuckets, 'Get')
   853    def test_downloader_fail_to_get_project_number(self, mock_get):
   854      # Raising an error when listing GCS Bucket so that project number fails to
   855      # be retrieved.
   856      mock_get.side_effect = HttpError({'status': 403}, None, None)
   857      # Clear the process wide metric container.
   858      MetricsEnvironment.process_wide_container().reset()
   859  
   860      file_name = 'gs://gcsio-metrics-test/dummy_mode_file'
   861      file_size = 5 * 1024 * 1024 + 100
   862      random_file = self._insert_random_file(self.client, file_name, file_size)
   863      self.gcs.open(file_name, 'r')
   864  
   865      resource = resource_identifiers.GoogleCloudStorageBucket(random_file.bucket)
   866      labels = {
   867          monitoring_infos.SERVICE_LABEL: 'Storage',
   868          monitoring_infos.METHOD_LABEL: 'Objects.get',
   869          monitoring_infos.RESOURCE_LABEL: resource,
   870          monitoring_infos.GCS_BUCKET_LABEL: random_file.bucket,
   871          monitoring_infos.GCS_PROJECT_ID_LABEL: str(DEFAULT_PROJECT_NUMBER),
   872          monitoring_infos.STATUS_LABEL: 'ok'
   873      }
   874  
   875      metric_name = MetricName(
   876          None, None, urn=monitoring_infos.API_REQUEST_COUNT_URN, labels=labels)
   877      metric_value = MetricsEnvironment.process_wide_container().get_counter(
   878          metric_name).get_cumulative()
   879  
   880      self.assertEqual(metric_value, 0)
   881  
   882      labels_without_project_id = {
   883          monitoring_infos.SERVICE_LABEL: 'Storage',
   884          monitoring_infos.METHOD_LABEL: 'Objects.get',
   885          monitoring_infos.RESOURCE_LABEL: resource,
   886          monitoring_infos.GCS_BUCKET_LABEL: random_file.bucket,
   887          monitoring_infos.STATUS_LABEL: 'ok'
   888      }
   889      metric_name = MetricName(
   890          None,
   891          None,
   892          urn=monitoring_infos.API_REQUEST_COUNT_URN,
   893          labels=labels_without_project_id)
   894      metric_value = MetricsEnvironment.process_wide_container().get_counter(
   895          metric_name).get_cumulative()
   896  
   897      self.assertEqual(metric_value, 2)
   898  
   899    def test_downloader_fail_non_existent_object(self):
   900      file_name = 'gs://gcsio-metrics-test/dummy_mode_file'
   901      with self.assertRaises(IOError):
   902        self.gcs.open(file_name, 'r')
   903  
   904    def test_downloader_fail_when_getting_metadata(self):
   905      file_name = 'gs://gcsio-metrics-test/dummy_mode_file'
   906      file_size = 5 * 1024 * 1024 + 100
   907      self._insert_random_file(
   908          self.client, file_name, file_size, fail_when_getting_metadata=True)
   909      with self.assertRaises(HttpError):
   910        self.gcs.open(file_name, 'r')
   911  
   912    def test_downloader_fail_when_reading(self):
   913      file_name = 'gs://gcsio-metrics-test/dummy_mode_file'
   914      file_size = 5 * 1024 * 1024 + 100
   915      self._insert_random_file(
   916          self.client, file_name, file_size, fail_when_reading=True)
   917      with self.assertRaises(HttpError):
   918        self.gcs.open(file_name, 'r')
   919  
   920    def test_uploader_monitoring_info(self):
   921      # Clear the process wide metric container.
   922      MetricsEnvironment.process_wide_container().reset()
   923  
   924      file_name = 'gs://gcsio-metrics-test/dummy_mode_file'
   925      file_size = 5 * 1024 * 1024 + 100
   926      random_file = self._insert_random_file(self.client, file_name, file_size)
   927      f = self.gcs.open(file_name, 'w')
   928  
   929      resource = resource_identifiers.GoogleCloudStorageBucket(random_file.bucket)
   930      labels = {
   931          monitoring_infos.SERVICE_LABEL: 'Storage',
   932          monitoring_infos.METHOD_LABEL: 'Objects.insert',
   933          monitoring_infos.RESOURCE_LABEL: resource,
   934          monitoring_infos.GCS_BUCKET_LABEL: random_file.bucket,
   935          monitoring_infos.GCS_PROJECT_ID_LABEL: str(DEFAULT_PROJECT_NUMBER),
   936          monitoring_infos.STATUS_LABEL: 'ok'
   937      }
   938  
   939      f.close()
   940      metric_name = MetricName(
   941          None, None, urn=monitoring_infos.API_REQUEST_COUNT_URN, labels=labels)
   942      metric_value = MetricsEnvironment.process_wide_container().get_counter(
   943          metric_name).get_cumulative()
   944  
   945      self.assertEqual(metric_value, 1)
   946  
   947  
if __name__ == '__main__':
  # Raise the root logger level so test progress is visible when run directly.
  logging.getLogger().setLevel(logging.INFO)
  unittest.main()