github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/io/gcp/gcsio.py

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""Google Cloud Storage client.

This library evolved from the Google App Engine GCS client available at
https://github.com/GoogleCloudPlatform/appengine-gcs-client.

**Updates to the I/O connector code**

For any significant updates to this I/O connector, please consider involving
corresponding code reviewers mentioned in
https://github.com/apache/beam/blob/master/sdks/python/OWNERS
"""

# pytype: skip-file

import errno
import io
import logging
import multiprocessing
import re
import threading
import time
import traceback
from itertools import islice
from typing import Optional
from typing import Union

import apache_beam
from apache_beam.internal.http_client import get_new_http
from apache_beam.internal.metrics.metric import ServiceCallMetric
from apache_beam.io.filesystemio import Downloader
from apache_beam.io.filesystemio import DownloaderStream
from apache_beam.io.filesystemio import PipeStream
from apache_beam.io.filesystemio import Uploader
from apache_beam.io.filesystemio import UploaderStream
from apache_beam.io.gcp import resource_identifiers
from apache_beam.metrics import monitoring_infos
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.utils import retry
from apache_beam.utils.annotations import deprecated

__all__ = ['GcsIO']

_LOGGER = logging.getLogger(__name__)

# Issue a friendlier error message if the storage library is not available.
# TODO(silviuc): Remove this guard when storage is available everywhere.
try:
  # pylint: disable=wrong-import-order, wrong-import-position
  # pylint: disable=ungrouped-imports
  from apitools.base.py.batch import BatchApiRequest
  from apitools.base.py.exceptions import HttpError
  from apitools.base.py import transfer
  from apache_beam.internal.gcp import auth
  from apache_beam.io.gcp.internal.clients import storage
except ImportError:
  raise ImportError(
      'Google Cloud Storage I/O not supported for this execution environment '
      '(could not import storage API client).')

# This is the size of each partial-file read operation from GCS.  This
# parameter was chosen to give good throughput while keeping memory usage at
# a reasonable level; the following table shows throughput reached when
# reading files of a given size with a chosen buffer size and informed the
# choice of the value, as of 11/2016:
#
# +---------------+------------+-------------+-------------+-------------+
# |               | 50 MB file | 100 MB file | 200 MB file | 400 MB file |
# +---------------+------------+-------------+-------------+-------------+
# | 8 MB buffer   | 17.12 MB/s | 22.67 MB/s  | 23.81 MB/s  | 26.05 MB/s  |
# | 16 MB buffer  | 24.21 MB/s | 42.70 MB/s  | 42.89 MB/s  | 46.92 MB/s  |
# | 32 MB buffer  | 28.53 MB/s | 48.08 MB/s  | 54.30 MB/s  | 54.65 MB/s  |
# | 400 MB buffer | 34.72 MB/s | 71.13 MB/s  | 79.13 MB/s  | 85.39 MB/s  |
# +---------------+------------+-------------+-------------+-------------+
DEFAULT_READ_BUFFER_SIZE = 16 * 1024 * 1024

# This is the number of seconds the library will wait for a partial-file read
# operation from GCS to complete before retrying.
DEFAULT_READ_SEGMENT_TIMEOUT_SECONDS = 60

# This is the size of chunks used when writing to GCS.
WRITE_CHUNK_SIZE = 8 * 1024 * 1024

# Maximum number of operations permitted in GcsIO.copy_batch() and
# GcsIO.delete_batch().
MAX_BATCH_OPERATION_SIZE = 100

# Batch endpoint URL for GCS.
# We have to specify an API specific endpoint here since Google APIs global
# batch endpoints will be deprecated on 03/25/2019.
# See https://developers.googleblog.com/2018/03/discontinuing-support-for-json-rpc-and.html.  # pylint: disable=line-too-long
# Currently the apitools library uses a global batch endpoint by default:
# https://github.com/google/apitools/blob/master/apitools/base/py/batch.py#L152
# TODO: remove this constant and its usage after apitools moves to using an
# API specific batch endpoint or after the Beam gcsio module starts using a
# GCS client library that does not use global batch endpoints.
GCS_BATCH_ENDPOINT = 'https://www.googleapis.com/batch/storage/v1'


def parse_gcs_path(gcs_path, object_optional=False):
  """Return the bucket and object names of the given gs:// path."""
  match = re.match('^gs://([^/]+)/(.*)$', gcs_path)
  if match is None or (match.group(2) == '' and not object_optional):
    raise ValueError(
        'GCS path must be in the form gs://<bucket>/<object>. '
        f'Encountered {gcs_path!r}')
  return match.group(1), match.group(2)
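
# Illustrative example (editor's note, not part of the module; bucket name is
# made up): parse_gcs_path splits a gs:// URL into (bucket, object) and
# rejects paths without an object component unless object_optional=True:
#
#   parse_gcs_path('gs://my-bucket/logs/part-0.txt')
#   # -> ('my-bucket', 'logs/part-0.txt')
#   parse_gcs_path('gs://my-bucket/', object_optional=True)
#   # -> ('my-bucket', '')
#   parse_gcs_path('gs://my-bucket/')  # raises ValueError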


def default_gcs_bucket_name(project, region):
  from hashlib import md5
  return 'dataflow-staging-%s-%s' % (
      region, md5(project.encode('utf8')).hexdigest())


def get_or_create_default_gcs_bucket(options):
  """Gets or creates the default GCS bucket for this project."""
  if getattr(options, 'dataflow_kms_key', None):
    _LOGGER.warning(
        'Cannot create a default bucket when --dataflow_kms_key is set.')
    return None

  project = getattr(options, 'project', None)
  region = getattr(options, 'region', None)
  if not project or not region:
    return None

  bucket_name = default_gcs_bucket_name(project, region)
  bucket = GcsIO(pipeline_options=options).get_bucket(bucket_name)
  if bucket:
    return bucket
  else:
    _LOGGER.warning(
        'Creating default GCS bucket for project %s: gs://%s',
        project,
        bucket_name)
    return GcsIO(pipeline_options=options).create_bucket(
        bucket_name, project, location=region)


class GcsIOError(IOError, retry.PermanentException):
  """GCS IO error that should not be retried."""
  pass


class GcsIO(object):
  """Google Cloud Storage I/O client."""
  def __init__(self, storage_client=None, pipeline_options=None):
    # type: (Optional[storage.StorageV1], Optional[Union[dict, PipelineOptions]]) -> None
    if storage_client is None:
      if not pipeline_options:
        pipeline_options = PipelineOptions()
      elif isinstance(pipeline_options, dict):
        pipeline_options = PipelineOptions.from_dictionary(pipeline_options)
      storage_client = storage.StorageV1(
          credentials=auth.get_service_credentials(pipeline_options),
          get_credentials=False,
          http=get_new_http(),
          response_encoding='utf8',
          additional_http_headers={
              "User-Agent": "apache-beam-%s" % apache_beam.__version__
          })
    self.client = storage_client
    self._rewrite_cb = None
    self.bucket_to_project_number = {}
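
  # Illustrative example (editor's note, not part of the module; project name
  # is made up): when no storage_client is injected, GcsIO builds its own
  # apitools client from pipeline options, so either form below works:
  #
  #   gcs = GcsIO(pipeline_options={'project': 'my-project'})
  #   gcs = GcsIO(pipeline_options=PipelineOptions(['--project', 'my-project']))
  #
  # Passing storage_client explicitly (e.g. a mock) is mainly useful in tests.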

  def get_project_number(self, bucket):
    if bucket not in self.bucket_to_project_number:
      bucket_metadata = self.get_bucket(bucket_name=bucket)
      if bucket_metadata:
        self.bucket_to_project_number[bucket] = bucket_metadata.projectNumber
      # else failed to load the bucket metadata due to HttpError

    return self.bucket_to_project_number.get(bucket, None)

  def _set_rewrite_response_callback(self, callback):
    """For testing purposes only. No backward compatibility guarantees.

    Args:
      callback: A function that receives ``storage.RewriteResponse``.
    """
    self._rewrite_cb = callback

  def get_bucket(self, bucket_name):
    """Returns the bucket object for the given name, or None if it does not
    exist."""
    try:
      request = storage.StorageBucketsGetRequest(bucket=bucket_name)
      return self.client.buckets.Get(request)
    except HttpError:
      return None

  def create_bucket(self, bucket_name, project, kms_key=None, location=None):
    """Create and return a GCS bucket in a specific project."""
    encryption = None
    if kms_key:
      encryption = storage.Bucket.EncryptionValue(kms_key)

    request = storage.StorageBucketsInsertRequest(
        bucket=storage.Bucket(
            name=bucket_name, location=location, encryption=encryption),
        project=project,
    )
    try:
      return self.client.buckets.Insert(request)
    except HttpError:
      return None

  def open(
      self,
      filename,
      mode='r',
      read_buffer_size=DEFAULT_READ_BUFFER_SIZE,
      mime_type='application/octet-stream'):
    """Open a GCS file path for reading or writing.

    Args:
      filename (str): GCS file path in the form ``gs://<bucket>/<object>``.
      mode (str): ``'r'`` for reading or ``'w'`` for writing.
      read_buffer_size (int): Buffer size to use during read operations.
      mime_type (str): Mime type to set for write operations.

    Returns:
      GCS file object.

    Raises:
      ValueError: Invalid open file mode.
    """
    if mode == 'r' or mode == 'rb':
      downloader = GcsDownloader(
          self.client,
          filename,
          buffer_size=read_buffer_size,
          get_project_number=self.get_project_number)
      return io.BufferedReader(
          DownloaderStream(
              downloader, read_buffer_size=read_buffer_size, mode=mode),
          buffer_size=read_buffer_size)
    elif mode == 'w' or mode == 'wb':
      uploader = GcsUploader(
          self.client,
          filename,
          mime_type,
          get_project_number=self.get_project_number)
      return io.BufferedWriter(
          UploaderStream(uploader, mode=mode), buffer_size=128 * 1024)
    else:
      raise ValueError('Invalid file open mode: %s.' % mode)
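
  # Illustrative example (editor's note, not part of the module; paths are
  # made up): open() returns a buffered, file-like object, so reads and
  # writes look like ordinary Python binary file I/O:
  #
  #   gcs = GcsIO()
  #   with gcs.open('gs://my-bucket/out.bin', 'w') as f:
  #     f.write(b'payload')
  #   with gcs.open('gs://my-bucket/out.bin', 'r') as f:
  #     data = f.read()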

  @retry.with_exponential_backoff(
      retry_filter=retry.retry_on_server_errors_and_timeout_filter)
  def delete(self, path):
    """Deletes the object at the given GCS path.

    Args:
      path: GCS file path pattern in the form gs://<bucket>/<name>.
    """
    bucket, object_path = parse_gcs_path(path)
    request = storage.StorageObjectsDeleteRequest(
        bucket=bucket, object=object_path)
    try:
      self.client.objects.Delete(request)
    except HttpError as http_error:
      if http_error.status_code == 404:
        # Return success when the file doesn't exist anymore for idempotency.
        return
      raise

  # We intentionally do not decorate this method with a retry, as retrying is
  # handled in BatchApiRequest.Execute().
  def delete_batch(self, paths):
    """Deletes the objects at the given GCS paths.

    Args:
      paths: List of GCS file path patterns in the form gs://<bucket>/<name>,
        not to exceed MAX_BATCH_OPERATION_SIZE in length.

    Returns: List of tuples of (path, exception) in the same order as the paths
             argument, where exception is None if the operation succeeded or
             the relevant exception if the operation failed.
    """
    if not paths:
      return []

    paths = iter(paths)
    result_statuses = []
    while True:
      paths_chunk = list(islice(paths, MAX_BATCH_OPERATION_SIZE))
      if not paths_chunk:
        return result_statuses
      batch_request = BatchApiRequest(
          batch_url=GCS_BATCH_ENDPOINT,
          retryable_codes=retry.SERVER_ERROR_OR_TIMEOUT_CODES,
          response_encoding='utf-8')
      for path in paths_chunk:
        bucket, object_path = parse_gcs_path(path)
        request = storage.StorageObjectsDeleteRequest(
            bucket=bucket, object=object_path)
        batch_request.Add(self.client.objects, 'Delete', request)
      api_calls = batch_request.Execute(self.client._http)  # pylint: disable=protected-access
      for i, api_call in enumerate(api_calls):
        path = paths_chunk[i]
        exception = None
        if api_call.is_error:
          exception = api_call.exception
          # Return success when the file doesn't exist anymore for idempotency.
          if isinstance(exception, HttpError) and exception.status_code == 404:
            exception = None
        result_statuses.append((path, exception))
    return result_statuses

  @retry.with_exponential_backoff(
      retry_filter=retry.retry_on_server_errors_and_timeout_filter)
  def copy(
      self,
      src,
      dest,
      dest_kms_key_name=None,
      max_bytes_rewritten_per_call=None):
    """Copies the given GCS object from src to dest.

    Args:
      src: GCS file path pattern in the form gs://<bucket>/<name>.
      dest: GCS file path pattern in the form gs://<bucket>/<name>.
      dest_kms_key_name: Experimental. No backwards compatibility guarantees.
        Encrypt dest with this Cloud KMS key. If None, will use dest bucket
        encryption defaults.
      max_bytes_rewritten_per_call: Experimental. No backwards compatibility
        guarantees. Each rewrite API call will return after this many bytes.
        Used for testing.

    Raises:
      TimeoutError: on timeout.
    """
    src_bucket, src_path = parse_gcs_path(src)
    dest_bucket, dest_path = parse_gcs_path(dest)
    request = storage.StorageObjectsRewriteRequest(
        sourceBucket=src_bucket,
        sourceObject=src_path,
        destinationBucket=dest_bucket,
        destinationObject=dest_path,
        destinationKmsKeyName=dest_kms_key_name,
        maxBytesRewrittenPerCall=max_bytes_rewritten_per_call)
    response = self.client.objects.Rewrite(request)
    while not response.done:
      _LOGGER.debug(
          'Rewrite progress: %d of %d bytes, %s to %s',
          response.totalBytesRewritten,
          response.objectSize,
          src,
          dest)
      request.rewriteToken = response.rewriteToken
      response = self.client.objects.Rewrite(request)
      if self._rewrite_cb is not None:
        self._rewrite_cb(response)

    _LOGGER.debug('Rewrite done: %s to %s', src, dest)
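
  # Illustrative example (editor's note, not part of the module; paths are
  # made up): delete_batch chunks its input into groups of
  # MAX_BATCH_OPERATION_SIZE and reports a per-path outcome, with missing
  # objects treated as successful deletes:
  #
  #   results = GcsIO().delete_batch(
  #       ['gs://my-bucket/tmp/a', 'gs://my-bucket/tmp/b'])
  #   failed = [path for path, exc in results if exc is not None]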

  # We intentionally do not decorate this method with a retry, as retrying is
  # handled in BatchApiRequest.Execute().
  def copy_batch(
      self,
      src_dest_pairs,
      dest_kms_key_name=None,
      max_bytes_rewritten_per_call=None):
    """Copies the given GCS objects from src to dest.

    Args:
      src_dest_pairs: list of (src, dest) tuples of gs://<bucket>/<name> file
        paths to copy from src to dest, not to exceed
        MAX_BATCH_OPERATION_SIZE in length.
      dest_kms_key_name: Experimental. No backwards compatibility guarantees.
        Encrypt dest with this Cloud KMS key. If None, will use dest bucket
        encryption defaults.
      max_bytes_rewritten_per_call: Experimental. No backwards compatibility
        guarantees. Each rewrite call will return after this many bytes. Used
        primarily for testing.

    Returns: List of tuples of (src, dest, exception) in the same order as the
             src_dest_pairs argument, where exception is None if the operation
             succeeded or the relevant exception if the operation failed.
    """
    if not src_dest_pairs:
      return []
    pair_to_request = {}
    for pair in src_dest_pairs:
      src_bucket, src_path = parse_gcs_path(pair[0])
      dest_bucket, dest_path = parse_gcs_path(pair[1])
      request = storage.StorageObjectsRewriteRequest(
          sourceBucket=src_bucket,
          sourceObject=src_path,
          destinationBucket=dest_bucket,
          destinationObject=dest_path,
          destinationKmsKeyName=dest_kms_key_name,
          maxBytesRewrittenPerCall=max_bytes_rewritten_per_call)
      pair_to_request[pair] = request
    pair_to_status = {}
    while True:
      pairs_in_batch = list(set(src_dest_pairs) - set(pair_to_status))
      if not pairs_in_batch:
        break
      batch_request = BatchApiRequest(
          batch_url=GCS_BATCH_ENDPOINT,
          retryable_codes=retry.SERVER_ERROR_OR_TIMEOUT_CODES,
          response_encoding='utf-8')
      for pair in pairs_in_batch:
        batch_request.Add(self.client.objects, 'Rewrite', pair_to_request[pair])
      api_calls = batch_request.Execute(self.client._http)  # pylint: disable=protected-access
      for pair, api_call in zip(pairs_in_batch, api_calls):
        src, dest = pair
        response = api_call.response
        if self._rewrite_cb is not None:
          self._rewrite_cb(response)
        if api_call.is_error:
          exception = api_call.exception
          # Translate 404 to the appropriate not found exception.
          if isinstance(exception, HttpError) and exception.status_code == 404:
            exception = (
                GcsIOError(errno.ENOENT, 'Source file not found: %s' % src))
          pair_to_status[pair] = exception
        elif not response.done:
          _LOGGER.debug(
              'Rewrite progress: %d of %d bytes, %s to %s',
              response.totalBytesRewritten,
              response.objectSize,
              src,
              dest)
          pair_to_request[pair].rewriteToken = response.rewriteToken
        else:
          _LOGGER.debug('Rewrite done: %s to %s', src, dest)
          pair_to_status[pair] = None

    return [(pair[0], pair[1], pair_to_status[pair]) for pair in src_dest_pairs]

  # We intentionally do not decorate this method with a retry, since the
  # underlying copy and delete operations are already idempotent operations
  # protected by retry decorators.
  def copytree(self, src, dest):
    """Copies the given GCS "directory" recursively from src to dest.

    Args:
      src: GCS file path pattern in the form gs://<bucket>/<name>/.
      dest: GCS file path pattern in the form gs://<bucket>/<name>/.
    """
    assert src.endswith('/')
    assert dest.endswith('/')
    for entry in self.list_prefix(src):
      rel_path = entry[len(src):]
      self.copy(entry, dest + rel_path)

  # We intentionally do not decorate this method with a retry, since the
  # underlying copy and delete operations are already idempotent operations
  # protected by retry decorators.
  def rename(self, src, dest):
    """Renames the given GCS object from src to dest.

    Args:
      src: GCS file path pattern in the form gs://<bucket>/<name>.
      dest: GCS file path pattern in the form gs://<bucket>/<name>.
    """
    self.copy(src, dest)
    self.delete(src)
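
  # Illustrative example (editor's note, not part of the module; paths are
  # made up): copy_batch sends the rewrites through a single batch endpoint
  # and keeps re-batching pairs whose rewrite is not yet done, so callers only
  # need to inspect the per-pair exception slot:
  #
  #   results = GcsIO().copy_batch(
  #       [('gs://my-bucket/src/a', 'gs://my-bucket/dst/a'),
  #        ('gs://my-bucket/src/b', 'gs://my-bucket/dst/b')])
  #   for src, dest, exc in results:
  #     if exc is not None:
  #       ...  # e.g. a GcsIOError(ENOENT) when the source object is missing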

  def exists(self, path):
    """Returns whether the given GCS object exists.

    Args:
      path: GCS file path pattern in the form gs://<bucket>/<name>.
    """
    try:
      self._gcs_object(path)  # gcs object
      return True
    except HttpError as http_error:
      if http_error.status_code == 404:
        # HTTP 404 indicates that the file did not exist
        return False
      else:
        # We re-raise all other exceptions
        raise

  def checksum(self, path):
    """Looks up the checksum of a GCS object.

    Args:
      path: GCS file path pattern in the form gs://<bucket>/<name>.
    """
    return self._gcs_object(path).crc32c

  def size(self, path):
    """Returns the size of a single GCS object.

    This method does not perform glob expansion. Hence the given path must be
    for a single GCS object.

    Returns: size of the GCS object in bytes.
    """
    return self._gcs_object(path).size

  def kms_key(self, path):
    """Returns the KMS key of a single GCS object.

    This method does not perform glob expansion. Hence the given path must be
    for a single GCS object.

    Returns: KMS key name of the GCS object as a string, or None if it doesn't
      have one.
    """
    return self._gcs_object(path).kmsKeyName

  def last_updated(self, path):
    """Returns the last updated epoch time of a single GCS object.

    This method does not perform glob expansion. Hence the given path must be
    for a single GCS object.

    Returns: last updated time of the GCS object in seconds.
    """
    return self._updated_to_seconds(self._gcs_object(path).updated)

  def _status(self, path):
    """For internal use only; no backwards-compatibility guarantees.

    Returns supported fields (checksum, kms_key, last_updated, size) of a
    single object as a dict at once.

    This method does not perform glob expansion. Hence the given path must be
    for a single GCS object.

    Returns: dict of fields of the GCS object.
    """
    gcs_object = self._gcs_object(path)
    file_status = {}
    if hasattr(gcs_object, 'crc32c'):
      file_status['checksum'] = gcs_object.crc32c
    if hasattr(gcs_object, 'kmsKeyName'):
      file_status['kms_key'] = gcs_object.kmsKeyName
    if hasattr(gcs_object, 'updated'):
      file_status['last_updated'] = self._updated_to_seconds(gcs_object.updated)
    if hasattr(gcs_object, 'size'):
      file_status['size'] = gcs_object.size
    return file_status

  @retry.with_exponential_backoff(
      retry_filter=retry.retry_on_server_errors_and_timeout_filter)
  def _gcs_object(self, path):
    """Returns a GCS object for the given path.

    This method does not perform glob expansion. Hence the given path must be
    for a single GCS object.

    Returns: GCS object.
    """
    bucket, object_path = parse_gcs_path(path)
    request = storage.StorageObjectsGetRequest(
        bucket=bucket, object=object_path)
    return self.client.objects.Get(request)
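
  # Illustrative example (editor's note, not part of the module; path is made
  # up): each metadata helper above resolves the object with a retried
  # objects.Get call, so a typical existence-then-metadata check looks like:
  #
  #   gcs = GcsIO()
  #   if gcs.exists('gs://my-bucket/data.csv'):
  #     num_bytes = gcs.size('gs://my-bucket/data.csv')
  #     crc32c = gcs.checksum('gs://my-bucket/data.csv')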

  @deprecated(since='2.45.0', current='list_files')
  def list_prefix(self, path, with_metadata=False):
    """Lists files matching the prefix.

    ``list_prefix`` has been deprecated. Use `list_files` instead, which
    returns a generator of file information instead of a dict.

    Args:
      path: GCS file path pattern in the form gs://<bucket>/[name].
      with_metadata: Experimental. Specify whether to return file metadata.

    Returns:
      If ``with_metadata`` is False: dict of file name -> size; if
      ``with_metadata`` is True: dict of file name -> tuple(size, timestamp).
    """
    file_info = {}
    for file_metadata in self.list_files(path, with_metadata):
      file_info[file_metadata[0]] = file_metadata[1]

    return file_info

  def list_files(self, path, with_metadata=False):
    """Lists files matching the prefix.

    Args:
      path: GCS file path pattern in the form gs://<bucket>/[name].
      with_metadata: Experimental. Specify whether to return file metadata.

    Returns:
      If ``with_metadata`` is False: generator of tuple(file name, size); if
      ``with_metadata`` is True: generator of
      tuple(file name, tuple(size, timestamp)).
    """
    bucket, prefix = parse_gcs_path(path, object_optional=True)
    request = storage.StorageObjectsListRequest(bucket=bucket, prefix=prefix)
    file_info = set()
    counter = 0
    start_time = time.time()
    if with_metadata:
      _LOGGER.debug("Starting the file information of the input")
    else:
      _LOGGER.debug("Starting the size estimation of the input")
    while True:
      response = retry.with_exponential_backoff(
          retry_filter=retry.retry_on_server_errors_and_timeout_filter)(
              self.client.objects.List)(
                  request)

      for item in response.items:
        file_name = 'gs://%s/%s' % (item.bucket, item.name)
        if file_name not in file_info:
          file_info.add(file_name)
          counter += 1
          if counter % 10000 == 0:
            if with_metadata:
              _LOGGER.info(
                  "Finished computing file information of: %s files",
                  len(file_info))
            else:
              _LOGGER.info(
                  "Finished computing size of: %s files", len(file_info))

          if with_metadata:
            yield file_name, (item.size, self._updated_to_seconds(item.updated))
          else:
            yield file_name, item.size

      if response.nextPageToken:
        request.pageToken = response.nextPageToken
      else:
        break
    _LOGGER.log(
        # do not spam logs when list_prefix is likely used to check empty folder
        logging.INFO if counter > 0 else logging.DEBUG,
        "Finished listing %s files in %s seconds.",
        counter,
        time.time() - start_time)

  @staticmethod
  def _updated_to_seconds(updated):
    """Helper function to transform the updated field of a response to
    seconds."""
    return (
        time.mktime(updated.timetuple()) - time.timezone +
        updated.microsecond / 1000000.0)
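
# Illustrative example (editor's note, not part of the module; prefix is made
# up): list_files streams (name, size) pairs page by page, so large prefixes
# can be consumed lazily without building the dict that list_prefix returns:
#
#   for file_name, size in GcsIO().list_files('gs://my-bucket/logs/'):
#     ...
#   # with_metadata=True yields (name, (size, last_updated_seconds)) instead.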


class GcsDownloader(Downloader):
  def __init__(self, client, path, buffer_size, get_project_number):
    self._client = client
    self._path = path
    self._bucket, self._name = parse_gcs_path(path)
    self._buffer_size = buffer_size
    self._get_project_number = get_project_number

    # Create a request count metric
    resource = resource_identifiers.GoogleCloudStorageBucket(self._bucket)
    labels = {
        monitoring_infos.SERVICE_LABEL: 'Storage',
        monitoring_infos.METHOD_LABEL: 'Objects.get',
        monitoring_infos.RESOURCE_LABEL: resource,
        monitoring_infos.GCS_BUCKET_LABEL: self._bucket
    }
    project_number = self._get_project_number(self._bucket)
    if project_number:
      labels[monitoring_infos.GCS_PROJECT_ID_LABEL] = str(project_number)
    else:
      _LOGGER.debug(
          'Possibly missing storage.buckets.get permission to '
          'bucket %s. Label %s is not added to the counter because it '
          'cannot be identified.',
          self._bucket,
          monitoring_infos.GCS_PROJECT_ID_LABEL)

    service_call_metric = ServiceCallMetric(
        request_count_urn=monitoring_infos.API_REQUEST_COUNT_URN,
        base_labels=labels)

    # Get object state.
    self._get_request = (
        storage.StorageObjectsGetRequest(
            bucket=self._bucket, object=self._name))
    try:
      metadata = self._get_object_metadata(self._get_request)
    except HttpError as http_error:
      service_call_metric.call(http_error)
      if http_error.status_code == 404:
        raise IOError(errno.ENOENT, 'Not found: %s' % self._path)
      else:
        _LOGGER.error(
            'HTTP error while requesting file %s: %s', self._path, http_error)
        raise
    else:
      service_call_metric.call('ok')

    self._size = metadata.size

    # Ensure read is from file of the correct generation.
    self._get_request.generation = metadata.generation

    # Initialize read buffer state.
    self._download_stream = io.BytesIO()
    self._downloader = transfer.Download(
        self._download_stream,
        auto_transfer=False,
        chunksize=self._buffer_size,
        num_retries=20)

    try:
      self._client.objects.Get(self._get_request, download=self._downloader)
      service_call_metric.call('ok')
    except HttpError as e:
      service_call_metric.call(e)
      raise

  @retry.with_exponential_backoff(
      retry_filter=retry.retry_on_server_errors_and_timeout_filter)
  def _get_object_metadata(self, get_request):
    return self._client.objects.Get(get_request)

  @property
  def size(self):
    return self._size

  def get_range(self, start, end):
    self._download_stream.seek(0)
    self._download_stream.truncate(0)
    self._downloader.GetRange(start, end - 1)
    return self._download_stream.getvalue()
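
# Illustrative note (editor's note, not part of the module): GcsIO.open wraps
# GcsDownloader in a DownloaderStream and an io.BufferedReader, so each
# buffered refill becomes one get_range call that re-fills the shared BytesIO
# buffer, roughly:
#
#   reader = io.BufferedReader(
#       DownloaderStream(
#           downloader, read_buffer_size=DEFAULT_READ_BUFFER_SIZE, mode='rb'),
#       buffer_size=DEFAULT_READ_BUFFER_SIZE)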


class GcsUploader(Uploader):
  def __init__(self, client, path, mime_type, get_project_number):
    self._client = client
    self._path = path
    self._bucket, self._name = parse_gcs_path(path)
    self._mime_type = mime_type
    self._get_project_number = get_project_number

    # Set up communication with child thread.
    parent_conn, child_conn = multiprocessing.Pipe()
    self._child_conn = child_conn
    self._conn = parent_conn

    # Set up uploader.
    self._insert_request = (
        storage.StorageObjectsInsertRequest(
            bucket=self._bucket, name=self._name))
    self._upload = transfer.Upload(
        PipeStream(self._child_conn),
        self._mime_type,
        chunksize=WRITE_CHUNK_SIZE)
    self._upload.strategy = transfer.RESUMABLE_UPLOAD

    # Start uploading thread.
    self._upload_thread = threading.Thread(target=self._start_upload)
    self._upload_thread.daemon = True
    self._upload_thread.last_error = None
    self._upload_thread.start()

  # TODO(silviuc): Refactor so that retry logic can be applied.
  # There is retry logic in the underlying transfer library but we should make
  # it more explicit so we can control the retry parameters.
  @retry.no_retries  # Using no_retries marks this as an integration point.
  def _start_upload(self):
    # This starts the uploader thread. We are forced to run the uploader in
    # another thread because the apitools uploader insists on taking a stream
    # as input. Happily, this also means we get asynchronous I/O to GCS.
    #
    # The uploader by default transfers data in chunks of 1024 * 1024 bytes at
    # a time, buffering writes until that size is reached.

    project_number = self._get_project_number(self._bucket)

    # Create a request count metric
    resource = resource_identifiers.GoogleCloudStorageBucket(self._bucket)
    labels = {
        monitoring_infos.SERVICE_LABEL: 'Storage',
        monitoring_infos.METHOD_LABEL: 'Objects.insert',
        monitoring_infos.RESOURCE_LABEL: resource,
        monitoring_infos.GCS_BUCKET_LABEL: self._bucket,
        monitoring_infos.GCS_PROJECT_ID_LABEL: str(project_number)
    }
    service_call_metric = ServiceCallMetric(
        request_count_urn=monitoring_infos.API_REQUEST_COUNT_URN,
        base_labels=labels)
    try:
      self._client.objects.Insert(self._insert_request, upload=self._upload)
      service_call_metric.call('ok')
    except Exception as e:  # pylint: disable=broad-except
      service_call_metric.call(e)
      _LOGGER.error(
          'Error in _start_upload while inserting file %s: %s',
          self._path,
          traceback.format_exc())
      self._upload_thread.last_error = e
    finally:
      self._child_conn.close()

  def put(self, data):
    try:
      self._conn.send_bytes(data.tobytes())
    except EOFError:
      if self._upload_thread.last_error is not None:
        raise self._upload_thread.last_error  # pylint: disable=raising-bad-type
      raise

  def finish(self):
    self._conn.close()
    # TODO(udim): Add timeout=DEFAULT_HTTP_TIMEOUT_SECONDS * 2 and raise if
    # isAlive is True.
    self._upload_thread.join()
    # Check for exception since the last put() call.
    if self._upload_thread.last_error is not None:
      raise type(self._upload_thread.last_error)(
          "Error while uploading file %s: %s",
          self._path,
          self._upload_thread.last_error.message)  # pylint: disable=raising-bad-type
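
# Illustrative note (editor's note, not part of the module): on the write
# path, data flows
#
#   io.BufferedWriter -> UploaderStream -> GcsUploader.put()
#     -> multiprocessing.Pipe -> PipeStream -> apitools resumable Upload
#
# with the apitools Insert call running on the daemon upload thread started
# in GcsUploader.__init__; finish() closes the pipe, joins the thread, and
# re-raises any error recorded by _start_upload.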