github.com/shashidharatd/test-infra@v0.0.0-20171006011030-71304e1ca560/gubernator/third_party/cloudstorage/cloudstorage_api.py

# Copyright 2012 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
# either express or implied. See the License for the specific
# language governing permissions and limitations under the License.

"""File Interface for Google Cloud Storage."""


from __future__ import with_statement


__all__ = ['copy2',
           'delete',
           'listbucket',
           'open',
           'stat',
           'compose',
          ]

import logging
import StringIO
import urllib
import os
import itertools
import types
import xml.etree.cElementTree as ET
from . import api_utils
from . import common
from . import errors
from . import storage_api


def open(filename,
         mode='r',
         content_type=None,
         options=None,
         read_buffer_size=storage_api.ReadBuffer.DEFAULT_BUFFER_SIZE,
         retry_params=None,
         _account_id=None,
         offset=0):
  """Opens a Google Cloud Storage file and returns it as a File-like object.

  Args:
    filename: A Google Cloud Storage filename of form '/bucket/filename'.
    mode: 'r' for reading mode. 'w' for writing mode.
      In reading mode, the file must exist. In writing mode, a file will
      be created or overwritten.
    content_type: The MIME type of the file. str. Only valid in writing mode.
    options: A str->basestring dict to specify additional headers to pass to
      GCS e.g. {'x-goog-acl': 'private', 'x-goog-meta-foo': 'foo'}.
      Supported options are x-goog-acl, x-goog-meta-, cache-control,
      content-disposition, and content-encoding.
      Only valid in writing mode.
      See https://developers.google.com/storage/docs/reference-headers
      for details.
    read_buffer_size: The buffer size for read. Read keeps a buffer
      and prefetches another one. To minimize blocking for large files,
      always read by buffer size. To minimize the number of RPC requests for
      small files, set a large buffer size. Max is 30MB.
    retry_params: An instance of api_utils.RetryParams for subsequent calls
      to GCS from this file handle. If None, the default one is used.
    _account_id: Internal-use only.
    offset: Number of bytes to skip at the start of the file. If None, 0 is
      used.

  Returns:
    A reading or writing buffer that supports a File-like interface. The
    buffer must be closed after operations are done.

  Raises:
    errors.AuthorizationError: if authorization failed.
    errors.NotFoundError: if an object that's expected to exist doesn't.
    ValueError: invalid open mode or if content_type or options are specified
      in reading mode.
  """
  common.validate_file_path(filename)
  api = storage_api._get_storage_api(retry_params=retry_params,
                                     account_id=_account_id)
  filename = api_utils._quote_filename(filename)

  if mode == 'w':
    common.validate_options(options)
    return storage_api.StreamingBuffer(api, filename, content_type, options)
  elif mode == 'r':
    if content_type or options:
      raise ValueError('Options and content_type can only be specified '
                       'for writing mode.')
    return storage_api.ReadBuffer(api,
                                  filename,
                                  buffer_size=read_buffer_size,
                                  offset=offset)
  else:
    raise ValueError('Invalid mode %s.' % mode)
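
# Usage sketch for open() (illustrative only; the bucket and object names
# below are hypothetical). Both buffers support the context-manager protocol,
# so `with` closes them for you:
#
#   with open('/my_bucket/my_object', 'w',
#             content_type='text/plain',
#             options={'x-goog-acl': 'private'}) as gcs_file:
#     gcs_file.write('hello world')
#
#   with open('/my_bucket/my_object', 'r') as gcs_file:
#     contents = gcs_file.read()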


def delete(filename, retry_params=None, _account_id=None):
  """Delete a Google Cloud Storage file.

  Args:
    filename: A Google Cloud Storage filename of form '/bucket/filename'.
    retry_params: An api_utils.RetryParams for this call to GCS. If None,
      the default one is used.
    _account_id: Internal-use only.

  Raises:
    errors.NotFoundError: if the file doesn't exist prior to deletion.
  """
  api = storage_api._get_storage_api(retry_params=retry_params,
                                     account_id=_account_id)
  common.validate_file_path(filename)
  filename = api_utils._quote_filename(filename)
  status, resp_headers, content = api.delete_object(filename)
  errors.check_status(status, [204], filename, resp_headers=resp_headers,
                      body=content)


def stat(filename, retry_params=None, _account_id=None):
  """Get GCSFileStat of a Google Cloud Storage file.

  Args:
    filename: A Google Cloud Storage filename of form '/bucket/filename'.
    retry_params: An api_utils.RetryParams for this call to GCS. If None,
      the default one is used.
    _account_id: Internal-use only.

  Returns:
    a GCSFileStat object containing info about this file.

  Raises:
    errors.AuthorizationError: if authorization failed.
    errors.NotFoundError: if an object that's expected to exist doesn't.
  """
  common.validate_file_path(filename)
  api = storage_api._get_storage_api(retry_params=retry_params,
                                     account_id=_account_id)
  status, headers, content = api.head_object(
      api_utils._quote_filename(filename))
  errors.check_status(status, [200], filename, resp_headers=headers,
                      body=content)
  file_stat = common.GCSFileStat(
      filename=filename,
      st_size=common.get_stored_content_length(headers),
      st_ctime=common.http_time_to_posix(headers.get('last-modified')),
      etag=headers.get('etag'),
      content_type=headers.get('content-type'),
      metadata=common.get_metadata(headers))

  return file_stat
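
# Illustrative sketch (hypothetical object name): inspect a file's metadata,
# then remove the file.
#
#   file_stat = stat('/my_bucket/my_object')
#   logging.info('size=%s etag=%s content_type=%s',
#                file_stat.st_size, file_stat.etag, file_stat.content_type)
#   delete('/my_bucket/my_object')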


def copy2(src, dst, metadata=None, retry_params=None):
  """Copy the file content from src to dst.

  Args:
    src: /bucket/filename
    dst: /bucket/filename
    metadata: a dict of metadata for this copy. If None, old metadata is
      copied. For example, {'x-goog-meta-foo': 'bar'}.
    retry_params: An api_utils.RetryParams for this call to GCS. If None,
      the default one is used.

  Raises:
    errors.AuthorizationError: if authorization failed.
    errors.NotFoundError: if an object that's expected to exist doesn't.
  """
  common.validate_file_path(src)
  common.validate_file_path(dst)

  if metadata is None:
    metadata = {}
    copy_meta = 'COPY'
  else:
    copy_meta = 'REPLACE'
  metadata.update({'x-goog-copy-source': src,
                   'x-goog-metadata-directive': copy_meta})

  api = storage_api._get_storage_api(retry_params=retry_params)
  status, resp_headers, content = api.put_object(
      api_utils._quote_filename(dst), headers=metadata)
  errors.check_status(status, [200], src, metadata, resp_headers, body=content)
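
# Illustrative sketch (hypothetical names): copy an object, replacing its
# custom metadata on the destination.
#
#   copy2('/my_bucket/src_object', '/my_bucket/dst_object',
#         metadata={'x-goog-meta-foo': 'bar'})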


def listbucket(path_prefix, marker=None, prefix=None, max_keys=None,
               delimiter=None, retry_params=None, _account_id=None):
  """Returns a GCSFileStat iterator over a bucket.

  Optional arguments can limit the result to a subset of files under bucket.

  This function has two modes:
  1. List bucket mode: Lists all files in the bucket without any concept of
     hierarchy. GCS doesn't have real directory hierarchies.
  2. Directory emulation mode: If you specify the 'delimiter' argument,
     it is used as a path separator to emulate a hierarchy of directories.
     In this mode, the "path_prefix" argument should end in the delimiter
     specified (thus designates a logical directory). The logical directory's
     contents, both files and subdirectories, are listed. The names of
     subdirectories returned will end with the delimiter. So listbucket
     can be called with the subdirectory name to list the subdirectory's
     contents.

  Args:
    path_prefix: A Google Cloud Storage path of format "/bucket" or
      "/bucket/prefix". Only objects whose fullpath starts with the
      path_prefix will be returned.
    marker: Another path prefix. Only objects whose fullpath starts
      lexicographically after marker will be returned (exclusive).
    prefix: Deprecated. Use path_prefix.
    max_keys: The limit on the number of objects to return. int.
      For best performance, specify max_keys only if you know how many objects
      you want. Otherwise, this method requests large batches and handles
      pagination for you.
    delimiter: Use to turn on directory mode. str of one or multiple chars
      that your bucket uses as its directory separator.
    retry_params: An api_utils.RetryParams for this call to GCS. If None,
      the default one is used.
    _account_id: Internal-use only.

  Examples:
    For files "/bucket/a",
              "/bucket/bar/1",
              "/bucket/foo",
              "/bucket/foo/1", "/bucket/foo/2/1", "/bucket/foo/3/1",

    Regular mode:
      listbucket("/bucket/f", marker="/bucket/foo/1")
      will match "/bucket/foo/2/1", "/bucket/foo/3/1".

    Directory mode:
      listbucket("/bucket/", delimiter="/")
      will match "/bucket/a", "/bucket/bar/", "/bucket/foo", "/bucket/foo/".
      listbucket("/bucket/foo/", delimiter="/")
      will match "/bucket/foo/1", "/bucket/foo/2/", "/bucket/foo/3/".

  Returns:
    Regular mode:
      A GCSFileStat iterator over matched files ordered by filename.
      The iterator returns GCSFileStat objects. filename, etag, st_size,
      st_ctime, and is_dir are set.

    Directory emulation mode:
      A GCSFileStat iterator over matched files and directories ordered by
      name. The iterator returns GCSFileStat objects. For directories,
      only the filename and is_dir fields are set.

    The last name yielded can be used as the next call's marker.
  """
  if prefix:
    common.validate_bucket_path(path_prefix)
    bucket = path_prefix
  else:
    bucket, prefix = common._process_path_prefix(path_prefix)

  if marker and marker.startswith(bucket):
    marker = marker[len(bucket) + 1:]

  api = storage_api._get_storage_api(retry_params=retry_params,
                                     account_id=_account_id)
  options = {}
  if marker:
    options['marker'] = marker
  if max_keys:
    options['max-keys'] = max_keys
  if prefix:
    options['prefix'] = prefix
  if delimiter:
    options['delimiter'] = delimiter

  return _Bucket(api, bucket, options)
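
# Illustrative sketch (hypothetical bucket): walk one level of an emulated
# directory hierarchy.
#
#   for file_stat in listbucket('/my_bucket/logs/', delimiter='/'):
#     if file_stat.is_dir:
#       logging.info('subdir: %s', file_stat.filename)
#     else:
#       logging.info('file: %s (%s bytes)', file_stat.filename,
#                    file_stat.st_size)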


def compose(list_of_files, destination_file, files_metadata=None,
            content_type=None, retry_params=None, _account_id=None):
  """Runs the GCS Compose on the given files.

  Merges between 2 and 32 files into one file. Composite files may even
  be built from other existing composites, provided that the total
  component count does not exceed 1024. See here for details:
  https://cloud.google.com/storage/docs/composite-objects

  Args:
    list_of_files: List of file name strings with no leading slashes or bucket.
    destination_file: Path to the output file. Must have the bucket in the path.
    files_metadata: Optional, file metadata, order must match list_of_files,
      see link for available options:
      https://cloud.google.com/storage/docs/composite-objects#_Xml
    content_type: Optional, used to specify the content type header of the
      output file.
    retry_params: Optional, an api_utils.RetryParams for this call to GCS.
      If None, the default one is used.
    _account_id: Internal-use only.

  Raises:
    ValueError: If the number of files is outside the range of 2-32.
  """
  api = storage_api._get_storage_api(retry_params=retry_params,
                                     account_id=_account_id)

  # The dev_appserver sets SERVER_SOFTWARE to a value starting with 'Dev';
  # it has no server-side compose support, so emulate it by concatenating
  # the components into the destination object.
  if os.getenv('SERVER_SOFTWARE', '').startswith('Dev'):
    def _temp_func(file_list, destination_file, content_type):
      bucket = '/' + destination_file.split('/')[1] + '/'
      with open(destination_file, 'w', content_type=content_type) as gcs_merge:
        for source_file in file_list:
          with open(bucket + source_file['Name'], 'r') as gcs_source:
            gcs_merge.write(gcs_source.read())

    compose_object = _temp_func
  else:
    compose_object = api.compose_object
  file_list, _ = _validate_compose_list(destination_file,
                                        list_of_files,
                                        files_metadata, 32)
  compose_object(file_list, destination_file, content_type)
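
# Illustrative sketch (hypothetical names): merge log shards into one object.
# Component names carry no bucket or leading slash; the destination does.
#
#   compose(['logs/part-0', 'logs/part-1', 'logs/part-2'],
#           '/my_bucket/logs/combined',
#           content_type='text/plain')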


def _file_exists(destination):
  """Checks if a file exists.

  Tries to open the file.
  If it succeeds, returns True; otherwise False.

  Args:
    destination: Full path to the file (i.e. /bucket/object) with leading
      slash.

  Returns:
    True if the file is accessible, otherwise False.
  """
  try:
    with open(destination, "r"):
      return True
  except errors.NotFoundError:
    return False


def _validate_compose_list(destination_file, file_list,
                           files_metadata=None, number_of_files=32):
  """Validates the file_list and merges the file_list, files_metadata.

  Args:
    destination_file: Path to the file
      (i.e. /destination_bucket/destination_file).
    file_list: List of files to compose, see compose for details.
    files_metadata: Meta details for each file in the file_list.
    number_of_files: Maximum number of files allowed in the list.

  Returns:
    A tuple (list_of_files, bucket):
      list_of_files: Ready to use dict version of the list.
      bucket: bucket name extracted from the file paths.
  """
  common.validate_file_path(destination_file)
  bucket = destination_file[0:(destination_file.index('/', 1) + 1)]
  try:
    if isinstance(file_list, types.StringTypes):
      raise TypeError
    list_len = len(file_list)
  except TypeError:
    raise TypeError('file_list must be a list')

  if list_len > number_of_files:
    raise ValueError(
        'Compose attempted to create composite with too many '
        '(%i) components; limit is (%i).' % (list_len, number_of_files))
  if list_len <= 1:
    raise ValueError('Compose operation requires at'
                     ' least two components; %i provided.' % list_len)

  if files_metadata is None:
    files_metadata = []
  elif len(files_metadata) > list_len:
    raise ValueError('files_metadata contains more entries (%i)'
                     ' than file_list (%i)'
                     % (len(files_metadata), list_len))
  list_of_files = []
  for source_file, meta_data in itertools.izip_longest(file_list,
                                                       files_metadata):
    if not isinstance(source_file, str):
      raise TypeError('Each item of file_list must be a string')
    if source_file.startswith('/'):
      logging.warn('Detected a "/" at the start of the file name; '
                   'unless the file name actually contains a "/", '
                   'this may cause files to be misread')
    if source_file.startswith(bucket):
      logging.warn('Detected the bucket name at the start of the file name; '
                   'the bucket must not be specified in file_list entries. '
                   'This may cause files to be misread')
    common.validate_file_path(bucket + source_file)

    list_entry = {}

    if meta_data is not None:
      list_entry.update(meta_data)
    list_entry['Name'] = source_file
    list_of_files.append(list_entry)

  return list_of_files, bucket


class _Bucket(object):
  """A wrapper for a GCS bucket as the return value of listbucket."""

  def __init__(self, api, path, options):
    """Initialize.

    Args:
      api: storage_api instance.
      path: bucket path of form '/bucket'.
      options: a dict of listbucket options. Please see listbucket doc.
    """
    self._init(api, path, options)

  def _init(self, api, path, options):
    self._api = api
    self._path = path
    self._options = options.copy()
    self._get_bucket_fut = self._api.get_bucket_async(
        self._path + '?' + urllib.urlencode(self._options))
    self._last_yield = None
    self._new_max_keys = self._options.get('max-keys')

  def __getstate__(self):
    options = self._options
    if self._last_yield:
      options['marker'] = self._last_yield.filename[len(self._path) + 1:]
    if self._new_max_keys is not None:
      options['max-keys'] = self._new_max_keys
    return {'api': self._api,
            'path': self._path,
            'options': options}

  def __setstate__(self, state):
    self._init(state['api'], state['path'], state['options'])

  def __iter__(self):
    """Iterate over the bucket.

    Yields:
      GCSFileStat: a GCSFileStat for an object in the bucket.
        They are ordered by GCSFileStat.filename.
    """
    total = 0
    max_keys = self._options.get('max-keys')

    while self._get_bucket_fut:
      status, resp_headers, content = self._get_bucket_fut.get_result()
      errors.check_status(status, [200], self._path, resp_headers=resp_headers,
                          body=content, extras=self._options)

      if self._should_get_another_batch(content):
        self._get_bucket_fut = self._api.get_bucket_async(
            self._path + '?' + urllib.urlencode(self._options))
      else:
        self._get_bucket_fut = None

      root = ET.fromstring(content)
      dirs = self._next_dir_gen(root)
      files = self._next_file_gen(root)
      next_file = files.next()
      next_dir = dirs.next()

      while ((max_keys is None or total < max_keys) and
             not (next_file is None and next_dir is None)):
        total += 1
        if next_file is None:
          self._last_yield = next_dir
          next_dir = dirs.next()
        elif next_dir is None:
          self._last_yield = next_file
          next_file = files.next()
        elif next_dir < next_file:
          self._last_yield = next_dir
          next_dir = dirs.next()
        elif next_file < next_dir:
          self._last_yield = next_file
          next_file = files.next()
        else:
          logging.error(
              'Should never reach. next file is %r. next dir is %r.',
              next_file, next_dir)
        if self._new_max_keys:
          self._new_max_keys -= 1
        yield self._last_yield

  def _next_file_gen(self, root):
    """Generator for the next file element in the document.

    Args:
      root: root element of the XML tree.

    Yields:
      GCSFileStat for the next file.
    """
    for e in root.getiterator(common._T_CONTENTS):
      st_ctime, size, etag, key = None, None, None, None
      for child in e.getiterator('*'):
        if child.tag == common._T_LAST_MODIFIED:
          st_ctime = common.dt_str_to_posix(child.text)
        elif child.tag == common._T_ETAG:
          etag = child.text
        elif child.tag == common._T_SIZE:
          size = child.text
        elif child.tag == common._T_KEY:
          key = child.text
      yield common.GCSFileStat(self._path + '/' + key,
                               size, etag, st_ctime)
      e.clear()
    yield None

  def _next_dir_gen(self, root):
    """Generator for the next directory element in the document.

    Args:
      root: root element in the XML tree.

    Yields:
      GCSFileStat for the next directory.
    """
    for e in root.getiterator(common._T_COMMON_PREFIXES):
      yield common.GCSFileStat(
          self._path + '/' + e.find(common._T_PREFIX).text,
          st_size=None, etag=None, st_ctime=None, is_dir=True)
      e.clear()
    yield None

  def _should_get_another_batch(self, content):
    """Whether to issue another GET bucket call.

    Args:
      content: response XML.

    Returns:
      True if another batch should be fetched; self._options is also updated
      for the next request. False otherwise.
    """
    if ('max-keys' in self._options and
        self._options['max-keys'] <= common._MAX_GET_BUCKET_RESULT):
      return False

    elements = self._find_elements(
        content, set([common._T_IS_TRUNCATED,
                      common._T_NEXT_MARKER]))
    if elements.get(common._T_IS_TRUNCATED, 'false').lower() != 'true':
      return False

    next_marker = elements.get(common._T_NEXT_MARKER)
    if next_marker is None:
      self._options.pop('marker', None)
      return False
    self._options['marker'] = next_marker
    return True

  def _find_elements(self, result, elements):
    """Find interesting elements from XML.

    This function tries to only look for specified elements
    without parsing the entire XML. The specified elements are best
    located near the beginning.

    Args:
      result: response XML.
      elements: a set of interesting element tags.

    Returns:
      A dict from element tag to element value.
    """
    element_mapping = {}
    result = StringIO.StringIO(result)
    for _, e in ET.iterparse(result, events=('end',)):
      if not elements:
        break
      if e.tag in elements:
        element_mapping[e.tag] = e.text
        elements.remove(e.tag)
    return element_mapping
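
# Illustrative sketch (hypothetical bucket; assumes the underlying api object
# pickles cleanly in this environment): because _Bucket implements
# __getstate__/__setstate__, a partially consumed listbucket() iterator can be
# checkpointed and resumed later.
#
#   import pickle
#   bucket_iter = listbucket('/my_bucket/logs/')
#   it = iter(bucket_iter)
#   for _ in xrange(10):
#     file_stat = it.next()               # consume part of the listing
#   checkpoint = pickle.dumps(bucket_iter)
#   ...
#   resumed = pickle.loads(checkpoint)    # continues after the last name
#                                         # yielded before pickling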