github.com/shashidharatd/test-infra@v0.0.0-20171006011030-71304e1ca560/gubernator/third_party/cloudstorage/cloudstorage_api.py

# Copyright 2012 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
# either express or implied. See the License for the specific
# language governing permissions and limitations under the License.

"""File Interface for Google Cloud Storage."""


from __future__ import with_statement


__all__ = ['copy2',
           'delete',
           'listbucket',
           'open',
           'stat',
           'compose',
          ]

import logging
import StringIO
import urllib
import os
import itertools
import types
import xml.etree.cElementTree as ET
from . import api_utils
from . import common
from . import errors
from . import storage_api


def open(filename,
         mode='r',
         content_type=None,
         options=None,
         read_buffer_size=storage_api.ReadBuffer.DEFAULT_BUFFER_SIZE,
         retry_params=None,
         _account_id=None,
         offset=0):
  """Opens a Google Cloud Storage file and returns it as a File-like object.

  Args:
    filename: A Google Cloud Storage filename of form '/bucket/filename'.
    mode: 'r' for reading mode. 'w' for writing mode.
      In reading mode, the file must exist. In writing mode, a file will
      be created or overwritten.
    content_type: The MIME type of the file. str. Only valid in writing mode.
    options: A str->basestring dict to specify additional headers to pass to
      GCS e.g. {'x-goog-acl': 'private', 'x-goog-meta-foo': 'foo'}.
      Supported options are x-goog-acl, x-goog-meta-, cache-control,
      content-disposition, and content-encoding.
      Only valid in writing mode.
      See https://developers.google.com/storage/docs/reference-headers
      for details.
    read_buffer_size: The buffer size for read. Read keeps a buffer
      and prefetches another one. To minimize blocking for large files,
      always read by buffer size. To minimize the number of RPC requests for
      small files, set a large buffer size. Max is 30MB.
    retry_params: An instance of api_utils.RetryParams for subsequent calls
      to GCS from this file handle. If None, the default one is used.
    _account_id: Internal-use only.
    offset: Number of bytes to skip at the start of the file. If None, 0 is
      used.

  Returns:
    A reading or writing buffer that supports a File-like interface. The
    buffer must be closed after operations are done.

  Raises:
    errors.AuthorizationError: if authorization failed.
    errors.NotFoundError: if an object that's expected to exist doesn't.
    ValueError: invalid open mode or if content_type or options are specified
      in reading mode.
  """
  common.validate_file_path(filename)
  api = storage_api._get_storage_api(retry_params=retry_params,
                                     account_id=_account_id)
  filename = api_utils._quote_filename(filename)

  if mode == 'w':
    common.validate_options(options)
    return storage_api.StreamingBuffer(api, filename, content_type, options)
  elif mode == 'r':
    if content_type or options:
      raise ValueError('Options and content_type can only be specified '
                       'for writing mode.')
    return storage_api.ReadBuffer(api,
                                  filename,
                                  buffer_size=read_buffer_size,
                                  offset=offset)
  else:
    raise ValueError('Invalid mode %s.' % mode)
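
# Usage sketch for open() (illustrative only; the bucket and object names
# below are hypothetical). Both buffers support the context-manager protocol,
# so `with` closes them for you:
#
#   with open('/my_bucket/my_object', 'w',
#             content_type='text/plain',
#             options={'x-goog-acl': 'private'}) as gcs_file:
#     gcs_file.write('hello world')
#
#   with open('/my_bucket/my_object', 'r') as gcs_file:
#     contents = gcs_file.read()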


def delete(filename, retry_params=None, _account_id=None):
  """Delete a Google Cloud Storage file.

  Args:
    filename: A Google Cloud Storage filename of form '/bucket/filename'.
    retry_params: An api_utils.RetryParams for this call to GCS. If None,
      the default one is used.
    _account_id: Internal-use only.

  Raises:
    errors.NotFoundError: if the file doesn't exist prior to deletion.
  """
  api = storage_api._get_storage_api(retry_params=retry_params,
                                     account_id=_account_id)
  common.validate_file_path(filename)
  filename = api_utils._quote_filename(filename)
  status, resp_headers, content = api.delete_object(filename)
  errors.check_status(status, [204], filename, resp_headers=resp_headers,
                      body=content)


def stat(filename, retry_params=None, _account_id=None):
  """Get GCSFileStat of a Google Cloud Storage file.

  Args:
    filename: A Google Cloud Storage filename of form '/bucket/filename'.
    retry_params: An api_utils.RetryParams for this call to GCS. If None,
      the default one is used.
    _account_id: Internal-use only.

  Returns:
    a GCSFileStat object containing info about this file.

  Raises:
    errors.AuthorizationError: if authorization failed.
    errors.NotFoundError: if an object that's expected to exist doesn't.
  """
  common.validate_file_path(filename)
  api = storage_api._get_storage_api(retry_params=retry_params,
                                     account_id=_account_id)
  status, headers, content = api.head_object(
      api_utils._quote_filename(filename))
  errors.check_status(status, [200], filename, resp_headers=headers,
                      body=content)
  file_stat = common.GCSFileStat(
      filename=filename,
      st_size=common.get_stored_content_length(headers),
      st_ctime=common.http_time_to_posix(headers.get('last-modified')),
      etag=headers.get('etag'),
      content_type=headers.get('content-type'),
      metadata=common.get_metadata(headers))

  return file_stat
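
# Illustrative sketch (hypothetical object name): inspect a file's metadata,
# then remove the file.
#
#   file_stat = stat('/my_bucket/my_object')
#   logging.info('size=%s etag=%s content_type=%s',
#                file_stat.st_size, file_stat.etag, file_stat.content_type)
#   delete('/my_bucket/my_object')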


def copy2(src, dst, metadata=None, retry_params=None):
  """Copy the file content from src to dst.

  Args:
    src: /bucket/filename
    dst: /bucket/filename
    metadata: a dict of metadata for this copy. If None, old metadata is
      copied. For example, {'x-goog-meta-foo': 'bar'}.
    retry_params: An api_utils.RetryParams for this call to GCS. If None,
      the default one is used.

  Raises:
    errors.AuthorizationError: if authorization failed.
    errors.NotFoundError: if an object that's expected to exist doesn't.
  """
  common.validate_file_path(src)
  common.validate_file_path(dst)

  if metadata is None:
    metadata = {}
    copy_meta = 'COPY'
  else:
    copy_meta = 'REPLACE'
  metadata.update({'x-goog-copy-source': src,
                   'x-goog-metadata-directive': copy_meta})

  api = storage_api._get_storage_api(retry_params=retry_params)
  status, resp_headers, content = api.put_object(
      api_utils._quote_filename(dst), headers=metadata)
  errors.check_status(status, [200], src, metadata, resp_headers, body=content)
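
# Illustrative sketch (hypothetical names): copy an object, replacing its
# custom metadata on the destination.
#
#   copy2('/my_bucket/src_object', '/my_bucket/dst_object',
#         metadata={'x-goog-meta-foo': 'bar'})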


def listbucket(path_prefix, marker=None, prefix=None, max_keys=None,
               delimiter=None, retry_params=None, _account_id=None):
  """Returns a GCSFileStat iterator over a bucket.

  Optional arguments can limit the result to a subset of files under bucket.

  This function has two modes:
  1. List bucket mode: Lists all files in the bucket without any concept of
     hierarchy. GCS doesn't have real directory hierarchies.
  2. Directory emulation mode: If you specify the 'delimiter' argument,
     it is used as a path separator to emulate a hierarchy of directories.
     In this mode, the "path_prefix" argument should end in the delimiter
     specified (thus designates a logical directory). The logical directory's
     contents, both files and subdirectories, are listed. The names of
     subdirectories returned will end with the delimiter. So listbucket
     can be called with the subdirectory name to list the subdirectory's
     contents.

  Args:
    path_prefix: A Google Cloud Storage path of format "/bucket" or
      "/bucket/prefix". Only objects whose fullpath starts with the
      path_prefix will be returned.
    marker: Another path prefix. Only objects whose fullpath starts
      lexicographically after marker will be returned (exclusive).
    prefix: Deprecated. Use path_prefix.
    max_keys: The limit on the number of objects to return. int.
      For best performance, specify max_keys only if you know how many objects
      you want. Otherwise, this method requests large batches and handles
      pagination for you.
    delimiter: Use to turn on directory mode. str of one or multiple chars
      that your bucket uses as its directory separator.
    retry_params: An api_utils.RetryParams for this call to GCS. If None,
      the default one is used.
    _account_id: Internal-use only.

  Examples:
    For files "/bucket/a",
              "/bucket/bar/1",
              "/bucket/foo",
              "/bucket/foo/1", "/bucket/foo/2/1", "/bucket/foo/3/1",

    Regular mode:
      listbucket("/bucket/f", marker="/bucket/foo/1")
      will match "/bucket/foo/2/1", "/bucket/foo/3/1".

    Directory mode:
      listbucket("/bucket/", delimiter="/")
      will match "/bucket/a", "/bucket/bar/", "/bucket/foo", "/bucket/foo/".
      listbucket("/bucket/foo/", delimiter="/")
      will match "/bucket/foo/1", "/bucket/foo/2/", "/bucket/foo/3/".

  Returns:
    Regular mode:
      A GCSFileStat iterator over matched files ordered by filename.
      The iterator returns GCSFileStat objects. filename, etag, st_size,
      st_ctime, and is_dir are set.

    Directory emulation mode:
      A GCSFileStat iterator over matched files and directories ordered by
      name. The iterator returns GCSFileStat objects. For directories,
      only the filename and is_dir fields are set.

    The last name yielded can be used as the next call's marker.
  """
  if prefix:
    common.validate_bucket_path(path_prefix)
    bucket = path_prefix
  else:
    bucket, prefix = common._process_path_prefix(path_prefix)

  if marker and marker.startswith(bucket):
    marker = marker[len(bucket) + 1:]

  api = storage_api._get_storage_api(retry_params=retry_params,
                                     account_id=_account_id)
  options = {}
  if marker:
    options['marker'] = marker
  if max_keys:
    options['max-keys'] = max_keys
  if prefix:
    options['prefix'] = prefix
  if delimiter:
    options['delimiter'] = delimiter

  return _Bucket(api, bucket, options)
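
# Illustrative sketch (hypothetical bucket): walk one level of an emulated
# directory hierarchy.
#
#   for file_stat in listbucket('/my_bucket/logs/', delimiter='/'):
#     if file_stat.is_dir:
#       logging.info('subdir: %s', file_stat.filename)
#     else:
#       logging.info('file: %s (%s bytes)', file_stat.filename,
#                    file_stat.st_size)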


def compose(list_of_files, destination_file, files_metadata=None,
            content_type=None, retry_params=None, _account_id=None):
  """Runs the GCS Compose on the given files.

  Merges between 2 and 32 files into one file. Composite files may even
  be built from other existing composites, provided that the total
  component count does not exceed 1024. See here for details:
  https://cloud.google.com/storage/docs/composite-objects

  Args:
    list_of_files: List of file name strings with no leading slashes or bucket.
    destination_file: Path to the output file. Must have the bucket in the path.
    files_metadata: Optional, file metadata, order must match list_of_files,
      see link for available options:
      https://cloud.google.com/storage/docs/composite-objects#_Xml
    content_type: Optional, used to specify the content type header of the
      output file.
    retry_params: Optional, an api_utils.RetryParams for this call to GCS.
      If None, the default one is used.
    _account_id: Internal-use only.

  Raises:
    ValueError: If the number of files is outside the range of 2-32.
  """
  api = storage_api._get_storage_api(retry_params=retry_params,
                                     account_id=_account_id)

  # The dev_appserver sets SERVER_SOFTWARE to a value starting with 'Dev';
  # it has no server-side compose support, so emulate it by concatenating
  # the components into the destination object.
  if os.getenv('SERVER_SOFTWARE', '').startswith('Dev'):
    def _temp_func(file_list, destination_file, content_type):
      bucket = '/' + destination_file.split('/')[1] + '/'
      with open(destination_file, 'w', content_type=content_type) as gcs_merge:
        for source_file in file_list:
          with open(bucket + source_file['Name'], 'r') as gcs_source:
            gcs_merge.write(gcs_source.read())

    compose_object = _temp_func
  else:
    compose_object = api.compose_object
  file_list, _ = _validate_compose_list(destination_file,
                                        list_of_files,
                                        files_metadata, 32)
  compose_object(file_list, destination_file, content_type)
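
# Illustrative sketch (hypothetical names): merge log shards into one object.
# Component names carry no bucket or leading slash; the destination does.
#
#   compose(['logs/part-0', 'logs/part-1', 'logs/part-2'],
#           '/my_bucket/logs/combined',
#           content_type='text/plain')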


def _file_exists(destination):
  """Checks if a file exists.

  Tries to open the file.
  If it succeeds, returns True; otherwise False.

  Args:
    destination: Full path to the file (i.e. /bucket/object) with leading
      slash.

  Returns:
    True if the file is accessible, otherwise False.
  """
  try:
    with open(destination, "r"):
      return True
  except errors.NotFoundError:
    return False


def _validate_compose_list(destination_file, file_list,
                           files_metadata=None, number_of_files=32):
  """Validates the file_list and merges the file_list, files_metadata.

  Args:
    destination_file: Path to the file
      (i.e. /destination_bucket/destination_file).
    file_list: List of files to compose, see compose for details.
    files_metadata: Meta details for each file in the file_list.
    number_of_files: Maximum number of files allowed in the list.

  Returns:
    A tuple (list_of_files, bucket):
      list_of_files: Ready to use dict version of the list.
      bucket: bucket name extracted from the file paths.
  """
  common.validate_file_path(destination_file)
  bucket = destination_file[0:(destination_file.index('/', 1) + 1)]
  try:
    if isinstance(file_list, types.StringTypes):
      raise TypeError
    list_len = len(file_list)
  except TypeError:
    raise TypeError('file_list must be a list')

  if list_len > number_of_files:
    raise ValueError(
        'Compose attempted to create composite with too many '
        '(%i) components; limit is (%i).' % (list_len, number_of_files))
  if list_len <= 1:
    raise ValueError('Compose operation requires at'
                     ' least two components; %i provided.' % list_len)

  if files_metadata is None:
    files_metadata = []
  elif len(files_metadata) > list_len:
    raise ValueError('files_metadata contains more entries (%i)'
                     ' than file_list (%i)'
                     % (len(files_metadata), list_len))
  list_of_files = []
  for source_file, meta_data in itertools.izip_longest(file_list,
                                                       files_metadata):
    if not isinstance(source_file, str):
      raise TypeError('Each item of file_list must be a string')
    if source_file.startswith('/'):
      logging.warn('Detected a "/" at the start of the file name; '
                   'unless the file name actually contains a "/", '
                   'this may cause files to be misread')
    if source_file.startswith(bucket):
      logging.warn('Detected the bucket name at the start of the file name; '
                   'the bucket must not be specified in file_list entries. '
                   'This may cause files to be misread')
    common.validate_file_path(bucket + source_file)

    list_entry = {}

    if meta_data is not None:
      list_entry.update(meta_data)
    list_entry['Name'] = source_file
    list_of_files.append(list_entry)

  return list_of_files, bucket


class _Bucket(object):
  """A wrapper for a GCS bucket as the return value of listbucket."""

  def __init__(self, api, path, options):
    """Initialize.

    Args:
      api: storage_api instance.
      path: bucket path of form '/bucket'.
      options: a dict of listbucket options. Please see listbucket doc.
    """
    self._init(api, path, options)

  def _init(self, api, path, options):
    self._api = api
    self._path = path
    self._options = options.copy()
    self._get_bucket_fut = self._api.get_bucket_async(
        self._path + '?' + urllib.urlencode(self._options))
    self._last_yield = None
    self._new_max_keys = self._options.get('max-keys')

  def __getstate__(self):
    options = self._options
    if self._last_yield:
      options['marker'] = self._last_yield.filename[len(self._path) + 1:]
    if self._new_max_keys is not None:
      options['max-keys'] = self._new_max_keys
    return {'api': self._api,
            'path': self._path,
            'options': options}

  def __setstate__(self, state):
    self._init(state['api'], state['path'], state['options'])

  def __iter__(self):
    """Iterate over the bucket.

    Yields:
      GCSFileStat: a GCSFileStat for an object in the bucket.
        They are ordered by GCSFileStat.filename.
    """
    total = 0
    max_keys = self._options.get('max-keys')

    while self._get_bucket_fut:
      status, resp_headers, content = self._get_bucket_fut.get_result()
      errors.check_status(status, [200], self._path, resp_headers=resp_headers,
                          body=content, extras=self._options)

      if self._should_get_another_batch(content):
        self._get_bucket_fut = self._api.get_bucket_async(
            self._path + '?' + urllib.urlencode(self._options))
      else:
        self._get_bucket_fut = None

      root = ET.fromstring(content)
      dirs = self._next_dir_gen(root)
      files = self._next_file_gen(root)
      next_file = files.next()
      next_dir = dirs.next()

      while ((max_keys is None or total < max_keys) and
             not (next_file is None and next_dir is None)):
        total += 1
        if next_file is None:
          self._last_yield = next_dir
          next_dir = dirs.next()
        elif next_dir is None:
          self._last_yield = next_file
          next_file = files.next()
        elif next_dir < next_file:
          self._last_yield = next_dir
          next_dir = dirs.next()
        elif next_file < next_dir:
          self._last_yield = next_file
          next_file = files.next()
        else:
          logging.error(
              'Should never reach. next file is %r. next dir is %r.',
              next_file, next_dir)
        if self._new_max_keys:
          self._new_max_keys -= 1
        yield self._last_yield

  def _next_file_gen(self, root):
    """Generator for the next file element in the document.

    Args:
      root: root element of the XML tree.

    Yields:
      GCSFileStat for the next file.
    """
    for e in root.getiterator(common._T_CONTENTS):
      st_ctime, size, etag, key = None, None, None, None
      for child in e.getiterator('*'):
        if child.tag == common._T_LAST_MODIFIED:
          st_ctime = common.dt_str_to_posix(child.text)
        elif child.tag == common._T_ETAG:
          etag = child.text
        elif child.tag == common._T_SIZE:
          size = child.text
        elif child.tag == common._T_KEY:
          key = child.text
      yield common.GCSFileStat(self._path + '/' + key,
                               size, etag, st_ctime)
      e.clear()
    yield None

  def _next_dir_gen(self, root):
    """Generator for the next directory element in the document.

    Args:
      root: root element in the XML tree.

    Yields:
      GCSFileStat for the next directory.
    """
    for e in root.getiterator(common._T_COMMON_PREFIXES):
      yield common.GCSFileStat(
          self._path + '/' + e.find(common._T_PREFIX).text,
          st_size=None, etag=None, st_ctime=None, is_dir=True)
      e.clear()
    yield None

  def _should_get_another_batch(self, content):
    """Whether to issue another GET bucket call.

    Args:
      content: response XML.

    Returns:
      True if another batch should be fetched; self._options is also updated
      for the next request. False otherwise.
    """
    if ('max-keys' in self._options and
        self._options['max-keys'] <= common._MAX_GET_BUCKET_RESULT):
      return False

    elements = self._find_elements(
        content, set([common._T_IS_TRUNCATED,
                      common._T_NEXT_MARKER]))
    if elements.get(common._T_IS_TRUNCATED, 'false').lower() != 'true':
      return False

    next_marker = elements.get(common._T_NEXT_MARKER)
    if next_marker is None:
      self._options.pop('marker', None)
      return False
    self._options['marker'] = next_marker
    return True

  def _find_elements(self, result, elements):
    """Find interesting elements from XML.

    This function tries to only look for specified elements
    without parsing the entire XML. The specified elements are best
    located near the beginning.

    Args:
      result: response XML.
      elements: a set of interesting element tags.

    Returns:
      A dict from element tag to element value.
    """
    element_mapping = {}
    result = StringIO.StringIO(result)
    for _, e in ET.iterparse(result, events=('end',)):
      if not elements:
        break
      if e.tag in elements:
        element_mapping[e.tag] = e.text
        elements.remove(e.tag)
    return element_mapping
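
# Illustrative sketch (hypothetical bucket; assumes the underlying api object
# pickles cleanly in this environment): because _Bucket implements
# __getstate__/__setstate__, a partially consumed listbucket() iterator can be
# checkpointed and resumed later.
#
#   import pickle
#   bucket_iter = listbucket('/my_bucket/logs/')
#   it = iter(bucket_iter)
#   for _ in xrange(10):
#     file_stat = it.next()               # consume part of the listing
#   checkpoint = pickle.dumps(bucket_iter)
#   ...
#   resumed = pickle.loads(checkpoint)    # continues after the last name
#                                         # yielded before pickling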