github.com/munnerz/test-infra@v0.0.0-20190108210205-ce3d181dc989/gubernator/third_party/cloudstorage/common.py (about)

     1  # Copyright 2012 Google Inc. All Rights Reserved.
     2  #
     3  # Licensed under the Apache License, Version 2.0 (the "License");
     4  # you may not use this file except in compliance with the License.
     5  # You may obtain a copy of the License at
     6  #
     7  #    http://www.apache.org/licenses/LICENSE-2.0
     8  #
     9  # Unless required by applicable law or agreed to in writing,
    10  # software distributed under the License is distributed on an
    11  # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
    12  # either express or implied. See the License for the specific
    13  # language governing permissions and limitations under the License.
    14  
    15  """Helpers shared by cloudstorage_stub and cloudstorage_api."""
    16  
    17  
    18  
    19  
    20  
# Names exported by this module when it is star-imported. Every entry is
# defined further down in this file.
__all__ = ['CS_XML_NS',
           'CSFileStat',
           'dt_str_to_posix',
           'local_api_url',
           'LOCAL_GCS_ENDPOINT',
           'local_run',
           'get_access_token',
           'get_stored_content_length',
           'get_metadata',
           'GCSFileStat',
           'http_time_to_posix',
           'memory_usage',
           'posix_time_to_http',
           'posix_to_dt_str',
           'set_access_token',
           'validate_options',
           'validate_bucket_name',
           'validate_bucket_path',
           'validate_file_path',
          ]
    41  
    42  
    43  import calendar
    44  import datetime
    45  from email import utils as email_utils
    46  import logging
    47  import os
    48  import re
    49  
    50  try:
    51    from google.appengine.api import runtime
    52  except ImportError:
    53    from google.appengine.api import runtime
    54  
    55  
    56  _GCS_BUCKET_REGEX_BASE = r'[a-z0-9\.\-_]{3,63}'
    57  _GCS_BUCKET_REGEX = re.compile(_GCS_BUCKET_REGEX_BASE + r'$')
    58  _GCS_BUCKET_PATH_REGEX = re.compile(r'/' + _GCS_BUCKET_REGEX_BASE + r'$')
    59  _GCS_PATH_PREFIX_REGEX = re.compile(r'/' + _GCS_BUCKET_REGEX_BASE + r'.*')
    60  _GCS_FULLPATH_REGEX = re.compile(r'/' + _GCS_BUCKET_REGEX_BASE + r'/.*')
    61  _GCS_METADATA = ['x-goog-meta-',
    62                   'content-disposition',
    63                   'cache-control',
    64                   'content-encoding']
    65  _GCS_OPTIONS = _GCS_METADATA + ['x-goog-acl']
    66  CS_XML_NS = 'http://doc.s3.amazonaws.com/2006-03-01'
    67  LOCAL_GCS_ENDPOINT = '/_ah/gcs'
    68  _access_token = ''
    69  
    70  
    71  _MAX_GET_BUCKET_RESULT = 1000
    72  
    73  
    74  def set_access_token(access_token):
    75    """Set the shared access token to authenticate with Google Cloud Storage.
    76  
    77    When set, the library will always attempt to communicate with the
    78    real Google Cloud Storage with this token even when running on dev appserver.
    79    Note the token could expire so it's up to you to renew it.
    80  
    81    When absent, the library will automatically request and refresh a token
    82    on appserver, or when on dev appserver, talk to a Google Cloud Storage
    83    stub.
    84  
    85    Args:
    86      access_token: you can get one by run 'gsutil -d ls' and copy the
    87        str after 'Bearer'.
    88    """
    89    global _access_token
    90    _access_token = access_token
    91  
    92  
    93  def get_access_token():
    94    """Returns the shared access token."""
    95    return _access_token
    96  
    97  
    98  class GCSFileStat(object):
    99    """Container for GCS file stat."""
   100  
   101    def __init__(self,
   102                 filename,
   103                 st_size,
   104                 etag,
   105                 st_ctime,
   106                 content_type=None,
   107                 metadata=None,
   108                 is_dir=False):
   109      """Initialize.
   110  
   111      For files, the non optional arguments are always set.
   112      For directories, only filename and is_dir is set.
   113  
   114      Args:
   115        filename: a Google Cloud Storage filename of form '/bucket/filename'.
   116        st_size: file size in bytes. long compatible.
   117        etag: hex digest of the md5 hash of the file's content. str.
   118        st_ctime: posix file creation time. float compatible.
   119        content_type: content type. str.
   120        metadata: a str->str dict of user specified options when creating
   121          the file. Possible keys are x-goog-meta-, content-disposition,
   122          content-encoding, and cache-control.
   123        is_dir: True if this represents a directory. False if this is a real file.
   124      """
   125      self.filename = filename
   126      self.is_dir = is_dir
   127      self.st_size = None
   128      self.st_ctime = None
   129      self.etag = None
   130      self.content_type = content_type
   131      self.metadata = metadata
   132  
   133      if not is_dir:
   134        self.st_size = long(st_size)
   135        self.st_ctime = float(st_ctime)
   136        if etag[0] == '"' and etag[-1] == '"':
   137          etag = etag[1:-1]
   138        self.etag = etag
   139  
   140    def __repr__(self):
   141      if self.is_dir:
   142        return '(directory: %s)' % self.filename
   143  
   144      return (
   145          '(filename: %(filename)s, st_size: %(st_size)s, '
   146          'st_ctime: %(st_ctime)s, etag: %(etag)s, '
   147          'content_type: %(content_type)s, '
   148          'metadata: %(metadata)s)' %
   149          dict(filename=self.filename,
   150               st_size=self.st_size,
   151               st_ctime=self.st_ctime,
   152               etag=self.etag,
   153               content_type=self.content_type,
   154               metadata=self.metadata))
   155  
   156    def __cmp__(self, other):
   157      if not isinstance(other, self.__class__):
   158        raise ValueError('Argument to cmp must have the same type. '
   159                         'Expect %s, got %s', self.__class__.__name__,
   160                         other.__class__.__name__)
   161      if self.filename > other.filename:
   162        return 1
   163      elif self.filename < other.filename:
   164        return -1
   165      return 0
   166  
   167    def __hash__(self):
   168      if self.etag:
   169        return hash(self.etag)
   170      return hash(self.filename)
   171  
   172  
# Second name bound to the same class as GCSFileStat; both names are exported.
CSFileStat = GCSFileStat
   174  
   175  
   176  def get_stored_content_length(headers):
   177    """Return the content length (in bytes) of the object as stored in GCS.
   178  
   179    x-goog-stored-content-length should always be present except when called via
   180    the local dev_appserver. Therefore if it is not present we default to the
   181    standard content-length header.
   182  
   183    Args:
   184      headers: a dict of headers from the http response.
   185  
   186    Returns:
   187      the stored content length.
   188    """
   189    length = headers.get('x-goog-stored-content-length')
   190    if length is None:
   191      length = headers.get('content-length')
   192    return length
   193  
   194  
   195  def get_metadata(headers):
   196    """Get user defined options from HTTP response headers."""
   197    return dict((k, v) for k, v in headers.iteritems()
   198                if any(k.lower().startswith(valid) for valid in _GCS_METADATA))
   199  
   200  
   201  def validate_bucket_name(name):
   202    """Validate a Google Storage bucket name.
   203  
   204    Args:
   205      name: a Google Storage bucket name with no prefix or suffix.
   206  
   207    Raises:
   208      ValueError: if name is invalid.
   209    """
   210    _validate_path(name)
   211    if not _GCS_BUCKET_REGEX.match(name):
   212      raise ValueError('Bucket should be 3-63 characters long using only a-z,'
   213                       '0-9, underscore, dash or dot but got %s' % name)
   214  
   215  
   216  def validate_bucket_path(path):
   217    """Validate a Google Cloud Storage bucket path.
   218  
   219    Args:
   220      path: a Google Storage bucket path. It should have form '/bucket'.
   221  
   222    Raises:
   223      ValueError: if path is invalid.
   224    """
   225    _validate_path(path)
   226    if not _GCS_BUCKET_PATH_REGEX.match(path):
   227      raise ValueError('Bucket should have format /bucket '
   228                       'but got %s' % path)
   229  
   230  
   231  def validate_file_path(path):
   232    """Validate a Google Cloud Storage file path.
   233  
   234    Args:
   235      path: a Google Storage file path. It should have form '/bucket/filename'.
   236  
   237    Raises:
   238      ValueError: if path is invalid.
   239    """
   240    _validate_path(path)
   241    if not _GCS_FULLPATH_REGEX.match(path):
   242      raise ValueError('Path should have format /bucket/filename '
   243                       'but got %s' % path)
   244  
   245  
   246  def _process_path_prefix(path_prefix):
   247    """Validate and process a Google Cloud Stoarge path prefix.
   248  
   249    Args:
   250      path_prefix: a Google Cloud Storage path prefix of format '/bucket/prefix'
   251        or '/bucket/' or '/bucket'.
   252  
   253    Raises:
   254      ValueError: if path is invalid.
   255  
   256    Returns:
   257      a tuple of /bucket and prefix. prefix can be None.
   258    """
   259    _validate_path(path_prefix)
   260    if not _GCS_PATH_PREFIX_REGEX.match(path_prefix):
   261      raise ValueError('Path prefix should have format /bucket, /bucket/, '
   262                       'or /bucket/prefix but got %s.' % path_prefix)
   263    bucket_name_end = path_prefix.find('/', 1)
   264    bucket = path_prefix
   265    prefix = None
   266    if bucket_name_end != -1:
   267      bucket = path_prefix[:bucket_name_end]
   268      prefix = path_prefix[bucket_name_end + 1:] or None
   269    return bucket, prefix
   270  
   271  
   272  def _validate_path(path):
   273    """Basic validation of Google Storage paths.
   274  
   275    Args:
   276      path: a Google Storage path. It should have form '/bucket/filename'
   277        or '/bucket'.
   278  
   279    Raises:
   280      ValueError: if path is invalid.
   281      TypeError: if path is not of type basestring.
   282    """
   283    if not path:
   284      raise ValueError('Path is empty')
   285    if not isinstance(path, basestring):
   286      raise TypeError('Path should be a string but is %s (%s).' %
   287                      (path.__class__, path))
   288  
   289  
   290  def validate_options(options):
   291    """Validate Google Cloud Storage options.
   292  
   293    Args:
   294      options: a str->basestring dict of options to pass to Google Cloud Storage.
   295  
   296    Raises:
   297      ValueError: if option is not supported.
   298      TypeError: if option is not of type str or value of an option
   299        is not of type basestring.
   300    """
   301    if not options:
   302      return
   303  
   304    for k, v in options.iteritems():
   305      if not isinstance(k, str):
   306        raise TypeError('option %r should be a str.' % k)
   307      if not any(k.lower().startswith(valid) for valid in _GCS_OPTIONS):
   308        raise ValueError('option %s is not supported.' % k)
   309      if not isinstance(v, basestring):
   310        raise TypeError('value %r for option %s should be of type basestring.' %
   311                        (v, k))
   312  
   313  
   314  def http_time_to_posix(http_time):
   315    """Convert HTTP time format to posix time.
   316  
   317    See http://www.w3.org/Protocols/rfc2616/rfc2616-sec3.html#sec3.3.1
   318    for http time format.
   319  
   320    Args:
   321      http_time: time in RFC 2616 format. e.g.
   322        "Mon, 20 Nov 1995 19:12:08 GMT".
   323  
   324    Returns:
   325      A float of secs from unix epoch.
   326    """
   327    if http_time is not None:
   328      return email_utils.mktime_tz(email_utils.parsedate_tz(http_time))
   329  
   330  
   331  def posix_time_to_http(posix_time):
   332    """Convert posix time to HTML header time format.
   333  
   334    Args:
   335      posix_time: unix time.
   336  
   337    Returns:
   338      A datatime str in RFC 2616 format.
   339    """
   340    if posix_time:
   341      return email_utils.formatdate(posix_time, usegmt=True)
   342  
   343  
# strptime/strftime pattern for the date-and-seconds part of a datetime str,
# shared by dt_str_to_posix() and posix_to_dt_str(). It covers neither the
# fractional seconds nor the trailing 'Z'.
_DT_FORMAT = '%Y-%m-%dT%H:%M:%S'
   345  
   346  
   347  def dt_str_to_posix(dt_str):
   348    """format str to posix.
   349  
   350    datetime str is of format %Y-%m-%dT%H:%M:%S.%fZ,
   351    e.g. 2013-04-12T00:22:27.978Z. According to ISO 8601, T is a separator
   352    between date and time when they are on the same line.
   353    Z indicates UTC (zero meridian).
   354  
   355    A pointer: http://www.cl.cam.ac.uk/~mgk25/iso-time.html
   356  
   357    This is used to parse LastModified node from GCS's GET bucket XML response.
   358  
   359    Args:
   360      dt_str: A datetime str.
   361  
   362    Returns:
   363      A float of secs from unix epoch. By posix definition, epoch is midnight
   364      1970/1/1 UTC.
   365    """
   366    parsable, _ = dt_str.split('.')
   367    dt = datetime.datetime.strptime(parsable, _DT_FORMAT)
   368    return calendar.timegm(dt.utctimetuple())
   369  
   370  
   371  def posix_to_dt_str(posix):
   372    """Reverse of str_to_datetime.
   373  
   374    This is used by GCS stub to generate GET bucket XML response.
   375  
   376    Args:
   377      posix: A float of secs from unix epoch.
   378  
   379    Returns:
   380      A datetime str.
   381    """
   382    dt = datetime.datetime.utcfromtimestamp(posix)
   383    dt_str = dt.strftime(_DT_FORMAT)
   384    return dt_str + '.000Z'
   385  
   386  
   387  def local_run():
   388    """Whether we should hit GCS dev appserver stub."""
   389    server_software = os.environ.get('SERVER_SOFTWARE')
   390    if server_software is None:
   391      return True
   392    if 'remote_api' in server_software:
   393      return False
   394    if server_software.startswith(('Development', 'testutil')):
   395      return True
   396    return False
   397  
   398  
   399  def local_api_url():
   400    """Return URL for GCS emulation on dev appserver."""
   401    return 'http://%s%s' % (os.environ.get('HTTP_HOST'), LOCAL_GCS_ENDPOINT)
   402  
   403  
   404  def memory_usage(method):
   405    """Log memory usage before and after a method."""
   406    def wrapper(*args, **kwargs):
   407      logging.info('Memory before method %s is %s.',
   408                   method.__name__, runtime.memory_usage().current())
   409      result = method(*args, **kwargs)
   410      logging.info('Memory after method %s is %s',
   411                   method.__name__, runtime.memory_usage().current())
   412      return result
   413    return wrapper
   414  
   415  
   416  def _add_ns(tagname):
   417    return '{%(ns)s}%(tag)s' % {'ns': CS_XML_NS,
   418                                'tag': tagname}
   419  
   420  
# Tag names qualified with CS_XML_NS via _add_ns(), i.e. '{<namespace>}<tag>'.
# NOTE(review): none of these are used in this module; presumably sibling
# modules use them to look up elements in GCS XML responses -- confirm.
_T_CONTENTS = _add_ns('Contents')
_T_LAST_MODIFIED = _add_ns('LastModified')
_T_ETAG = _add_ns('ETag')
_T_KEY = _add_ns('Key')
_T_SIZE = _add_ns('Size')
_T_PREFIX = _add_ns('Prefix')
_T_COMMON_PREFIXES = _add_ns('CommonPrefixes')
_T_NEXT_MARKER = _add_ns('NextMarker')
_T_IS_TRUNCATED = _add_ns('IsTruncated')