#!/usr/bin/env python
#
# Camlistore uploader client for Python.
#
# Copyright 2010 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""Client library for Camlistore."""

__author__ = 'Brett Slatkin (bslatkin@gmail.com)'

import base64
import cStringIO
import hashlib
import httplib
import logging
import mimetools
import urllib
import urlparse

import simplejson

__all__ = ['Error', 'ServerError', 'PayloadError', 'BUFFER_SIZE', 'CamliOp']


# Default chunk size for hashing and upload buffering (512 KiB).
BUFFER_SIZE = 512 * 1024


class Error(Exception):
  """Base class for exceptions in this module."""


class ServerError(Error):
  """An unexpected error was returned by the server."""


class PayloadError(ServerError):
  """Something about a data payload was bad."""


def buffered_sha1(data, buffer_size=BUFFER_SIZE):
  """Calculates the sha1 hash of some data.

  Args:
    data: A string of data to write or an open file-like object. File-like
      objects will be seeked back to their original position before this
      function returns.
    buffer_size: How much data to munge at a time.

  Returns:
    Hex sha1 string.
  """
  compute = hashlib.sha1()
  if isinstance(data, basestring):
    compute.update(data)
  else:
    # Remember the caller's position so we can restore it afterwards.
    start = data.tell()
    while True:
      line = data.read(buffer_size)
      if line == '':
        break
      compute.update(line)
    data.seek(start)
  return compute.hexdigest()


class CamliOp(object):
  """Camlistore client class that is single threaded, using one socket."""

  def __init__(self,
               server_address,
               buffer_size=BUFFER_SIZE,
               create_connection=httplib.HTTPConnection,
               auth=None,
               basepath=""):
    """Initializer.

    Args:
      server_address: hostname:port for the server.
      buffer_size: Byte size to use for in-memory buffering for various
        client-related operations.
      create_connection: Use for testing.
      auth: Optional. 'username:password' to use for HTTP basic auth.
      basepath: Optional path suffix. e.g. if the server is at
        "localhost:3179/bs", the basepath should be "/bs".

    Raises:
      NameError: If basepath is supplied but is not of the form '/bs'.
    """
    self.server_address = server_address
    self.buffer_size = buffer_size
    self._create_connection = create_connection
    # Fixed: this was previously assigned to self._connection, but every
    # method in this class reads and writes self.connection (established
    # lazily by _setup_connection).
    self.connection = None
    self._authorization = ''
    self.basepath = ""
    if auth:
      if len(auth.split(':')) != 2:
        # Default to dummy username; current server doesn't care
        # TODO(jrabbit): care when necessary
        auth = "username:" + auth  # If username not given use the implicit default, 'username'
      self._authorization = ('Basic ' + base64.encodestring(auth).strip())
    if basepath:
      # Fixed: previously any value merely *containing* a '/' was accepted
      # (e.g. "bs/"); the path must actually begin with one, as the error
      # message has always stated. NameError is kept (instead of the more
      # conventional ValueError) for backward compatibility with callers.
      if not basepath.startswith('/'):
        raise NameError("basepath must be in form '/bs'")
      if basepath[-1] == '/':
        basepath = basepath[:-1]
      self.basepath = basepath

  def _setup_connection(self):
    """Sets up the HTTP connection."""
    self.connection = self._create_connection(self.server_address)

  def put_blobs(self, blobs):
    """Puts a set of blobs.

    Args:
      blobs: List of (data, blobref) tuples; list of open files; or list of
        blob data strings.

    Returns:
      The set of blobs that were actually uploaded. If all blobs are already
      present this set will be empty.

    Raises:
      ServerError if the server response is bad.
      PayloadError if the server response is not in the right format.
      OSError or IOError if reading any blobs breaks.
    """
    if isinstance(blobs, dict):
      raise TypeError('Must pass iterable of tuples, open files, or strings.')

    # Map each blobref to its blob data, computing blobrefs for inputs that
    # did not supply one.
    blobref_dict = {}
    for item in blobs:
      if isinstance(item, tuple):
        blob, blobref = item
      else:
        blob, blobref = item, None
      if blobref is None:
        blobref = 'sha1-' + buffered_sha1(blob, buffer_size=self.buffer_size)
      blobref_dict[blobref] = blob

    # Ask the server which of these blobs it already has ("stat"/preupload).
    preupload = {'camliversion': '1'}
    for index, blobref in enumerate(blobref_dict.keys()):
      preupload['blob%d' % (index+1)] = blobref

    # TODO: What is the max number of blobs that can be specified in a
    # preupload request? The server probably has some reasonable limit and
    # after that we need to do batching in smaller groups.

    self._setup_connection()
    if self.basepath:
      fullpath = self.basepath + '/camli/stat'
    else:
      fullpath = '/camli/stat'
    self.connection.request(
        'POST', fullpath, urllib.urlencode(preupload),
        {'Content-Type': 'application/x-www-form-urlencoded',
         'Authorization': self._authorization})
    response = self.connection.getresponse()
    logging.debug('Preupload HTTP response: %d %s',
                  response.status, response.reason)
    if response.status != 200:
      raise ServerError('Bad preupload response status: %d %s' %
                        (response.status, response.reason))

    data = response.read()
    try:
      response_dict = simplejson.loads(data)
    except simplejson.decoder.JSONDecodeError:
      raise PayloadError('Server returned bad preupload response: %r' % data)

    logging.debug('Parsed preupload response: %r', response_dict)
    if 'stat' not in response_dict:
      raise PayloadError(
          'Could not find "stat" in preupload response: %r' %
          response_dict)
    if 'uploadUrl' not in response_dict:
      raise PayloadError(
          'Could not find "uploadUrl" in preupload response: %r' %
          response_dict)

    already_have_blobrefs = set()
    for blobref_json in response_dict['stat']:
      if 'blobRef' not in blobref_json:
        raise PayloadError(
            'Cannot find "blobRef" in preupload response: %r',
            response_dict)
      already_have_blobrefs.add(blobref_json['blobRef'])
    logging.debug('Already have blobs: %r', already_have_blobrefs)

    missing_blobrefs = set(blobref_dict.iterkeys())
    missing_blobrefs.difference_update(already_have_blobrefs)
    if not missing_blobrefs:
      logging.debug('All blobs already present.')
      # Fixed: previously returned None here, but the docstring promises an
      # empty set when everything is already present.
      return set()

    # TODO(bslatkin): Figure out the 'Content-Length' header value by looking
    # at the size of the files by seeking; required for multipart POST.
    # Build a multipart/form-data body in memory containing only the blobs
    # the server reported missing.
    out = cStringIO.StringIO()
    boundary = mimetools.choose_boundary()
    boundary_start = '--' + boundary

    blob_number = 0
    for blobref in blobref_dict.iterkeys():
      if blobref in already_have_blobrefs:
        logging.debug('Already have blobref=%s', blobref)
        continue
      blob = blobref_dict[blobref]
      blob_number += 1

      out.write(boundary_start)
      out.write('\r\nContent-Type: application/octet-stream\r\n')
      out.write('Content-Disposition: form-data; name="%s"; '
                'filename="%d"\r\n\r\n' % (blobref, blob_number))
      if isinstance(blob, basestring):
        out.write(blob)
      else:
        while True:
          buf = blob.read(self.buffer_size)
          if buf == '':
            break
          out.write(buf)
      out.write('\r\n')
    out.write(boundary_start)
    out.write('--\r\n')
    request_body = out.getvalue()

    # Strip scheme and host so we POST to a path on the same connection.
    pieces = list(urlparse.urlparse(response_dict['uploadUrl']))
    # TODO: Support upload servers on another base URL.
    pieces[0], pieces[1] = '', ''
    relative_url = urlparse.urlunparse(pieces)
    self.connection.request(
        'POST', relative_url, request_body,
        {'Content-Type': 'multipart/form-data; boundary="%s"' % boundary,
         'Content-Length': str(len(request_body)),
         'Authorization': self._authorization})

    response = self.connection.getresponse()
    logging.debug('Upload response: %d %s', response.status, response.reason)
    if response.status not in (200, 301, 302, 303):
      raise ServerError('Bad upload response status: %d %s' %
                        (response.status, response.reason))

    # Follow redirects (e.g. to the upload-complete status page).
    while response.status in (301, 302, 303):
      # TODO(bslatkin): Support connections to servers on different addresses
      # after redirects. For now just send another request to the same server.
      location = response.getheader('Location')
      pieces = list(urlparse.urlparse(location))
      pieces[0], pieces[1] = '', ''
      new_relative_url = urlparse.urlunparse(pieces)
      logging.debug('Redirect %s -> %s', relative_url, new_relative_url)
      relative_url = new_relative_url
      self.connection.request('GET', relative_url, headers={
          'Authorization': self._authorization})
      response = self.connection.getresponse()

    if response.status != 200:
      raise ServerError('Bad upload response status: %d %s' %
                        (response.status, response.reason))

    data = response.read()
    try:
      response_dict = simplejson.loads(data)
    except simplejson.decoder.JSONDecodeError:
      raise PayloadError('Server returned bad upload response: %r' % data)

    if 'received' not in response_dict:
      raise PayloadError('Could not find "received" in upload response: %r' %
                         response_dict)

    received_blobrefs = set()
    for blobref_json in response_dict['received']:
      if 'blobRef' not in blobref_json:
        raise PayloadError(
            'Cannot find "blobRef" in upload response: %r',
            response_dict)
      received_blobrefs.add(blobref_json['blobRef'])
    logging.debug('Received blobs: %r', received_blobrefs)

    missing_blobrefs.difference_update(received_blobrefs)
    if missing_blobrefs:
      # TODO: Try to upload the missing ones.
      raise ServerError('Some blobs not uploaded: %r', missing_blobrefs)

    logging.debug('Upload of %d blobs successful.', len(blobref_dict))
    return received_blobrefs

  def get_blobs(self,
                blobref_list,
                start_out=None,
                end_out=None,
                check_sha1=True):
    """Gets a set of blobs.

    Args:
      blobref_list: A single blobref as a string or an iterable of strings that
        are blobrefs.
      start_out: Optional. A function taking the blobref's key, returns a
        file-like object to which the blob should be written. Called before
        the blob has started any writing.
      end_out: Optional along with start_out. A function that takes the
        blobref and open file-like object that does proper cleanup and closing
        of the file. Called when all of the file's contents have been written.
      check_sha1: Double-check that the file's contents match the blobref.

    Returns:
      If start_out is not supplied, then all blobs will be kept in memory. If
      blobref_list is a single blobref, then the return value will be a string
      with the blob data or None if the blob was not present. If blobref_list
      was iterable, the return value will be a dictionary mapping blobref to
      blob data for each blob that was found.

      If start_out is supplied, the return value will be None. Callers can
      check for missing blobs by comparing their own input of the blobref_list
      argument to the blobrefs that are passed to start_out.

    Raises:
      ServerError if the server response is invalid for whatever reason.
      OSError or IOError if writing to any files breaks.
    """
    multiple = not isinstance(blobref_list, basestring)
    # Fixed: a bare string used to be iterated character-by-character below,
    # issuing one bogus GET per character. Normalize it to a one-element list.
    if not multiple:
      blobref_list = [blobref_list]
    result = {}
    if start_out is None:
      def start_out(blobref):
        buffer = cStringIO.StringIO()
        return buffer

      def end_out(blobref, file_like):
        result[blobref] = file_like.getvalue()
    else:
      result = None  # Rely on user-supplied start_out for reporting blobrefs.
      if end_out is None:
        def end_out(blobref, file_like):
          file_like.close()

    self._setup_connection()

    # Note, we could use a 'preupload' here as a quick, bulk existence check,
    # but that may not always work depending on the access the user has.
    # It's possible the user has read-only access, and thus can only do
    # GET or HEAD on objects.

    for blobref in blobref_list:
      logging.debug('Getting blobref=%s', blobref)
      if self.basepath:
        fullpath = self.basepath + '/camli/'
      else:
        fullpath = '/camli/'
      self.connection.request('GET', fullpath + blobref,
                              headers={'Authorization': self._authorization})
      response = self.connection.getresponse()
      if response.status == 404:
        logging.debug('Server does not have blobref=%s', blobref)
        continue
      elif response.status != 200:
        raise ServerError('Bad response status: %d %s' %
                          (response.status, response.reason))

      if check_sha1:
        compute_hash = hashlib.sha1()

      out_file = start_out(blobref)
      while True:
        buf = response.read(self.buffer_size)
        if buf == '':
          end_out(blobref, out_file)
          break

        if check_sha1:
          compute_hash.update(buf)

        out_file.write(buf)

      if check_sha1:
        found = 'sha1-' + compute_hash.hexdigest()
        if found != blobref:
          raise ValueError('sha1 hash of blobref does not match; '
                           'found %s, expected %s' % (found, blobref))

    if result is not None and not multiple:
      if result:
        return result.values()[0]
      # Fixed: previously a single missing blob returned {} instead of the
      # documented None.
      return None
    return result