#!/usr/bin/env python
#
# Camlistore uploader client for Python.
#
# Copyright 2010 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""Client library for Camlistore."""

__author__ = 'Brett Slatkin (bslatkin@gmail.com)'

import base64
import cStringIO
import hashlib
import httplib
import logging
import mimetools
import urllib
import urlparse

import simplejson

__all__ = ['Error', 'ServerError', 'PayloadError', 'BUFFER_SIZE', 'CamliOp']


# Default chunk size for hashing and upload buffering (512 KiB).
BUFFER_SIZE = 512 * 1024


class Error(Exception):
  """Base class for exceptions in this module."""


class ServerError(Error):
  """An unexpected error was returned by the server."""


class PayloadError(ServerError):
  """Something about a data payload was bad."""


def buffered_sha1(data, buffer_size=BUFFER_SIZE):
  """Calculates the sha1 hash of some data.

  Args:
    data: A string of data to write or an open file-like object. File-like
      objects will be seeked back to their original position before this
      function returns.
    buffer_size: How much data to munge at a time.

  Returns:
    Hex sha1 string.
  """
  compute = hashlib.sha1()
  if isinstance(data, basestring):
    compute.update(data)
  else:
    # Remember the caller's position so we can restore it afterwards.
    start = data.tell()
    while True:
      line = data.read(buffer_size)
      if line == '':
        break
      compute.update(line)
    data.seek(start)
  return compute.hexdigest()


class CamliOp(object):
  """Camlistore client class that is single threaded, using one socket."""

  def __init__(self,
               server_address,
               buffer_size=BUFFER_SIZE,
               create_connection=httplib.HTTPConnection,
               auth=None,
               basepath=""):
    """Initializer.

    Args:
      server_address: hostname:port for the server.
      buffer_size: Byte size to use for in-memory buffering for various
        client-related operations.
      create_connection: Use for testing.
      auth: Optional. 'username:password' to use for HTTP basic auth.
      basepath: Optional path suffix. e.g. if the server is at
        "localhost:3179/bs", the basepath should be "/bs".

    Raises:
      NameError: If basepath is supplied but is not of the form '/bs'.
    """
    self.server_address = server_address
    self.buffer_size = buffer_size
    self._create_connection = create_connection
    # Fixed: this was previously assigned to self._connection, but every
    # method in this class reads and writes self.connection (established
    # lazily by _setup_connection).
    self.connection = None
    self._authorization = ''
    self.basepath = ""
    if auth:
      if len(auth.split(':')) != 2:
        # Default to dummy username; current server doesn't care
        # TODO(jrabbit): care when necessary
        auth = "username:" + auth  # If username not given use the implicit default, 'username'
      self._authorization = ('Basic ' + base64.encodestring(auth).strip())
    if basepath:
      # Fixed: previously any value merely *containing* a '/' was accepted
      # (e.g. "bs/"); the path must actually begin with one, as the error
      # message has always stated. NameError is kept (instead of the more
      # conventional ValueError) for backward compatibility with callers.
      if not basepath.startswith('/'):
        raise NameError("basepath must be in form '/bs'")
      if basepath[-1] == '/':
        basepath = basepath[:-1]
      self.basepath = basepath

  def _setup_connection(self):
    """Sets up the HTTP connection."""
    self.connection = self._create_connection(self.server_address)

  def put_blobs(self, blobs):
    """Puts a set of blobs.

    Args:
      blobs: List of (data, blobref) tuples; list of open files; or list of
        blob data strings.

    Returns:
      The set of blobs that were actually uploaded. If all blobs are already
      present this set will be empty.

    Raises:
      ServerError if the server response is bad.
      PayloadError if the server response is not in the right format.
      OSError or IOError if reading any blobs breaks.
    """
    if isinstance(blobs, dict):
      raise TypeError('Must pass iterable of tuples, open files, or strings.')

    # Map each blobref to its blob data, computing blobrefs for inputs that
    # did not supply one.
    blobref_dict = {}
    for item in blobs:
      if isinstance(item, tuple):
        blob, blobref = item
      else:
        blob, blobref = item, None
      if blobref is None:
        blobref = 'sha1-' + buffered_sha1(blob, buffer_size=self.buffer_size)
      blobref_dict[blobref] = blob

    # Ask the server which of these blobs it already has ("stat"/preupload).
    preupload = {'camliversion': '1'}
    for index, blobref in enumerate(blobref_dict.keys()):
      preupload['blob%d' % (index+1)] = blobref

    # TODO: What is the max number of blobs that can be specified in a
    # preupload request? The server probably has some reasonable limit and
    # after that we need to do batching in smaller groups.

    self._setup_connection()
    if self.basepath:
      fullpath = self.basepath + '/camli/stat'
    else:
      fullpath = '/camli/stat'
    self.connection.request(
        'POST', fullpath, urllib.urlencode(preupload),
        {'Content-Type': 'application/x-www-form-urlencoded',
         'Authorization': self._authorization})
    response = self.connection.getresponse()
    logging.debug('Preupload HTTP response: %d %s',
                  response.status, response.reason)
    if response.status != 200:
      raise ServerError('Bad preupload response status: %d %s' %
                        (response.status, response.reason))

    data = response.read()
    try:
      response_dict = simplejson.loads(data)
    except simplejson.decoder.JSONDecodeError:
      raise PayloadError('Server returned bad preupload response: %r' % data)

    logging.debug('Parsed preupload response: %r', response_dict)
    if 'stat' not in response_dict:
      raise PayloadError(
          'Could not find "stat" in preupload response: %r' %
          response_dict)
    if 'uploadUrl' not in response_dict:
      raise PayloadError(
          'Could not find "uploadUrl" in preupload response: %r' %
          response_dict)

    already_have_blobrefs = set()
    for blobref_json in response_dict['stat']:
      if 'blobRef' not in blobref_json:
        raise PayloadError(
            'Cannot find "blobRef" in preupload response: %r',
            response_dict)
      already_have_blobrefs.add(blobref_json['blobRef'])
    logging.debug('Already have blobs: %r', already_have_blobrefs)

    missing_blobrefs = set(blobref_dict.iterkeys())
    missing_blobrefs.difference_update(already_have_blobrefs)
    if not missing_blobrefs:
      logging.debug('All blobs already present.')
      # Fixed: previously returned None here, but the docstring promises an
      # empty set when everything is already present.
      return set()

    # TODO(bslatkin): Figure out the 'Content-Length' header value by looking
    # at the size of the files by seeking; required for multipart POST.
    # Build a multipart/form-data body in memory containing only the blobs
    # the server reported missing.
    out = cStringIO.StringIO()
    boundary = mimetools.choose_boundary()
    boundary_start = '--' + boundary

    blob_number = 0
    for blobref in blobref_dict.iterkeys():
      if blobref in already_have_blobrefs:
        logging.debug('Already have blobref=%s', blobref)
        continue
      blob = blobref_dict[blobref]
      blob_number += 1

      out.write(boundary_start)
      out.write('\r\nContent-Type: application/octet-stream\r\n')
      out.write('Content-Disposition: form-data; name="%s"; '
                'filename="%d"\r\n\r\n' % (blobref, blob_number))
      if isinstance(blob, basestring):
        out.write(blob)
      else:
        while True:
          buf = blob.read(self.buffer_size)
          if buf == '':
            break
          out.write(buf)
      out.write('\r\n')
    out.write(boundary_start)
    out.write('--\r\n')
    request_body = out.getvalue()

    # Strip scheme and host so we POST to a path on the same connection.
    pieces = list(urlparse.urlparse(response_dict['uploadUrl']))
    # TODO: Support upload servers on another base URL.
    pieces[0], pieces[1] = '', ''
    relative_url = urlparse.urlunparse(pieces)
    self.connection.request(
        'POST', relative_url, request_body,
        {'Content-Type': 'multipart/form-data; boundary="%s"' % boundary,
         'Content-Length': str(len(request_body)),
         'Authorization': self._authorization})

    response = self.connection.getresponse()
    logging.debug('Upload response: %d %s', response.status, response.reason)
    if response.status not in (200, 301, 302, 303):
      raise ServerError('Bad upload response status: %d %s' %
                        (response.status, response.reason))

    # Follow redirects (e.g. to the upload-complete status page).
    while response.status in (301, 302, 303):
      # TODO(bslatkin): Support connections to servers on different addresses
      # after redirects. For now just send another request to the same server.
      location = response.getheader('Location')
      pieces = list(urlparse.urlparse(location))
      pieces[0], pieces[1] = '', ''
      new_relative_url = urlparse.urlunparse(pieces)
      logging.debug('Redirect %s -> %s', relative_url, new_relative_url)
      relative_url = new_relative_url
      self.connection.request('GET', relative_url, headers={
          'Authorization': self._authorization})
      response = self.connection.getresponse()

    if response.status != 200:
      raise ServerError('Bad upload response status: %d %s' %
                        (response.status, response.reason))

    data = response.read()
    try:
      response_dict = simplejson.loads(data)
    except simplejson.decoder.JSONDecodeError:
      raise PayloadError('Server returned bad upload response: %r' % data)

    if 'received' not in response_dict:
      raise PayloadError('Could not find "received" in upload response: %r' %
                         response_dict)

    received_blobrefs = set()
    for blobref_json in response_dict['received']:
      if 'blobRef' not in blobref_json:
        raise PayloadError(
            'Cannot find "blobRef" in upload response: %r',
            response_dict)
      received_blobrefs.add(blobref_json['blobRef'])
    logging.debug('Received blobs: %r', received_blobrefs)

    missing_blobrefs.difference_update(received_blobrefs)
    if missing_blobrefs:
      # TODO: Try to upload the missing ones.
      raise ServerError('Some blobs not uploaded: %r', missing_blobrefs)

    logging.debug('Upload of %d blobs successful.', len(blobref_dict))
    return received_blobrefs

  def get_blobs(self,
                blobref_list,
                start_out=None,
                end_out=None,
                check_sha1=True):
    """Gets a set of blobs.

    Args:
      blobref_list: A single blobref as a string or an iterable of strings that
        are blobrefs.
      start_out: Optional. A function taking the blobref's key, returns a
        file-like object to which the blob should be written. Called before
        the blob has started any writing.
      end_out: Optional along with start_out. A function that takes the
        blobref and open file-like object that does proper cleanup and closing
        of the file. Called when all of the file's contents have been written.
      check_sha1: Double-check that the file's contents match the blobref.

    Returns:
      If start_out is not supplied, then all blobs will be kept in memory. If
      blobref_list is a single blobref, then the return value will be a string
      with the blob data or None if the blob was not present. If blobref_list
      was iterable, the return value will be a dictionary mapping blobref to
      blob data for each blob that was found.

      If start_out is supplied, the return value will be None. Callers can
      check for missing blobs by comparing their own input of the blobref_list
      argument to the blobrefs that are passed to start_out.

    Raises:
      ServerError if the server response is invalid for whatever reason.
      OSError or IOError if writing to any files breaks.
    """
    multiple = not isinstance(blobref_list, basestring)
    # Fixed: a bare string used to be iterated character-by-character below,
    # issuing one bogus GET per character. Normalize it to a one-element list.
    if not multiple:
      blobref_list = [blobref_list]
    result = {}
    if start_out is None:
      def start_out(blobref):
        buffer = cStringIO.StringIO()
        return buffer

      def end_out(blobref, file_like):
        result[blobref] = file_like.getvalue()
    else:
      result = None  # Rely on user-supplied start_out for reporting blobrefs.
      if end_out is None:
        def end_out(blobref, file_like):
          file_like.close()

    self._setup_connection()

    # Note, we could use a 'preupload' here as a quick, bulk existence check,
    # but that may not always work depending on the access the user has.
    # It's possible the user has read-only access, and thus can only do
    # GET or HEAD on objects.

    for blobref in blobref_list:
      logging.debug('Getting blobref=%s', blobref)
      if self.basepath:
        fullpath = self.basepath + '/camli/'
      else:
        fullpath = '/camli/'
      self.connection.request('GET', fullpath + blobref,
                              headers={'Authorization': self._authorization})
      response = self.connection.getresponse()
      if response.status == 404:
        logging.debug('Server does not have blobref=%s', blobref)
        continue
      elif response.status != 200:
        raise ServerError('Bad response status: %d %s' %
                          (response.status, response.reason))

      if check_sha1:
        compute_hash = hashlib.sha1()

      out_file = start_out(blobref)
      while True:
        buf = response.read(self.buffer_size)
        if buf == '':
          end_out(blobref, out_file)
          break

        if check_sha1:
          compute_hash.update(buf)

        out_file.write(buf)

      if check_sha1:
        found = 'sha1-' + compute_hash.hexdigest()
        if found != blobref:
          raise ValueError('sha1 hash of blobref does not match; '
                           'found %s, expected %s' % (found, blobref))

    if result is not None and not multiple:
      if result:
        return result.values()[0]
      # Fixed: previously a single missing blob returned {} instead of the
      # documented None.
      return None
    return result