# Source: github.com/google/grumpy@v0.0.0-20171122020858-3ec87959189c/third_party/stdlib/csv.py

"""
csv.py - read/write/investigate CSV files
"""

import re
import functools
reduce = functools.reduce
# from functools import reduce

# TODO: Support from foo import * syntax.
# Until then, copy every public name of _csv into this module's namespace.
import _csv
for name in _csv.__all__:
    globals()[name] = getattr(_csv, name)

# from _csv import Error, __version__, writer, reader, register_dialect, \
#                  unregister_dialect, get_dialect, list_dialects, \
#                  field_size_limit, \
#                  QUOTE_MINIMAL, QUOTE_ALL, QUOTE_NONNUMERIC, QUOTE_NONE, \
#                  __doc__
# from _csv import Dialect as _Dialect
_Dialect = _csv.Dialect

import StringIO as _StringIO
StringIO = _StringIO.StringIO
# try:
#     from cStringIO import StringIO
# except ImportError:
#     from StringIO import StringIO

__all__ = [ "QUOTE_MINIMAL", "QUOTE_ALL", "QUOTE_NONNUMERIC", "QUOTE_NONE",
            "Error", "Dialect", "__doc__", "excel", "excel_tab",
            "field_size_limit", "reader", "writer",
            "register_dialect", "get_dialect", "list_dialects", "Sniffer",
            "unregister_dialect", "__version__", "DictReader", "DictWriter" ]


class Dialect(object):
    """Describe an Excel dialect.

    This must be subclassed (see csv.excel).  Valid attributes are:
    delimiter, quotechar, escapechar, doublequote, skipinitialspace,
    lineterminator, quoting.

    """
    _name = ""
    _valid = False
    # placeholders
    delimiter = None
    quotechar = None
    escapechar = None
    doublequote = None
    skipinitialspace = None
    lineterminator = None
    quoting = None

    def __init__(self):
        # Only subclasses are flagged as valid; validation runs either way.
        if self.__class__ != Dialect:
            self._valid = True
        self._validate()

    def _validate(self):
        """Check the attributes via _csv.Dialect; raise Error if rejected."""
        try:
            _Dialect(self)
        except TypeError as e:
            # We do this for compatibility with py2.3
            raise Error(str(e))


class excel(Dialect):
    """Describe the usual properties of Excel-generated CSV files."""
    delimiter = ','
    quotechar = '"'
    doublequote = True
    skipinitialspace = False
    lineterminator = '\r\n'
    quoting = QUOTE_MINIMAL
register_dialect("excel", excel)


class excel_tab(excel):
    """Describe the usual properties of Excel-generated TAB-delimited files."""
    delimiter = '\t'
register_dialect("excel-tab", excel_tab)


class DictReader(object):
    """Iterate over CSV rows as dicts keyed by field name.

    When fieldnames is None the first row read from f supplies the keys.
    Values beyond the known field names are collected in a list under
    restkey; fields missing from a short row are filled with restval.
    """

    def __init__(self, f, fieldnames=None, restkey=None, restval=None,
                 dialect="excel", *args, **kwds):
        self._fieldnames = fieldnames   # list of keys for the dict
        self.restkey = restkey          # key to catch long rows
        self.restval = restval          # default value for short rows
        self.reader = reader(f, dialect, *args, **kwds)
        self.dialect = dialect
        self.line_num = 0

    def __iter__(self):
        return self

    # @property
    def fieldnames(self):
        # Lazily consume the header row the first time the names are needed.
        if self._fieldnames is None:
            try:
                self._fieldnames = self.reader.next()
            except StopIteration:
                pass
        self.line_num = self.reader.line_num
        return self._fieldnames
    fieldnames = property(fieldnames)

    # The upstream note for issue 20004 warned that this setter was ignored
    # because DictReader was a classic class.  In this file DictReader
    # derives from object, so that caveat should not apply here.
    # NOTE(review): confirm against the target runtime; assigning
    # _fieldnames directly remains a safe alternative.
    @fieldnames.setter
    def fieldnames(self, value):
        self._fieldnames = value

    def next(self):
        """Return the next non-blank row as a dict."""
        if self.line_num == 0:
            # Used only for its side effect.
            self.fieldnames
        row = self.reader.next()
        self.line_num = self.reader.line_num

        # unlike the basic reader, we prefer not to return blanks,
        # because we will typically wind up with a dict full of None
        # values
        while row == []:
            row = self.reader.next()
        d = dict(zip(self.fieldnames, row))
        lf = len(self.fieldnames)
        lr = len(row)
        if lf < lr:
            # long row: surplus values go under restkey
            d[self.restkey] = row[lf:]
        elif lf > lr:
            # short row: pad the missing fields with restval
            for key in self.fieldnames[lr:]:
                d[key] = self.restval
        return d


class DictWriter(object):
    """Write dicts as CSV rows, ordering values by fieldnames.

    restval is written for keys missing from a row dict.  extrasaction
    decides what happens when a row dict has keys that are not in
    fieldnames: "raise" raises ValueError, "ignore" drops them.  The value
    is matched case-insensitively and stored lower-cased.
    """

    def __init__(self, f, fieldnames, restval="", extrasaction="raise",
                 dialect="excel", *args, **kwds):
        self.fieldnames = fieldnames    # list of keys for the dict
        self.restval = restval          # for writing short dicts
        # Store the normalised spelling: _dict_to_list compares against the
        # literal "raise", so keeping e.g. "RAISE" as given would pass this
        # validation and then silently behave like "ignore".
        normalized = extrasaction.lower()
        if normalized not in ("raise", "ignore"):
            raise ValueError("extrasaction (%s) must be 'raise' or 'ignore'"
                             % extrasaction)
        self.extrasaction = normalized
        self.writer = writer(f, dialect, *args, **kwds)

    def writeheader(self):
        """Write a row containing the field names themselves."""
        header = dict(zip(self.fieldnames, self.fieldnames))
        self.writerow(header)

    def _dict_to_list(self, rowdict):
        """Return rowdict's values in fieldnames order.

        Raises ValueError for unknown keys when extrasaction is "raise".
        """
        if self.extrasaction == "raise":
            wrong_fields = [k for k in rowdict if k not in self.fieldnames]
            if wrong_fields:
                raise ValueError("dict contains fields not in fieldnames: "
                                 + ", ".join([repr(x) for x in wrong_fields]))
        return [rowdict.get(key, self.restval) for key in self.fieldnames]

    def writerow(self, rowdict):
        return self.writer.writerow(self._dict_to_list(rowdict))

    def writerows(self, rowdicts):
        rows = []
        for rowdict in rowdicts:
            rows.append(self._dict_to_list(rowdict))
        return self.writer.writerows(rows)

# Guard Sniffer's type checking against builds that exclude complex()
# try:
#     complex
# except NameError:
#     complex = float
# The guarded form above is disabled: complex is aliased to float
# unconditionally, so has_header() never attempts a real complex() parse.
complex = float


class Sniffer(object):
    '''
    "Sniffs" the format of a CSV file (i.e. delimiter, quotechar)
    Returns a Dialect object.
    '''
    def __init__(self):
        # in case there is more than one possible delimiter
        self.preferred = [',', '\t', ';', ' ', ':']

    def sniff(self, sample, delimiters=None):
        """
        Returns a Dialect subclass describing the sample.

        delimiters, when given, restricts the candidate delimiter
        characters.  Raises Error if no delimiter can be determined.
        """

        quotechar, doublequote, delimiter, skipinitialspace = \
            self._guess_quote_and_delimiter(sample, delimiters)
        if not delimiter:
            # no quoted fields to learn from; fall back to frequency analysis
            delimiter, skipinitialspace = self._guess_delimiter(sample,
                                                                delimiters)

        if not delimiter:
            raise Error("Could not determine delimiter")

        class dialect(Dialect):
            _name = "sniffed"
            lineterminator = '\r\n'
            quoting = QUOTE_MINIMAL
            # escapechar = ''

        dialect.doublequote = doublequote
        dialect.delimiter = delimiter
        # _csv.reader won't accept a quotechar of ''
        dialect.quotechar = quotechar or '"'
        dialect.skipinitialspace = skipinitialspace

        return dialect

    def _guess_quote_and_delimiter(self, data, delimiters):
        """
        Looks for text enclosed between two identical quotes
        (the probable quotechar) which are preceded and followed
        by the same character (the probable delimiter).
        For example:
                         ,'some text',
        The quote with the most wins, same with the delimiter.
        If there is no quotechar the delimiter can't be determined
        this way.

        Returns (quotechar, doublequote, delimiter, skipinitialspace);
        ('', False, None, 0) when no quoted text is found.
        """

        patterns = (
            # ,".*?",
            r'(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?P=delim)',
            #  ".*?",
            r'(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?P<delim>[^\w\n"\'])(?P<space> ?)',
            # ,".*?"
            # (a stray '>' after the group name used to require a literal
            # '>' before the delimiter, so this form almost never matched)
            r'(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?:$|\n)',
            #  ".*?" (no delim, no space)
            r'(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?:$|\n)',
        )

        matches = []
        for restr in patterns:
            regexp = re.compile(restr, re.DOTALL | re.MULTILINE)
            matches = regexp.findall(data)
            if matches:
                break

        if not matches:
            # (quotechar, doublequote, delimiter, skipinitialspace)
            return ('', False, None, 0)

        # Group positions depend on which pattern matched; resolve them once.
        groupindex = regexp.groupindex
        quote_pos = groupindex['quote'] - 1
        # Only the last pattern lacks the delim/space groups (it has both or
        # neither).  With a single group findall() yields plain strings, and
        # m[0] is then the quote character itself.
        has_delim = 'delim' in groupindex
        if has_delim:
            delim_pos = groupindex['delim'] - 1
            space_pos = groupindex['space'] - 1

        quotes = {}
        delims = {}
        spaces = 0
        for m in matches:
            key = m[quote_pos]
            if key:
                quotes[key] = quotes.get(key, 0) + 1
            if not has_delim:
                continue
            key = m[delim_pos]
            if key and (delimiters is None or key in delimiters):
                delims[key] = delims.get(key, 0) + 1
            if m[space_pos]:
                spaces += 1

        # most frequent quote character (ties go to the later key)
        quotechar = reduce(lambda a, b: a if quotes[a] > quotes[b] else b,
                           quotes.keys())

        if delims:
            delim = reduce(lambda a, b: a if delims[a] > delims[b] else b,
                           delims.keys())
            skipinitialspace = delims[delim] == spaces
            if delim == '\n': # most likely a file with a single column
                delim = ''
        else:
            # there is *no* delimiter, it's a single column of quoted data
            delim = ''
            skipinitialspace = 0

        # if we see an extra quote between delimiters, we've got a
        # double quoted format
        dq_regexp = re.compile(
            r"((%(delim)s)|^)\W*%(quote)s[^%(delim)s\n]*%(quote)s[^%(delim)s\n]*%(quote)s\W*((%(delim)s)|$)" % \
            {'delim':re.escape(delim), 'quote':quotechar}, re.MULTILINE)

        if dq_regexp.search(data):
            doublequote = True
        else:
            doublequote = False

        return (quotechar, doublequote, delim, skipinitialspace)

    def _has_initial_space(self, line, delim):
        """Return True when every delim in line is followed by a space."""
        return line.count(delim) == line.count("%c " % delim)

    def _guess_delimiter(self, data, delimiters):
        """
        The delimiter /should/ occur the same number of times on
        each row. However, due to malformed data, it may not. We don't want
        an all or nothing approach, so we allow for small variations in this
        number.
          1) build a table of the frequency of each character on every line.
          2) build a table of frequencies of this frequency (meta-frequency?),
             e.g.  'x occurred 5 times in 10 rows, 6 times in 1000 rows,
             7 times in 2 rows'
          3) use the mode of the meta-frequency to determine the /expected/
             frequency for that character
          4) find out how often the character actually meets that goal
          5) the character that best meets its goal is the delimiter
        For performance reasons, the data is evaluated in chunks, so it can
        try and evaluate the smallest portion of the data possible, evaluating
        additional chunks as necessary.

        Returns (delimiter, skipinitialspace); ('', 0) when nothing fits.
        """

        # drop empty lines; built as a real list because it is indexed,
        # sliced and measured below
        rows = [line for line in data.split('\n') if line]

        candidates = [chr(c) for c in range(127)] # 7-bit ASCII

        # build frequency tables
        chunkLength = min(10, len(rows))
        iteration = 0
        charFrequency = {}
        modes = {}
        delims = {}
        start, end = 0, chunkLength
        while start < len(rows):
            iteration += 1
            for line in rows[start:end]:
                for char in candidates:
                    metaFrequency = charFrequency.get(char, {})
                    # must count even if frequency is 0
                    freq = line.count(char)
                    # value is the mode
                    metaFrequency[freq] = metaFrequency.get(freq, 0) + 1
                    charFrequency[char] = metaFrequency

            for char in charFrequency.keys():
                items = list(charFrequency[char].items())
                if len(items) == 1 and items[0][0] == 0:
                    # character never occurs at all
                    continue
                # get the mode of the frequencies
                if len(items) > 1:
                    mode = reduce(lambda a, b: a if a[1] > b[1] else b,
                                  items)
                    # adjust the mode - subtract the sum of all
                    # other frequencies
                    items.remove(mode)
                    others = reduce(lambda a, b: (0, a[1] + b[1]),
                                    items)[1]
                    modes[char] = (mode[0], mode[1] - others)
                else:
                    modes[char] = items[0]

            # build a list of possible delimiters
            modeList = list(modes.items())
            # Rows analysed so far.  The last chunk may be partial, so cap
            # at the real row count; otherwise consistency is understated.
            total = float(min(chunkLength * iteration, len(rows)))
            # (rows of consistent data) / (number of rows) = 100%
            consistency = 1.0
            # minimum consistency threshold
            threshold = 0.9
            while len(delims) == 0 and consistency >= threshold:
                for k, v in modeList:
                    if v[0] > 0 and v[1] > 0:
                        if ((v[1]/total) >= consistency and
                            (delimiters is None or k in delimiters)):
                            delims[k] = v
                consistency -= 0.01

            if len(delims) == 1:
                delim = list(delims.keys())[0]
                return (delim, self._has_initial_space(rows[0], delim))

            # analyze another chunkLength lines
            start = end
            end += chunkLength

        if not delims:
            return ('', 0)

        # if there's more than one, fall back to a 'preferred' list
        if len(delims) > 1:
            for d in self.preferred:
                if d in delims:
                    return (d, self._has_initial_space(rows[0], d))

        # nothing else indicates a preference, pick the character that
        # dominates(?)
        ranked = sorted([(v, k) for (k, v) in delims.items()])
        delim = ranked[-1][1]

        return (delim, self._has_initial_space(rows[0], delim))

    def has_header(self, sample):
        """Guess whether the first row of sample is a header row."""
        # Creates a dictionary of types of data in each column. If any
        # column is of a single type (say, integers), *except* for the first
        # row, then the first row is presumed to be labels. If the type
        # can't be determined, it is assumed to be a string in which case
        # the length of the string is the determining factor: if all of the
        # rows except for the first are the same length, it's a header.
        # Finally, a 'vote' is taken at the end for each column, adding or
        # subtracting from the likelihood of the first row being a header.

        rdr = reader(StringIO(sample), self.sniff(sample))

        header = rdr.next() # assume first row is header

        columns = len(header)
        columnTypes = {}
        for i in range(columns):
            columnTypes[i] = None

        checked = 0
        for row in rdr:
            # arbitrary number of rows to check, to keep it sane
            if checked > 20:
                break
            checked += 1

            if len(row) != columns:
                continue # skip rows that have irregular number of columns

            # iterate over a snapshot: columns are deleted inside the loop
            for col in list(columnTypes.keys()):

                for thisType in [int, long, float, complex]:
                    try:
                        thisType(row[col])
                        break
                    except (ValueError, OverflowError):
                        pass
                else:
                    # fallback to length of string
                    thisType = len(row[col])

                # treat longs as ints
                if thisType == long:
                    thisType = int

                if thisType != columnTypes[col]:
                    if columnTypes[col] is None: # add new column type
                        columnTypes[col] = thisType
                    else:
                        # type is inconsistent, remove column from
                        # consideration
                        del columnTypes[col]

        # finally, compare results against first row and "vote"
        # on whether it's a header
        hasHeader = 0
        for col, colType in columnTypes.items():
            if isinstance(colType, int): # it's a length
                if len(header[col]) != colType:
                    hasHeader += 1
                else:
                    hasHeader -= 1
            else: # attempt typecast
                try:
                    colType(header[col])
                except (ValueError, TypeError):
                    hasHeader += 1
                else:
                    hasHeader -= 1

        return hasHeader > 0