github.com/google/grumpy@v0.0.0-20171122020858-3ec87959189c/third_party/stdlib/csv.py (about)

     1  
     2  """
     3  csv.py - read/write/investigate CSV files
     4  """
     5  
     6  import re
     7  import functools
     8  reduce = functools.reduce
     9  # from functools import reduce
    10  
# TODO: Support from foo import * syntax.
# Until then, emulate ``from _csv import *`` by hand: every name listed in
# _csv.__all__ is copied into this module's globals.  Note that the loop
# variable ``name`` is left behind as a module-level global.
import _csv
for name in _csv.__all__:
  globals()[name] = getattr(_csv, name)
    15  
    16  # from _csv import Error, __version__, writer, reader, register_dialect, \
    17  #                  unregister_dialect, get_dialect, list_dialects, \
    18  #                  field_size_limit, \
    19  #                  QUOTE_MINIMAL, QUOTE_ALL, QUOTE_NONNUMERIC, QUOTE_NONE, \
    20  #                  __doc__
    21  # from _csv import Dialect as _Dialect
    22  _Dialect = _csv.Dialect
    23  
    24  import StringIO as _StringIO
    25  StringIO = _StringIO.StringIO
    26  # try:
    27  #     from cStringIO import StringIO
    28  # except ImportError:
    29  #     from StringIO import StringIO
    30  
# Public names of this module.  Dialect, excel, excel_tab, Sniffer,
# DictReader and DictWriter are defined in this file; the remaining names
# are not defined here and are presumably supplied by the _csv re-export
# loop near the top of the file (verify against _csv.__all__).
__all__ = [ "QUOTE_MINIMAL", "QUOTE_ALL", "QUOTE_NONNUMERIC", "QUOTE_NONE",
            "Error", "Dialect", "__doc__", "excel", "excel_tab",
            "field_size_limit", "reader", "writer",
            "register_dialect", "get_dialect", "list_dialects", "Sniffer",
            "unregister_dialect", "__version__", "DictReader", "DictWriter" ]
    36  
    37  class Dialect(object):
    38      """Describe an Excel dialect.
    39  
    40      This must be subclassed (see csv.excel).  Valid attributes are:
    41      delimiter, quotechar, escapechar, doublequote, skipinitialspace,
    42      lineterminator, quoting.
    43  
    44      """
    45      _name = ""
    46      _valid = False
    47      # placeholders
    48      delimiter = None
    49      quotechar = None
    50      escapechar = None
    51      doublequote = None
    52      skipinitialspace = None
    53      lineterminator = None
    54      quoting = None
    55  
    56      def __init__(self):
    57          if self.__class__ != Dialect:
    58              self._valid = True
    59          self._validate()
    60  
    61      def _validate(self):
    62          try:
    63              _Dialect(self)
    64          except TypeError, e:
    65              # We do this for compatibility with py2.3
    66              raise Error(str(e))
    67  
    68  class excel(Dialect):
    69      """Describe the usual properties of Excel-generated CSV files."""
    70      delimiter = ','
    71      quotechar = '"'
    72      doublequote = True
    73      skipinitialspace = False
    74      lineterminator = '\r\n'
    75      quoting = QUOTE_MINIMAL
    76  register_dialect("excel", excel)
    77  
    78  class excel_tab(excel):
    79      """Describe the usual properties of Excel-generated TAB-delimited files."""
    80      delimiter = '\t'
    81  register_dialect("excel-tab", excel_tab)
    82  
    83  
    84  class DictReader(object):
    85      def __init__(self, f, fieldnames=None, restkey=None, restval=None,
    86                   dialect="excel", *args, **kwds):
    87          self._fieldnames = fieldnames   # list of keys for the dict
    88          self.restkey = restkey          # key to catch long rows
    89          self.restval = restval          # default value for short rows
    90          self.reader = reader(f, dialect, *args, **kwds)
    91          self.dialect = dialect
    92          self.line_num = 0
    93  
    94      def __iter__(self):
    95          return self
    96  
    97      # @property
    98      def fieldnames(self):
    99          if self._fieldnames is None:
   100              try:
   101                  self._fieldnames = self.reader.next()
   102              except StopIteration:
   103                  pass
   104          self.line_num = self.reader.line_num
   105          return self._fieldnames
   106      fieldnames = property(fieldnames)
   107  
   108      # Issue 20004: Because DictReader is a classic class, this setter is
   109      # ignored.  At this point in 2.7's lifecycle, it is too late to change the
   110      # base class for fear of breaking working code.  If you want to change
   111      # fieldnames without overwriting the getter, set _fieldnames directly.
   112      @fieldnames.setter
   113      def fieldnames(self, value):
   114          self._fieldnames = value
   115  
   116      def next(self):
   117          if self.line_num == 0:
   118              # Used only for its side effect.
   119              self.fieldnames
   120          row = self.reader.next()
   121          self.line_num = self.reader.line_num
   122  
   123          # unlike the basic reader, we prefer not to return blanks,
   124          # because we will typically wind up with a dict full of None
   125          # values
   126          while row == []:
   127              row = self.reader.next()
   128          d = dict(zip(self.fieldnames, row))
   129          lf = len(self.fieldnames)
   130          lr = len(row)
   131          if lf < lr:
   132              d[self.restkey] = row[lf:]
   133          elif lf > lr:
   134              for key in self.fieldnames[lr:]:
   135                  d[key] = self.restval
   136          return d
   137  
   138  
   139  class DictWriter(object):
   140      def __init__(self, f, fieldnames, restval="", extrasaction="raise",
   141                   dialect="excel", *args, **kwds):
   142          self.fieldnames = fieldnames    # list of keys for the dict
   143          self.restval = restval          # for writing short dicts
   144          if extrasaction.lower() not in ("raise", "ignore"):
   145              raise ValueError, \
   146                    ("extrasaction (%s) must be 'raise' or 'ignore'" %
   147                     extrasaction)
   148          self.extrasaction = extrasaction
   149          self.writer = writer(f, dialect, *args, **kwds)
   150  
   151      def writeheader(self):
   152          header = dict(zip(self.fieldnames, self.fieldnames))
   153          self.writerow(header)
   154  
   155      def _dict_to_list(self, rowdict):
   156          if self.extrasaction == "raise":
   157              wrong_fields = [k for k in rowdict if k not in self.fieldnames]
   158              if wrong_fields:
   159                  raise ValueError("dict contains fields not in fieldnames: "
   160                                   + ", ".join([repr(x) for x in wrong_fields]))
   161          return [rowdict.get(key, self.restval) for key in self.fieldnames]
   162  
   163      def writerow(self, rowdict):
   164          return self.writer.writerow(self._dict_to_list(rowdict))
   165  
   166      def writerows(self, rowdicts):
   167          rows = []
   168          for rowdict in rowdicts:
   169              rows.append(self._dict_to_list(rowdict))
   170          return self.writer.writerows(rows)
   171  
# Guard Sniffer's type checking against builds that exclude complex()
# try:
#     complex
# except NameError:
#     complex = float
# The conditional guard above is disabled; ``complex`` is rebound to
# ``float`` unconditionally at module level, so the type probe list in
# Sniffer.has_header ends with a second float attempt.
# NOTE(review): presumably the target runtime has no complex() - confirm.
complex = float
   178  
   179  class Sniffer(object):
   180      '''
   181      "Sniffs" the format of a CSV file (i.e. delimiter, quotechar)
   182      Returns a Dialect object.
   183      '''
   184      def __init__(self):
   185          # in case there is more than one possible delimiter
   186          self.preferred = [',', '\t', ';', ' ', ':']
   187  
   188  
   189      def sniff(self, sample, delimiters=None):
   190          """
   191          Returns a dialect (or None) corresponding to the sample
   192          """
   193  
   194          quotechar, doublequote, delimiter, skipinitialspace = \
   195                     self._guess_quote_and_delimiter(sample, delimiters)
   196          if not delimiter:
   197              delimiter, skipinitialspace = self._guess_delimiter(sample,
   198                                                                  delimiters)
   199  
   200          if not delimiter:
   201              raise Error, "Could not determine delimiter"
   202  
   203          class dialect(Dialect):
   204              _name = "sniffed"
   205              lineterminator = '\r\n'
   206              quoting = QUOTE_MINIMAL
   207              # escapechar = ''
   208  
   209          dialect.doublequote = doublequote
   210          dialect.delimiter = delimiter
   211          # _csv.reader won't accept a quotechar of ''
   212          dialect.quotechar = quotechar or '"'
   213          dialect.skipinitialspace = skipinitialspace
   214  
   215          return dialect
   216  
   217  
   218      def _guess_quote_and_delimiter(self, data, delimiters):
   219          """
   220          Looks for text enclosed between two identical quotes
   221          (the probable quotechar) which are preceded and followed
   222          by the same character (the probable delimiter).
   223          For example:
   224                           ,'some text',
   225          The quote with the most wins, same with the delimiter.
   226          If there is no quotechar the delimiter can't be determined
   227          this way.
   228          """
   229  
   230          matches = []
   231          for restr in ('(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?P=delim)', # ,".*?",
   232                        '(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?P<delim>[^\w\n"\'])(?P<space> ?)',   #  ".*?",
   233                        '(?P<delim>>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?:$|\n)',  # ,".*?"
   234                        '(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?:$|\n)'):                            #  ".*?" (no delim, no space)
   235              regexp = re.compile(restr, re.DOTALL | re.MULTILINE)
   236              matches = regexp.findall(data)
   237              if matches:
   238                  break
   239  
   240          if not matches:
   241              # (quotechar, doublequote, delimiter, skipinitialspace)
   242              return ('', False, None, 0)
   243          quotes = {}
   244          delims = {}
   245          spaces = 0
   246          for m in matches:
   247              n = regexp.groupindex['quote'] - 1
   248              key = m[n]
   249              if key:
   250                  quotes[key] = quotes.get(key, 0) + 1
   251              try:
   252                  n = regexp.groupindex['delim'] - 1
   253                  key = m[n]
   254              except KeyError:
   255                  continue
   256              if key and (delimiters is None or key in delimiters):
   257                  delims[key] = delims.get(key, 0) + 1
   258              try:
   259                  n = regexp.groupindex['space'] - 1
   260              except KeyError:
   261                  continue
   262              if m[n]:
   263                  spaces += 1
   264  
   265          quotechar = reduce(lambda a, b, quotes = quotes:
   266                             (quotes[a] > quotes[b]) and a or b, quotes.keys())
   267  
   268          if delims:
   269              delim = reduce(lambda a, b, delims = delims:
   270                             (delims[a] > delims[b]) and a or b, delims.keys())
   271              skipinitialspace = delims[delim] == spaces
   272              if delim == '\n': # most likely a file with a single column
   273                  delim = ''
   274          else:
   275              # there is *no* delimiter, it's a single column of quoted data
   276              delim = ''
   277              skipinitialspace = 0
   278  
   279          # if we see an extra quote between delimiters, we've got a
   280          # double quoted format
   281          dq_regexp = re.compile(
   282                                 r"((%(delim)s)|^)\W*%(quote)s[^%(delim)s\n]*%(quote)s[^%(delim)s\n]*%(quote)s\W*((%(delim)s)|$)" % \
   283                                 {'delim':re.escape(delim), 'quote':quotechar}, re.MULTILINE)
   284  
   285  
   286  
   287          if dq_regexp.search(data):
   288              doublequote = True
   289          else:
   290              doublequote = False
   291  
   292          return (quotechar, doublequote, delim, skipinitialspace)
   293  
   294  
   295      def _guess_delimiter(self, data, delimiters):
   296          """
   297          The delimiter /should/ occur the same number of times on
   298          each row. However, due to malformed data, it may not. We don't want
   299          an all or nothing approach, so we allow for small variations in this
   300          number.
   301            1) build a table of the frequency of each character on every line.
   302            2) build a table of frequencies of this frequency (meta-frequency?),
   303               e.g.  'x occurred 5 times in 10 rows, 6 times in 1000 rows,
   304               7 times in 2 rows'
   305            3) use the mode of the meta-frequency to determine the /expected/
   306               frequency for that character
   307            4) find out how often the character actually meets that goal
   308            5) the character that best meets its goal is the delimiter
   309          For performance reasons, the data is evaluated in chunks, so it can
   310          try and evaluate the smallest portion of the data possible, evaluating
   311          additional chunks as necessary.
   312          """
   313  
   314          data = filter(None, data.split('\n'))
   315  
   316          ascii = [chr(c) for c in range(127)] # 7-bit ASCII
   317  
   318          # build frequency tables
   319          chunkLength = min(10, len(data))
   320          iteration = 0
   321          charFrequency = {}
   322          modes = {}
   323          delims = {}
   324          start, end = 0, min(chunkLength, len(data))
   325          while start < len(data):
   326              iteration += 1
   327              for line in data[start:end]:
   328                  for char in ascii:
   329                      metaFrequency = charFrequency.get(char, {})
   330                      # must count even if frequency is 0
   331                      freq = line.count(char)
   332                      # value is the mode
   333                      metaFrequency[freq] = metaFrequency.get(freq, 0) + 1
   334                      charFrequency[char] = metaFrequency
   335  
   336              for char in charFrequency.keys():
   337                  items = charFrequency[char].items()
   338                  if len(items) == 1 and items[0][0] == 0:
   339                      continue
   340                  # get the mode of the frequencies
   341                  if len(items) > 1:
   342                      modes[char] = reduce(lambda a, b: a[1] > b[1] and a or b,
   343                                           items)
   344                      # adjust the mode - subtract the sum of all
   345                      # other frequencies
   346                      items.remove(modes[char])
   347                      modes[char] = (modes[char][0], modes[char][1]
   348                                     - reduce(lambda a, b: (0, a[1] + b[1]),
   349                                              items)[1])
   350                  else:
   351                      modes[char] = items[0]
   352  
   353              # build a list of possible delimiters
   354              modeList = modes.items()
   355              total = float(chunkLength * iteration)
   356              # (rows of consistent data) / (number of rows) = 100%
   357              consistency = 1.0
   358              # minimum consistency threshold
   359              threshold = 0.9
   360              while len(delims) == 0 and consistency >= threshold:
   361                  for k, v in modeList:
   362                      if v[0] > 0 and v[1] > 0:
   363                          if ((v[1]/total) >= consistency and
   364                              (delimiters is None or k in delimiters)):
   365                              delims[k] = v
   366                  consistency -= 0.01
   367  
   368              if len(delims) == 1:
   369                  delim = delims.keys()[0]
   370                  skipinitialspace = (data[0].count(delim) ==
   371                                      data[0].count("%c " % delim))
   372                  return (delim, skipinitialspace)
   373  
   374              # analyze another chunkLength lines
   375              start = end
   376              end += chunkLength
   377  
   378          if not delims:
   379              return ('', 0)
   380  
   381          # if there's more than one, fall back to a 'preferred' list
   382          if len(delims) > 1:
   383              for d in self.preferred:
   384                  if d in delims.keys():
   385                      skipinitialspace = (data[0].count(d) ==
   386                                          data[0].count("%c " % d))
   387                      return (d, skipinitialspace)
   388  
   389          # nothing else indicates a preference, pick the character that
   390          # dominates(?)
   391          items = [(v,k) for (k,v) in delims.items()]
   392          items.sort()
   393          delim = items[-1][1]
   394  
   395          skipinitialspace = (data[0].count(delim) ==
   396                              data[0].count("%c " % delim))
   397          return (delim, skipinitialspace)
   398  
   399  
   400      def has_header(self, sample):
   401          # Creates a dictionary of types of data in each column. If any
   402          # column is of a single type (say, integers), *except* for the first
   403          # row, then the first row is presumed to be labels. If the type
   404          # can't be determined, it is assumed to be a string in which case
   405          # the length of the string is the determining factor: if all of the
   406          # rows except for the first are the same length, it's a header.
   407          # Finally, a 'vote' is taken at the end for each column, adding or
   408          # subtracting from the likelihood of the first row being a header.
   409  
   410          rdr = reader(StringIO(sample), self.sniff(sample))
   411  
   412          header = rdr.next() # assume first row is header
   413  
   414          columns = len(header)
   415          columnTypes = {}
   416          for i in range(columns): columnTypes[i] = None
   417  
   418          checked = 0
   419          for row in rdr:
   420              # arbitrary number of rows to check, to keep it sane
   421              if checked > 20:
   422                  break
   423              checked += 1
   424  
   425              if len(row) != columns:
   426                  continue # skip rows that have irregular number of columns
   427  
   428              for col in columnTypes.keys():
   429  
   430                  for thisType in [int, long, float, complex]:
   431                      try:
   432                          thisType(row[col])
   433                          break
   434                      except (ValueError, OverflowError):
   435                          pass
   436                  else:
   437                      # fallback to length of string
   438                      thisType = len(row[col])
   439  
   440                  # treat longs as ints
   441                  if thisType == long:
   442                      thisType = int
   443  
   444                  if thisType != columnTypes[col]:
   445                      if columnTypes[col] is None: # add new column type
   446                          columnTypes[col] = thisType
   447                      else:
   448                          # type is inconsistent, remove column from
   449                          # consideration
   450                          del columnTypes[col]
   451  
   452          # finally, compare results against first row and "vote"
   453          # on whether it's a header
   454          hasHeader = 0
   455          for col, colType in columnTypes.items():
   456              if type(colType) == type(0): # it's a length
   457                  if len(header[col]) != colType:
   458                      hasHeader += 1
   459                  else:
   460                      hasHeader -= 1
   461              else: # attempt typecast
   462                  try:
   463                      colType(header[col])
   464                  except (ValueError, TypeError):
   465                      hasHeader += 1
   466                  else:
   467                      hasHeader -= 1
   468  
   469          return hasHeader > 0