github.com/google/grumpy@v0.0.0-20171122020858-3ec87959189c/third_party/stdlib/textwrap.py (about)

     1  """Text wrapping and filling.
     2  """
     3  
     4  # Copyright (C) 1999-2001 Gregory P. Ward.
     5  # Copyright (C) 2002, 2003 Python Software Foundation.
     6  # Written by Greg Ward <gward@python.net>
     7  
     8  __revision__ = "$Id$"
     9  
    10  import string, re
    11  
# `_unicode` is the type TextWrapper._split() tests incoming text against
# (via isinstance) to decide whether to use the Unicode-mode regexes.
try:
    _unicode = unicode
except NameError:
    # If Python is built without Unicode support, the unicode type
    # will not exist. Fake one.  No text is ever an instance of this
    # placeholder, so the isinstance() check in _split() is simply false
    # and the plain (non-re.U) patterns are used.
    class _unicode(object):
        pass

# Do the right thing with boolean values for all known Python versions
# (so this module can be copied to projects that don't depend on Python
# 2.3, e.g. Optik and Docutils) by uncommenting the block of code below.
#try:
#    True, False
#except NameError:
#    (True, False) = (1, 0)

# Names exported by "from textwrap import *".
__all__ = ['TextWrapper', 'wrap', 'fill', 'dedent']

# Hardcode the recognized whitespace characters to the US-ASCII
# whitespace characters.  The main reason for doing this is that in
# ISO-8859-1, 0xa0 is non-breaking whitespace, so in certain locales
# that character winds up in string.whitespace.  Respecting
# string.whitespace in those cases would 1) make textwrap treat 0xa0 the
# same as any other whitespace char, which is clearly wrong (it's a
# *non-breaking* space), 2) possibly cause problems with Unicode,
# since 0xa0 is not in range(128).
# The characters are, in order: tab, newline, vertical tab, form feed,
# carriage return and space.
_whitespace = '\t\n\x0b\x0c\r '
    39  
    40  class TextWrapper(object):
    41      """
    42      Object for wrapping/filling text.  The public interface consists of
    43      the wrap() and fill() methods; the other methods are just there for
    44      subclasses to override in order to tweak the default behaviour.
    45      If you want to completely replace the main wrapping algorithm,
    46      you'll probably have to override _wrap_chunks().
    47  
    48      Several instance attributes control various aspects of wrapping:
    49        width (default: 70)
    50          the maximum width of wrapped lines (unless break_long_words
    51          is false)
    52        initial_indent (default: "")
    53          string that will be prepended to the first line of wrapped
    54          output.  Counts towards the line's width.
    55        subsequent_indent (default: "")
    56          string that will be prepended to all lines save the first
    57          of wrapped output; also counts towards each line's width.
    58        expand_tabs (default: true)
    59          Expand tabs in input text to spaces before further processing.
    60          Each tab will become 1 .. 8 spaces, depending on its position in
    61          its line.  If false, each tab is treated as a single character.
    62        replace_whitespace (default: true)
    63          Replace all whitespace characters in the input text by spaces
    64          after tab expansion.  Note that if expand_tabs is false and
    65          replace_whitespace is true, every tab will be converted to a
    66          single space!
    67        fix_sentence_endings (default: false)
    68          Ensure that sentence-ending punctuation is always followed
    69          by two spaces.  Off by default because the algorithm is
    70          (unavoidably) imperfect.
    71        break_long_words (default: true)
    72          Break words longer than 'width'.  If false, those words will not
    73          be broken, and some lines might be longer than 'width'.
    74        break_on_hyphens (default: true)
    75          Allow breaking hyphenated words. If true, wrapping will occur
    76          preferably on whitespaces and right after hyphens part of
    77          compound words.
    78        drop_whitespace (default: true)
    79          Drop leading and trailing whitespace from lines.
    80      """
    81  
    82      # whitespace_trans = string.maketrans(_whitespace, ' ' * len(_whitespace))
    83      whitespace_trans = '\x00\x01\x02\x03\x04\x05\x06\x07\x08     \x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f !"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\x7f\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9\xfa\xfb\xfc\xfd\xfe\xff'
    84  
    85      unicode_whitespace_trans = {}
    86      uspace = ord(u' ')
    87      for x in map(ord, _whitespace):
    88          unicode_whitespace_trans[x] = uspace
    89  
    90      # This funky little regex is just the trick for splitting
    91      # text up into word-wrappable chunks.  E.g.
    92      #   "Hello there -- you goof-ball, use the -b option!"
    93      # splits into
    94      #   Hello/ /there/ /--/ /you/ /goof-/ball,/ /use/ /the/ /-b/ /option!
    95      # (after stripping out empty strings).
    96      wordsep_re = re.compile(
    97          r'(\s+|'                                  # any whitespace
    98          r'[^\s\w]*\w+[^0-9\W]-(?=\w+[^0-9\W])|'   # hyphenated words
    99          r'(?<=[\w\!\"\'\&\.\,\?])-{2,}(?=\w))')   # em-dash
   100  
   101      # This less funky little regex just split on recognized spaces. E.g.
   102      #   "Hello there -- you goof-ball, use the -b option!"
   103      # splits into
   104      #   Hello/ /there/ /--/ /you/ /goof-ball,/ /use/ /the/ /-b/ /option!/
   105      wordsep_simple_re = re.compile(r'(\s+)')
   106  
   107      # XXX this is not locale- or charset-aware -- string.lowercase
   108      # is US-ASCII only (and therefore English-only)
   109      sentence_end_re = re.compile(r'[%s]'              # lowercase letter
   110                                   r'[\.\!\?]'          # sentence-ending punct.
   111                                   r'[\"\']?'           # optional end-of-quote
   112                                   r'\Z'                # end of chunk
   113                                   % string.lowercase)
   114  
   115  
   116      def __init__(self,
   117                   width=70,
   118                   initial_indent="",
   119                   subsequent_indent="",
   120                   expand_tabs=True,
   121                   replace_whitespace=True,
   122                   fix_sentence_endings=False,
   123                   break_long_words=True,
   124                   drop_whitespace=True,
   125                   break_on_hyphens=True):
   126          self.width = width
   127          self.initial_indent = initial_indent
   128          self.subsequent_indent = subsequent_indent
   129          self.expand_tabs = expand_tabs
   130          self.replace_whitespace = replace_whitespace
   131          self.fix_sentence_endings = fix_sentence_endings
   132          self.break_long_words = break_long_words
   133          self.drop_whitespace = drop_whitespace
   134          self.break_on_hyphens = break_on_hyphens
   135  
   136          # recompile the regexes for Unicode mode -- done in this clumsy way for
   137          # backwards compatibility because it's rather common to monkey-patch
   138          # the TextWrapper class' wordsep_re attribute.
   139          self.wordsep_re_uni = re.compile(self.wordsep_re.pattern, re.U)
   140          self.wordsep_simple_re_uni = re.compile(
   141              self.wordsep_simple_re.pattern, re.U)
   142  
   143  
   144      # -- Private methods -----------------------------------------------
   145      # (possibly useful for subclasses to override)
   146  
   147      def _munge_whitespace(self, text):
   148          """_munge_whitespace(text : string) -> string
   149  
   150          Munge whitespace in text: expand tabs and convert all other
   151          whitespace characters to spaces.  Eg. " foo\\tbar\\n\\nbaz"
   152          becomes " foo    bar  baz".
   153          """
   154          if self.expand_tabs:
   155              # text = text.expandtabs()
   156              text = ' '.join((' '.join(text.split('\n'))).split('\t'))
   157          if self.replace_whitespace:
   158              # if isinstance(text, str):
   159              #     text = text.translate(self.whitespace_trans)
   160              # elif isinstance(text, _unicode):
   161              #     text = text.translate(self.unicode_whitespace_trans)
   162              text = ' '.join(' '.join(text.split('\n')).split('\t'))
   163          return text
   164  
   165  
   166      def _split(self, text):
   167          """_split(text : string) -> [string]
   168  
   169          Split the text to wrap into indivisible chunks.  Chunks are
   170          not quite the same as words; see _wrap_chunks() for full
   171          details.  As an example, the text
   172            Look, goof-ball -- use the -b option!
   173          breaks into the following chunks:
   174            'Look,', ' ', 'goof-', 'ball', ' ', '--', ' ',
   175            'use', ' ', 'the', ' ', '-b', ' ', 'option!'
   176          if break_on_hyphens is True, or in:
   177            'Look,', ' ', 'goof-ball', ' ', '--', ' ',
   178            'use', ' ', 'the', ' ', '-b', ' ', option!'
   179          otherwise.
   180          """
   181          if isinstance(text, _unicode):
   182              if self.break_on_hyphens:
   183                  pat = self.wordsep_re_uni
   184              else:
   185                  pat = self.wordsep_simple_re_uni
   186          else:
   187              if self.break_on_hyphens:
   188                  pat = self.wordsep_re
   189              else:
   190                  pat = self.wordsep_simple_re
   191          chunks = pat.split(text)
   192          # chunks = filter(None, chunks)  # remove empty chunks
   193          chunks = [x for x in chunks if x is not None]
   194          return chunks
   195  
   196      def _fix_sentence_endings(self, chunks):
   197          """_fix_sentence_endings(chunks : [string])
   198  
   199          Correct for sentence endings buried in 'chunks'.  Eg. when the
   200          original text contains "... foo.\\nBar ...", munge_whitespace()
   201          and split() will convert that to [..., "foo.", " ", "Bar", ...]
   202          which has one too few spaces; this method simply changes the one
   203          space to two.
   204          """
   205          i = 0
   206          patsearch = self.sentence_end_re.search
   207          while i < len(chunks)-1:
   208              if chunks[i+1] == " " and patsearch(chunks[i]):
   209                  chunks[i+1] = "  "
   210                  i += 2
   211              else:
   212                  i += 1
   213  
   214      def _handle_long_word(self, reversed_chunks, cur_line, cur_len, width):
   215          """_handle_long_word(chunks : [string],
   216                               cur_line : [string],
   217                               cur_len : int, width : int)
   218  
   219          Handle a chunk of text (most likely a word, not whitespace) that
   220          is too long to fit in any line.
   221          """
   222          # Figure out when indent is larger than the specified width, and make
   223          # sure at least one character is stripped off on every pass
   224          if width < 1:
   225              space_left = 1
   226          else:
   227              space_left = width - cur_len
   228  
   229          # If we're allowed to break long words, then do so: put as much
   230          # of the next chunk onto the current line as will fit.
   231          if self.break_long_words:
   232              cur_line.append(reversed_chunks[-1][:space_left])
   233              reversed_chunks[-1] = reversed_chunks[-1][space_left:]
   234  
   235          # Otherwise, we have to preserve the long word intact.  Only add
   236          # it to the current line if there's nothing already there --
   237          # that minimizes how much we violate the width constraint.
   238          elif not cur_line:
   239              cur_line.append(reversed_chunks.pop())
   240  
   241          # If we're not allowed to break long words, and there's already
   242          # text on the current line, do nothing.  Next time through the
   243          # main loop of _wrap_chunks(), we'll wind up here again, but
   244          # cur_len will be zero, so the next line will be entirely
   245          # devoted to the long word that we can't handle right now.
   246  
   247      def _wrap_chunks(self, chunks):
   248          """_wrap_chunks(chunks : [string]) -> [string]
   249  
   250          Wrap a sequence of text chunks and return a list of lines of
   251          length 'self.width' or less.  (If 'break_long_words' is false,
   252          some lines may be longer than this.)  Chunks correspond roughly
   253          to words and the whitespace between them: each chunk is
   254          indivisible (modulo 'break_long_words'), but a line break can
   255          come between any two chunks.  Chunks should not have internal
   256          whitespace; ie. a chunk is either all whitespace or a "word".
   257          Whitespace chunks will be removed from the beginning and end of
   258          lines, but apart from that whitespace is preserved.
   259          """
   260          lines = []
   261          if self.width <= 0:
   262              raise ValueError("invalid width %r (must be > 0)" % self.width)
   263  
   264          # Arrange in reverse order so items can be efficiently popped
   265          # from a stack of chucks.
   266          chunks.reverse()
   267  
   268          while chunks:
   269  
   270              # Start the list of chunks that will make up the current line.
   271              # cur_len is just the length of all the chunks in cur_line.
   272              cur_line = []
   273              cur_len = 0
   274  
   275              # Figure out which static string will prefix this line.
   276              if lines:
   277                  indent = self.subsequent_indent
   278              else:
   279                  indent = self.initial_indent
   280  
   281              # Maximum width for this line.
   282              width = self.width - len(indent)
   283  
   284              # First chunk on line is whitespace -- drop it, unless this
   285              # is the very beginning of the text (ie. no lines started yet).
   286              if self.drop_whitespace and chunks[-1].strip() == '' and lines:
   287                  # del chunks[-1]
   288                  chunks.pop()
   289  
   290              while chunks:
   291                  l = len(chunks[-1])
   292  
   293                  # Can at least squeeze this chunk onto the current line.
   294                  if cur_len + l <= width:
   295                      cur_line.append(chunks.pop())
   296                      cur_len += l
   297  
   298                  # Nope, this line is full.
   299                  else:
   300                      break
   301  
   302              # The current line is full, and the next chunk is too big to
   303              # fit on *any* line (not just this one).
   304              if chunks and len(chunks[-1]) > width:
   305                  self._handle_long_word(chunks, cur_line, cur_len, width)
   306  
   307              # If the last chunk on this line is all whitespace, drop it.
   308              if self.drop_whitespace and cur_line and cur_line[-1].strip() == '':
   309                  # del cur_line[-1]
   310                  cur_line.pop()
   311  
   312              # Convert current line back to a string and store it in list
   313              # of all lines (return value).
   314              if cur_line:
   315                  lines.append(indent + ''.join(cur_line))
   316  
   317          return lines
   318  
   319  
   320      # -- Public interface ----------------------------------------------
   321  
   322      def wrap(self, text):
   323          """wrap(text : string) -> [string]
   324  
   325          Reformat the single paragraph in 'text' so it fits in lines of
   326          no more than 'self.width' columns, and return a list of wrapped
   327          lines.  Tabs in 'text' are expanded with string.expandtabs(),
   328          and all other whitespace characters (including newline) are
   329          converted to space.
   330          """
   331          text = self._munge_whitespace(text)
   332          chunks = self._split(text)
   333          if self.fix_sentence_endings:
   334              self._fix_sentence_endings(chunks)
   335          return self._wrap_chunks(chunks)
   336  
   337      def fill(self, text):
   338          """fill(text : string) -> string
   339  
   340          Reformat the single paragraph in 'text' to fit in lines of no
   341          more than 'self.width' columns, and return a new string
   342          containing the entire wrapped paragraph.
   343          """
   344          return "\n".join(self.wrap(text))
   345  
   346  
   347  # -- Convenience interface ---------------------------------------------
   348  
   349  def wrap(text, width=70, **kwargs):
   350      """Wrap a single paragraph of text, returning a list of wrapped lines.
   351  
   352      Reformat the single paragraph in 'text' so it fits in lines of no
   353      more than 'width' columns, and return a list of wrapped lines.  By
   354      default, tabs in 'text' are expanded with string.expandtabs(), and
   355      all other whitespace characters (including newline) are converted to
   356      space.  See TextWrapper class for available keyword args to customize
   357      wrapping behaviour.
   358      """
   359      w = TextWrapper(width=width, **kwargs)
   360      return w.wrap(text)
   361  
   362  def fill(text, width=70, **kwargs):
   363      """Fill a single paragraph of text, returning a new string.
   364  
   365      Reformat the single paragraph in 'text' to fit in lines of no more
   366      than 'width' columns, and return a new string containing the entire
   367      wrapped paragraph.  As with wrap(), tabs are expanded and other
   368      whitespace characters converted to space.  See TextWrapper class for
   369      available keyword args to customize wrapping behaviour.
   370      """
   371      w = TextWrapper(width=width, **kwargs)
   372      return w.fill(text)
   373  
   374  
   375  # -- Loosely related functionality -------------------------------------
   376  
   377  _whitespace_only_re = re.compile('^[ \t]+$', re.MULTILINE)
   378  _leading_whitespace_re = re.compile('(^[ \t]*)(?:[^ \t\n])', re.MULTILINE)
   379  
   380  def dedent(text):
   381      """Remove any common leading whitespace from every line in `text`.
   382  
   383      This can be used to make triple-quoted strings line up with the left
   384      edge of the display, while still presenting them in the source code
   385      in indented form.
   386  
   387      Note that tabs and spaces are both treated as whitespace, but they
   388      are not equal: the lines "  hello" and "\\thello" are
   389      considered to have no common leading whitespace.  (This behaviour is
   390      new in Python 2.5; older versions of this module incorrectly
   391      expanded tabs before searching for common leading whitespace.)
   392      """
   393      # Look for the longest leading string of spaces and tabs common to
   394      # all lines.
   395      margin = None
   396      text = _whitespace_only_re.sub('', text)
   397      indents = _leading_whitespace_re.findall(text)
   398      for indent in indents:
   399          if margin is None:
   400              margin = indent
   401  
   402          # Current line more deeply indented than previous winner:
   403          # no change (previous winner is still on top).
   404          elif indent.startswith(margin):
   405              pass
   406  
   407          # Current line consistent with and no deeper than previous winner:
   408          # it's the new winner.
   409          elif margin.startswith(indent):
   410              margin = indent
   411  
   412          # Find the largest common whitespace between current line and previous
   413          # winner.
   414          else:
   415              for i, (x, y) in enumerate(zip(margin, indent)):
   416                  if x != y:
   417                      margin = margin[:i]
   418                      break
   419              else:
   420                  margin = margin[:len(indent)]
   421  
   422      # sanity check (testing/debugging only)
   423      if 0 and margin:
   424          for line in text.split("\n"):
   425              assert not line or line.startswith(margin), \
   426                     "line = %r, margin = %r" % (line, margin)
   427  
   428      if margin:
   429          text = re.sub(r'(?m)^' + margin, '', text)
   430      return text
   431  
   432  if __name__ == "__main__":
   433      #print dedent("\tfoo\n\tbar")
   434      #print dedent("  \thello there\n  \t  how are you?")
   435      print dedent("Hello there.\n  This is indented.")