# Origin: github.com/google/grumpy@v0.0.0-20171122020858-3ec87959189c/third_party/stdlib/textwrap.py
"""Text wrapping and filling.

Public API: the TextWrapper class plus the convenience functions wrap(),
fill() and dedent().
"""

# Copyright (C) 1999-2001 Gregory P. Ward.
# Copyright (C) 2002, 2003 Python Software Foundation.
# Written by Greg Ward <gward@python.net>

__revision__ = "$Id$"

import re
import string

try:
    _unicode = unicode
except NameError:
    # If Python is built without Unicode support, the unicode type
    # will not exist.  Fake one.
    class _unicode(object):
        pass

# string.lowercase is not available on every interpreter; fall back to
# the plain ASCII alphabet so that importing this module never fails.
try:
    _lowercase = string.lowercase
except AttributeError:
    _lowercase = string.ascii_lowercase

# Do the right thing with boolean values for all known Python versions
# (so this module can be copied to projects that don't depend on Python
# 2.3, e.g. Optik and Docutils) by uncommenting the block of code below.
#try:
#    True, False
#except NameError:
#    (True, False) = (1, 0)

__all__ = ['TextWrapper', 'wrap', 'fill', 'dedent']

# Hardcode the recognized whitespace characters to the US-ASCII
# whitespace characters.  The main reason for doing this is that in
# ISO-8859-1, 0xa0 is non-breaking whitespace, so in certain locales
# that character winds up in string.whitespace.  Respecting
# string.whitespace in those cases would 1) make textwrap treat 0xa0 the
# same as any other whitespace char, which is clearly wrong (it's a
# *non-breaking* space), 2) possibly cause problems with Unicode,
# since 0xa0 is not in range(128).
_whitespace = '\t\n\x0b\x0c\r '


def _build_whitespace_table():
    """Return a 256-character translation table mapping whitespace to ' '.

    Equivalent to string.maketrans(_whitespace, ' ' * len(_whitespace)):
    the character at index i is chr(i), except that every character in
    _whitespace is replaced by a space.
    """
    table = []
    for code in range(256):
        ch = chr(code)
        if ch in _whitespace:
            ch = ' '
        table.append(ch)
    return ''.join(table)


def _expand_tabs(text, tabsize=8):
    """Return 'text' with each tab replaced by 1 .. 'tabsize' spaces.

    Each tab advances to the next column that is a multiple of 'tabsize';
    the column counter restarts after every newline or carriage return.
    Hand-rolled with basic string operations only.
    NOTE(review): the previous code had text.expandtabs() commented out,
    presumably because the target runtime lacks it -- confirm before
    switching to the built-in.
    """
    if '\t' not in text:
        return text
    pieces = []
    column = 0
    for ch in text:
        if ch == '\t':
            pad = tabsize - column % tabsize
            pieces.append(' ' * pad)
            column += pad
        elif ch == '\n' or ch == '\r':
            pieces.append(ch)
            column = 0
        else:
            pieces.append(ch)
            column += 1
    # Joining on an empty slice of the input keeps the input's string type.
    return text[:0].join(pieces)


class TextWrapper(object):
    """
    Object for wrapping/filling text.  The public interface consists of
    the wrap() and fill() methods; the other methods are just there for
    subclasses to override in order to tweak the default behaviour.
    If you want to completely replace the main wrapping algorithm,
    you'll probably have to override _wrap_chunks().

    Several instance attributes control various aspects of wrapping:
      width (default: 70)
        the maximum width of wrapped lines (unless break_long_words
        is false)
      initial_indent (default: "")
        string that will be prepended to the first line of wrapped
        output.  Counts towards the line's width.
      subsequent_indent (default: "")
        string that will be prepended to all lines save the first
        of wrapped output; also counts towards each line's width.
      expand_tabs (default: true)
        Expand tabs in input text to spaces before further processing.
        Each tab will become 1 .. 8 spaces, depending on its position in
        its line.  If false, each tab is treated as a single character.
      replace_whitespace (default: true)
        Replace all whitespace characters in the input text by spaces
        after tab expansion.  Note that if expand_tabs is false and
        replace_whitespace is true, every tab will be converted to a
        single space!
      fix_sentence_endings (default: false)
        Ensure that sentence-ending punctuation is always followed
        by two spaces.  Off by default because the algorithm is
        (unavoidably) imperfect.
      break_long_words (default: true)
        Break words longer than 'width'.  If false, those words will not
        be broken, and some lines might be longer than 'width'.
      break_on_hyphens (default: true)
        Allow breaking hyphenated words.  If true, wrapping will occur
        preferably on whitespaces and right after hyphens part of
        compound words.
      drop_whitespace (default: true)
        Drop leading and trailing whitespace from lines.
    """

    # Same value as string.maketrans(_whitespace, ' ' * len(_whitespace)).
    whitespace_trans = _build_whitespace_table()

    # Code-point -> code-point mapping for unicode.translate().
    unicode_whitespace_trans = {}
    uspace = ord(u' ')
    for x in map(ord, _whitespace):
        unicode_whitespace_trans[x] = uspace

    # This funky little regex is just the trick for splitting
    # text up into word-wrappable chunks.  E.g.
    #   "Hello there -- you goof-ball, use the -b option!"
    # splits into
    #   Hello/ /there/ /--/ /you/ /goof-/ball,/ /use/ /the/ /-b/ /option!
    # (after stripping out empty strings).
    wordsep_re = re.compile(
        r'(\s+|'                                  # any whitespace
        r'[^\s\w]*\w+[^0-9\W]-(?=\w+[^0-9\W])|'   # hyphenated words
        r'(?<=[\w\!\"\'\&\.\,\?])-{2,}(?=\w))')   # em-dash

    # This less funky little regex just split on recognized spaces. E.g.
    #   "Hello there -- you goof-ball, use the -b option!"
    # splits into
    #   Hello/ /there/ /--/ /you/ /goof-ball,/ /use/ /the/ /-b/ /option!/
    wordsep_simple_re = re.compile(r'(\s+)')

    # XXX this is not locale- or charset-aware -- string.lowercase
    # is US-ASCII only (and therefore English-only)
    sentence_end_re = re.compile(r'[%s]'              # lowercase letter
                                 r'[\.\!\?]'          # sentence-ending punct.
                                 r'[\"\']?'           # optional end-of-quote
                                 r'\Z'                # end of chunk
                                 % _lowercase)

    def __init__(self,
                 width=70,
                 initial_indent="",
                 subsequent_indent="",
                 expand_tabs=True,
                 replace_whitespace=True,
                 fix_sentence_endings=False,
                 break_long_words=True,
                 drop_whitespace=True,
                 break_on_hyphens=True):
        """Store the wrapping options; see the class docstring for each."""
        self.width = width
        self.initial_indent = initial_indent
        self.subsequent_indent = subsequent_indent
        self.expand_tabs = expand_tabs
        self.replace_whitespace = replace_whitespace
        self.fix_sentence_endings = fix_sentence_endings
        self.break_long_words = break_long_words
        self.drop_whitespace = drop_whitespace
        self.break_on_hyphens = break_on_hyphens

        # recompile the regexes for Unicode mode -- done in this clumsy way for
        # backwards compatibility because it's rather common to monkey-patch
        # the TextWrapper class' wordsep_re attribute.
        self.wordsep_re_uni = re.compile(self.wordsep_re.pattern, re.U)
        self.wordsep_simple_re_uni = re.compile(
            self.wordsep_simple_re.pattern, re.U)

    # -- Private methods -----------------------------------------------
    # (possibly useful for subclasses to override)

    def _munge_whitespace(self, text):
        """_munge_whitespace(text : string) -> string

        Munge whitespace in text: expand tabs (if expand_tabs is true)
        and convert every other whitespace character to a space (if
        replace_whitespace is true).  Eg. " foo\\tbar\\n\\nbaz"
        becomes " foo    bar  baz".
        """
        if self.expand_tabs:
            text = _expand_tabs(text)
        if self.replace_whitespace:
            # split/join per character instead of str.translate(); the
            # space itself needs no replacement.
            for ch in _whitespace:
                if ch != ' ':
                    text = ' '.join(text.split(ch))
        return text

    def _split(self, text):
        """_split(text : string) -> [string]

        Split the text to wrap into indivisible chunks.  Chunks are
        not quite the same as words; see _wrap_chunks() for full
        details.  As an example, the text
          Look, goof-ball -- use the -b option!
        breaks into the following chunks:
          'Look,', ' ', 'goof-', 'ball', ' ', '--', ' ',
          'use', ' ', 'the', ' ', '-b', ' ', 'option!'
        if break_on_hyphens is True, or in:
          'Look,', ' ', 'goof-ball', ' ', '--', ' ',
          'use', ' ', 'the', ' ', '-b', ' ', 'option!'
        otherwise.
        """
        if isinstance(text, _unicode):
            if self.break_on_hyphens:
                pat = self.wordsep_re_uni
            else:
                pat = self.wordsep_simple_re_uni
        else:
            if self.break_on_hyphens:
                pat = self.wordsep_re
            else:
                pat = self.wordsep_simple_re
        # re.split() yields '' between adjacent separators and at either
        # end of the text; those empty chunks must go, otherwise the
        # whitespace-dropping logic in _wrap_chunks() removes an empty
        # chunk instead of the real whitespace next to it.
        return [chunk for chunk in pat.split(text) if chunk]

    def _fix_sentence_endings(self, chunks):
        """_fix_sentence_endings(chunks : [string])

        Correct for sentence endings buried in 'chunks'.  Eg. when the
        original text contains "... foo.\\nBar ...", munge_whitespace()
        and split() will convert that to [..., "foo.", " ", "Bar", ...]
        which has one too few spaces; this method simply changes the one
        space to two.  'chunks' is modified in place.
        """
        i = 0
        patsearch = self.sentence_end_re.search
        while i < len(chunks) - 1:
            if chunks[i + 1] == " " and patsearch(chunks[i]):
                chunks[i + 1] = "  "
                # Skip the whitespace chunk that was just widened.
                i += 2
            else:
                i += 1

    def _handle_long_word(self, reversed_chunks, cur_line, cur_len, width):
        """_handle_long_word(chunks : [string],
                             cur_line : [string],
                             cur_len : int, width : int)

        Handle a chunk of text (most likely a word, not whitespace) that
        is too long to fit in any line.  'reversed_chunks' is the stack of
        remaining chunks (next chunk last); both it and 'cur_line' are
        modified in place.
        """
        # Figure out when indent is larger than the specified width, and make
        # sure at least one character is stripped off on every pass
        if width < 1:
            space_left = 1
        else:
            space_left = width - cur_len

        # If we're allowed to break long words, then do so: put as much
        # of the next chunk onto the current line as will fit.
        if self.break_long_words:
            cur_line.append(reversed_chunks[-1][:space_left])
            reversed_chunks[-1] = reversed_chunks[-1][space_left:]

        # Otherwise, we have to preserve the long word intact.  Only add
        # it to the current line if there's nothing already there --
        # that minimizes how much we violate the width constraint.
        elif not cur_line:
            cur_line.append(reversed_chunks.pop())

        # If we're not allowed to break long words, and there's already
        # text on the current line, do nothing.  Next time through the
        # main loop of _wrap_chunks(), we'll wind up here again, but
        # cur_len will be zero, so the next line will be entirely
        # devoted to the long word that we can't handle right now.

    def _wrap_chunks(self, chunks):
        """_wrap_chunks(chunks : [string]) -> [string]

        Wrap a sequence of text chunks and return a list of lines of
        length 'self.width' or less.  (If 'break_long_words' is false,
        some lines may be longer than this.)  Chunks correspond roughly
        to words and the whitespace between them: each chunk is
        indivisible (modulo 'break_long_words'), but a line break can
        come between any two chunks.  Chunks should not have internal
        whitespace; ie. a chunk is either all whitespace or a "word".
        Whitespace chunks will be removed from the beginning and end of
        lines, but apart from that whitespace is preserved.

        Raises ValueError if self.width is not positive.  'chunks' is
        reversed and consumed in place.
        """
        lines = []
        if self.width <= 0:
            raise ValueError("invalid width %r (must be > 0)" % self.width)

        # Arrange in reverse order so items can be efficiently popped
        # from a stack of chunks.
        chunks.reverse()

        while chunks:

            # Start the list of chunks that will make up the current line.
            # cur_len is just the length of all the chunks in cur_line.
            cur_line = []
            cur_len = 0

            # Figure out which static string will prefix this line.
            if lines:
                indent = self.subsequent_indent
            else:
                indent = self.initial_indent

            # Maximum width for this line.
            width = self.width - len(indent)

            # First chunk on line is whitespace -- drop it, unless this
            # is the very beginning of the text (ie. no lines started yet).
            if self.drop_whitespace and chunks[-1].strip() == '' and lines:
                chunks.pop()

            while chunks:
                chunk_len = len(chunks[-1])

                # Can at least squeeze this chunk onto the current line.
                if cur_len + chunk_len <= width:
                    cur_line.append(chunks.pop())
                    cur_len += chunk_len

                # Nope, this line is full.
                else:
                    break

            # The current line is full, and the next chunk is too big to
            # fit on *any* line (not just this one).
            if chunks and len(chunks[-1]) > width:
                self._handle_long_word(chunks, cur_line, cur_len, width)

            # If the last chunk on this line is all whitespace, drop it.
            if self.drop_whitespace and cur_line and cur_line[-1].strip() == '':
                cur_line.pop()

            # Convert current line back to a string and store it in list
            # of all lines (return value).
            if cur_line:
                lines.append(indent + ''.join(cur_line))

        return lines

    # -- Public interface ----------------------------------------------

    def wrap(self, text):
        """wrap(text : string) -> [string]

        Reformat the single paragraph in 'text' so it fits in lines of
        no more than 'self.width' columns, and return a list of wrapped
        lines.  By default tabs in 'text' are expanded to spaces and all
        other whitespace characters (including newline) are converted
        to space.
        """
        text = self._munge_whitespace(text)
        chunks = self._split(text)
        if self.fix_sentence_endings:
            self._fix_sentence_endings(chunks)
        return self._wrap_chunks(chunks)

    def fill(self, text):
        """fill(text : string) -> string

        Reformat the single paragraph in 'text' to fit in lines of no
        more than 'self.width' columns, and return a new string
        containing the entire wrapped paragraph.
        """
        return "\n".join(self.wrap(text))


# -- Convenience interface ---------------------------------------------

def wrap(text, width=70, **kwargs):
    """Wrap a single paragraph of text, returning a list of wrapped lines.

    Reformat the single paragraph in 'text' so it fits in lines of no
    more than 'width' columns, and return a list of wrapped lines.  By
    default, tabs in 'text' are expanded to spaces, and all other
    whitespace characters (including newline) are converted to
    space.  See TextWrapper class for available keyword args to customize
    wrapping behaviour.
    """
    w = TextWrapper(width=width, **kwargs)
    return w.wrap(text)


def fill(text, width=70, **kwargs):
    """Fill a single paragraph of text, returning a new string.

    Reformat the single paragraph in 'text' to fit in lines of no more
    than 'width' columns, and return a new string containing the entire
    wrapped paragraph.  As with wrap(), tabs are expanded and other
    whitespace characters converted to space.  See TextWrapper class for
    available keyword args to customize wrapping behaviour.
    """
    w = TextWrapper(width=width, **kwargs)
    return w.fill(text)


# -- Loosely related functionality -------------------------------------

_whitespace_only_re = re.compile('^[ \t]+$', re.MULTILINE)
_leading_whitespace_re = re.compile('(^[ \t]*)(?:[^ \t\n])', re.MULTILINE)


def dedent(text):
    """Remove any common leading whitespace from every line in `text`.

    This can be used to make triple-quoted strings line up with the left
    edge of the display, while still presenting them in the source code
    in indented form.

    Note that tabs and spaces are both treated as whitespace, but they
    are not equal: the lines "  hello" and "\\thello" are
    considered to have no common leading whitespace.  (This behaviour is
    new in Python 2.5; older versions of this module incorrectly
    expanded tabs before searching for common leading whitespace.)
    """
    # Look for the longest leading string of spaces and tabs common to
    # all lines.  Whitespace-only lines are blanked first so they do not
    # take part in the computation.
    margin = None
    text = _whitespace_only_re.sub('', text)
    indents = _leading_whitespace_re.findall(text)
    for indent in indents:
        if margin is None:
            margin = indent

        # Current line more deeply indented than previous winner:
        # no change (previous winner is still on top).
        elif indent.startswith(margin):
            pass

        # Current line consistent with and no deeper than previous winner:
        # it's the new winner.
        elif margin.startswith(indent):
            margin = indent

        # Find the largest common whitespace between current line and previous
        # winner.
        else:
            for i, (x, y) in enumerate(zip(margin, indent)):
                if x != y:
                    margin = margin[:i]
                    break
            else:
                margin = margin[:len(indent)]

    if margin:
        # margin holds only spaces and tabs, so it is safe inside a regex.
        text = re.sub(r'(?m)^' + margin, '', text)
    return text


if __name__ == "__main__":
    #print(dedent("\tfoo\n\tbar"))
    #print(dedent(" \thello there\n \t how are you?"))
    print(dedent("Hello there.\n This is indented."))