github.com/btwiuse/jiri@v0.0.0-20191125065820-53353bcfef54/textutil/wrap_writer.go (about) 1 // Copyright 2015 The Vanadium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 package textutil 6 7 import ( 8 "fmt" 9 "io" 10 "unicode" 11 ) 12 13 // WrapWriter implements an io.Writer filter that formats input text into output 14 // lines with a given target width in runes. 15 // 16 // Each input rune is classified into one of three kinds: 17 // EOL: end-of-line, consisting of \f, \n, \r, \v, U+2028 or U+2029 18 // Space: defined by unicode.IsSpace 19 // Letter: everything else 20 // 21 // The input text is expected to consist of words, defined as sequences of 22 // letters. Sequences of words form paragraphs, where paragraphs are separated 23 // by either blank lines (that contain no letters), or an explicit U+2029 24 // ParagraphSeparator. Input lines with leading spaces are treated verbatim. 25 // 26 // Paragraphs are output as word-wrapped lines; line breaks only occur at word 27 // boundaries. Output lines are usually no longer than the target width. The 28 // exceptions are single words longer than the target width, which are output on 29 // their own line, and verbatim lines, which may be arbitrarily longer or 30 // shorter than the width. 31 // 32 // Output lines never contain trailing spaces. Only verbatim output lines may 33 // contain leading spaces. Spaces separating input words are output verbatim, 34 // unless it would result in a line with leading or trailing spaces. 35 // 36 // EOL runes within the input text are never written to the output; the output 37 // line terminator and paragraph separator may be configured, and some EOL may 38 // be output as a single space ' ' to maintain word separation. 39 // 40 // The algorithm greedily fills each output line with as many words as it can, 41 // assuming that all Unicode code points have the same width. Invalid UTF-8 is 42 // silently transformed to the replacement character U+FFFD and treated as a 43 // single rune. 44 // 45 // Flush must be called after the last call to Write; the input is buffered. 46 // 47 // Implementation note: line breaking is a complicated topic. This approach 48 // attempts to be simple and useful; a full implementation conforming to 49 // Unicode Standard Annex #14 would be complicated, and is not implemented. 50 // Languages that don't use spaces to separate words (e.g. CJK) won't work 51 // well under the current approach. 52 // 53 // http://www.unicode.org/reports/tr14 [Unicode Line Breaking Algorithm] 54 // http://www.unicode.org/versions/Unicode4.0.0/ch05.pdf [5.8 Newline Guidelines] 55 type WrapWriter struct { 56 // State configured by the user. 57 w io.Writer 58 runeDecoder RuneChunkDecoder 59 width runePos 60 lineTerm []byte 61 paragraphSep string 62 indents []string 63 forceVerbatim bool 64 65 // The buffer contains a single output line. 66 lineBuf byteRuneBuffer 67 68 // Keep track of the previous state and rune. 69 prevState state 70 prevRune rune 71 72 // Keep track of blank input lines. 73 inputLineHasLetter bool 74 75 // lineBuf positions where the line starts (after separators and indents), a 76 // new word has started and the last word has ended. 77 lineStart bytePos 78 newWordStart bytePos 79 lastWordEnd bytePos 80 81 // Keep track of paragraph terminations and line indices, so we can output the 82 // paragraph separator and indents correctly. 83 terminateParagraph bool 84 paragraphLineIndex int 85 wroteFirstLine bool 86 } 87 88 type state int 89 90 const ( 91 stateWordWrap state = iota // Perform word-wrapping [start state] 92 stateVerbatim // Verbatim output-line, no word-wrapping 93 stateSkipSpace // Skip spaces in input line. 94 ) 95 96 // NewWrapWriter returns a new WrapWriter with the given target width in runes, 97 // producing output on the underlying writer w. The dec and enc are used to 98 // respectively decode runes from Write calls, and encode runes to w. 99 func NewWrapWriter(w io.Writer, width int, dec RuneChunkDecoder, enc RuneEncoder) *WrapWriter { 100 ret := &WrapWriter{ 101 w: w, 102 runeDecoder: dec, 103 width: runePos(width), 104 lineTerm: []byte("\n"), 105 paragraphSep: "\n", 106 prevState: stateWordWrap, 107 prevRune: LineSeparator, 108 lineBuf: byteRuneBuffer{enc: enc}, 109 } 110 ret.resetLine() 111 return ret 112 } 113 114 // NewUTF8WrapWriter returns a new WrapWriter filter that implements io.Writer, 115 // and decodes and encodes runes in UTF-8. 116 func NewUTF8WrapWriter(w io.Writer, width int) *WrapWriter { 117 return NewWrapWriter(w, width, &UTF8ChunkDecoder{}, UTF8Encoder{}) 118 } 119 120 // Width returns the target width in runes. If width < 0 the width is 121 // unlimited; each paragraph is output as a single line. 122 func (w *WrapWriter) Width() int { return int(w.width) } 123 124 // SetLineTerminator sets the line terminator for subsequent Write calls. Every 125 // output line is terminated with term; EOL runes from the input are never 126 // written to the output. A new WrapWriter instance uses "\n" as the default 127 // line terminator. 128 // 129 // Calls Flush internally, and returns any Flush error. 130 func (w *WrapWriter) SetLineTerminator(term string) error { 131 if err := w.Flush(); err != nil { 132 return err 133 } 134 w.lineTerm = []byte(term) 135 w.resetLine() 136 return nil 137 } 138 139 // SetParagraphSeparator sets the paragraph separator for subsequent Write 140 // calls. Every consecutive pair of non-empty paragraphs is separated with sep; 141 // EOL runes from the input are never written to the output. A new WrapWriter 142 // instance uses "\n" as the default paragraph separator. 143 // 144 // Calls Flush internally, and returns any Flush error. 145 func (w *WrapWriter) SetParagraphSeparator(sep string) error { 146 if err := w.Flush(); err != nil { 147 return err 148 } 149 w.paragraphSep = sep 150 w.resetLine() 151 return nil 152 } 153 154 // SetIndents sets the indentation for subsequent Write calls. Multiple indents 155 // may be set, corresponding to the indent to use for the corresponding 156 // paragraph line. E.g. SetIndents("AA", "BBB", C") means the first line in 157 // each paragraph is indented with "AA", the second line in each paragraph is 158 // indented with "BBB", and all subsequent lines in each paragraph are indented 159 // with "C". 160 // 161 // SetIndents() is equivalent to SetIndents(""), SetIndents("", ""), etc. 162 // 163 // A new WrapWriter instance has no indents by default. 164 // 165 // Calls Flush internally, and returns any Flush error. 166 func (w *WrapWriter) SetIndents(indents ...string) error { 167 if err := w.Flush(); err != nil { 168 return err 169 } 170 // Copy indents in case the user passed the slice via SetIndents(p...), and 171 // canonicalize the all empty case to nil. 172 allEmpty := true 173 w.indents = make([]string, len(indents)) 174 for ix, indent := range indents { 175 w.indents[ix] = indent 176 if indent != "" { 177 allEmpty = false 178 } 179 } 180 if allEmpty { 181 w.indents = nil 182 } 183 w.resetLine() 184 return nil 185 } 186 187 // ForceVerbatim forces w to stay in verbatim mode if v is true, or lets w 188 // perform its regular line writing algorithm if v is false. This is useful if 189 // there is a sequence of lines that should be written verbatim, even if the 190 // lines don't start with spaces. 191 // 192 // Calls Flush internally, and returns any Flush error. 193 func (w *WrapWriter) ForceVerbatim(v bool) error { 194 w.forceVerbatim = v 195 return w.Flush() 196 } 197 198 // Write implements io.Writer by buffering data into the WrapWriter w. Actual 199 // writes to the underlying writer may occur, and may include data buffered in 200 // either this Write call or previous Write calls. 201 // 202 // Flush must be called after the last call to Write. 203 func (w *WrapWriter) Write(data []byte) (int, error) { 204 return WriteRuneChunk(w.runeDecoder, w.addRune, data) 205 } 206 207 // Flush flushes any remaining buffered text, and resets the paragraph line 208 // count back to 0, so that indents will be applied starting from the first 209 // line. It does not imply a paragraph separator; repeated calls to Flush with 210 // no intervening calls to other methods is equivalent to a single Flush. 211 // 212 // Flush must be called after the last call to Write, and may be called an 213 // arbitrary number of times before the last Write. 214 func (w *WrapWriter) Flush() error { 215 if err := FlushRuneChunk(w.runeDecoder, w.addRune); err != nil { 216 return err 217 } 218 // Add U+2028 to force the last line (if any) to be written. 219 if err := w.addRune(LineSeparator); err != nil { 220 return err 221 } 222 // Reset the paragraph line count. 223 w.paragraphLineIndex = 0 224 w.resetLine() 225 return nil 226 } 227 228 // addRune is called every time w.runeDecoder decodes a full rune. 229 func (w *WrapWriter) addRune(r rune) error { 230 state, lineBreak := w.nextState(r, w.updateRune(r)) 231 if lineBreak { 232 if err := w.writeLine(); err != nil { 233 return err 234 } 235 } 236 w.bufferRune(r, state, lineBreak) 237 w.prevState = state 238 w.prevRune = r 239 return nil 240 } 241 242 // We classify each incoming rune into three kinds for easier handling. 243 type kind int 244 245 const ( 246 kindEOL kind = iota 247 kindSpace 248 kindLetter 249 ) 250 251 func runeKind(r rune) kind { 252 switch r { 253 case '\f', '\n', '\r', '\v', LineSeparator, ParagraphSeparator: 254 return kindEOL 255 } 256 if unicode.IsSpace(r) { 257 return kindSpace 258 } 259 return kindLetter 260 } 261 262 func (w *WrapWriter) updateRune(r rune) bool { 263 forceLineBreak := false 264 switch kind := runeKind(r); kind { 265 case kindEOL: 266 // Update lastWordEnd if the last word just ended. 267 if w.newWordStart != -1 { 268 w.newWordStart = -1 269 w.lastWordEnd = w.lineBuf.ByteLen() 270 } 271 switch { 272 case w.prevRune == '\r' && r == '\n': 273 // Treat "\r\n" as a single EOL; we've already handled the logic for '\r', 274 // so there's nothing to do when we see '\n'. 275 case r == LineSeparator: 276 // Treat U+2028 as a pure line break; it's never a paragraph break. 277 forceLineBreak = true 278 case r == ParagraphSeparator || !w.inputLineHasLetter: 279 // The paragraph has just been terminated if we see an explicit U+2029, or 280 // if we see a blank line, which may contain spaces. 281 forceLineBreak = true 282 w.terminateParagraph = true 283 } 284 w.inputLineHasLetter = false 285 case kindSpace: 286 // Update lastWordEnd if the last word just ended. 287 if w.newWordStart != -1 { 288 w.newWordStart = -1 289 w.lastWordEnd = w.lineBuf.ByteLen() 290 } 291 case kindLetter: 292 // Update newWordStart if a new word just started. 293 if w.newWordStart == -1 { 294 w.newWordStart = w.lineBuf.ByteLen() 295 } 296 w.inputLineHasLetter = true 297 w.terminateParagraph = false 298 default: 299 panic(fmt.Errorf("textutil: updateRune unhandled kind %d", kind)) 300 } 301 return forceLineBreak 302 } 303 304 // nextState returns the next state and whether we should break the line. 305 // 306 // Here's a handy table that describes all the scenarios in which we will line 307 // break input text, grouped by the reason for the break. The current position 308 // is the last non-* rune in each pattern, which is where we decide to break. 309 // 310 // w.prevState Next state Buffer reset 311 // ----------- ---------- ------------ 312 // ===== Force line break (U+2028 / U+2029, blank line) ===== 313 // a..*|*** * wordWrap empty 314 // a._.|*** * wordWrap empty 315 // a+**|*** * wordWrap empty 316 // 317 // ===== verbatim: wait for any EOL ===== 318 // _*.*|*** verbatim wordWrap empty 319 // 320 // ===== wordWrap: switch to verbatim ===== 321 // a._*|*** wordWrap verbatim empty 322 // 323 // ===== wordWrap: line is too wide ===== 324 // abc.|*** wordWrap wordWrap empty 325 // abcd|.** wordWrap wordWrap empty 326 // abcd|e.* wordWrap wordWrap empty 327 // a_cd|.** wordWrap wordWrap empty 328 // 329 // abc_|*** wordWrap skipSpace empty 330 // abcd|_** wordWrap skipSpace empty 331 // abcd|e_* wordWrap skipSpace empty 332 // a_cd|_** wordWrap skipSpace empty 333 // 334 // a_cd|e** wordWrap start newWordStart 335 // 336 // LEGEND 337 // abcde Letter 338 // . End-of-line 339 // + End-of-line (only U+2028 / U+2029) 340 // _ Space 341 // * Any rune (letter, line-end or space) 342 // | Visual indication of width=4, has no width itself. 343 // 344 // Note that Flush calls behave exactly as if an explicit U+2028 line separator 345 // were added to the end of all buffered data. 346 func (w *WrapWriter) nextState(r rune, forceLineBreak bool) (state, bool) { 347 kind := runeKind(r) 348 if w.forceVerbatim { 349 return stateVerbatim, forceLineBreak || kind == kindEOL 350 } 351 if forceLineBreak { 352 return stateWordWrap, true 353 } 354 // Handle non word-wrap states, which are easy. 355 switch w.prevState { 356 case stateVerbatim: 357 if kind == kindEOL { 358 return stateWordWrap, true 359 } 360 return stateVerbatim, false 361 case stateSkipSpace: 362 if kind == kindSpace { 363 return stateSkipSpace, false 364 } 365 return stateWordWrap, false 366 } 367 // Handle stateWordWrap, which is more complicated. 368 369 // Switch to the verbatim state when we see a space right after an EOL. 370 if runeKind(w.prevRune) == kindEOL && kind == kindSpace { 371 return stateVerbatim, true 372 } 373 // Break on EOL or space when the line is too wide. See above table. 374 if w.width >= 0 && w.width <= w.lineBuf.RuneLen()+1 { 375 switch kind { 376 case kindEOL: 377 return stateWordWrap, true 378 case kindSpace: 379 return stateSkipSpace, true 380 } 381 // case kindLetter falls through 382 } 383 // Handle the newWordStart case in the above table. 384 if w.width >= 0 && w.width < w.lineBuf.RuneLen()+1 && w.newWordStart != w.lineStart { 385 return stateWordWrap, true 386 } 387 // Stay in the wordWrap state and don't break the line. 388 return stateWordWrap, false 389 } 390 391 func (w *WrapWriter) writeLine() error { 392 if w.lastWordEnd == -1 { 393 // Don't write blank lines, but we must reset the line in case the paragraph 394 // has just been terminated. 395 w.resetLine() 396 return nil 397 } 398 // Write the line (without trailing spaces) followed by the line terminator. 399 line := w.lineBuf.Bytes()[:w.lastWordEnd] 400 if _, err := w.w.Write(line); err != nil { 401 return err 402 } 403 if _, err := w.w.Write(w.lineTerm); err != nil { 404 return err 405 } 406 // Reset the line buffer. 407 w.wroteFirstLine = true 408 w.paragraphLineIndex++ 409 if w.newWordStart != -1 { 410 // If we have an unterminated new word, we must be in the newWordStart case 411 // in the table above. Handle the special buffer reset here. 412 newWord := string(w.lineBuf.Bytes()[w.newWordStart:]) 413 w.resetLine() 414 w.newWordStart = w.lineBuf.ByteLen() 415 w.lineBuf.WriteString(newWord) 416 } else { 417 w.resetLine() 418 } 419 return nil 420 } 421 422 func (w *WrapWriter) resetLine() { 423 w.lineBuf.Reset() 424 w.newWordStart = -1 425 w.lastWordEnd = -1 426 // Write the paragraph separator if the previous paragraph has terminated. 427 // This consumes no runes from the line width. 428 if w.wroteFirstLine && w.terminateParagraph { 429 w.lineBuf.WriteString0Runes(w.paragraphSep) 430 w.paragraphLineIndex = 0 431 } 432 // Add indent; a non-empty indent consumes runes from the line width. 433 var indent string 434 switch { 435 case w.paragraphLineIndex < len(w.indents): 436 indent = w.indents[w.paragraphLineIndex] 437 case len(w.indents) > 0: 438 indent = w.indents[len(w.indents)-1] 439 } 440 w.lineBuf.WriteString(indent) 441 w.lineStart = w.lineBuf.ByteLen() 442 } 443 444 func (w *WrapWriter) bufferRune(r rune, state state, lineBreak bool) { 445 // Never add leading spaces to the buffer in the wordWrap state. 446 wordWrapNoLeadingSpaces := state == stateWordWrap && !lineBreak 447 switch kind := runeKind(r); kind { 448 case kindEOL: 449 // When we're word-wrapping and we see a letter followed by EOL, we convert 450 // the EOL into a single space in the buffer, to break the previous word 451 // from the next word. 452 if wordWrapNoLeadingSpaces && runeKind(w.prevRune) == kindLetter { 453 w.lineBuf.WriteRune(' ') 454 } 455 case kindSpace: 456 if wordWrapNoLeadingSpaces || state == stateVerbatim { 457 w.lineBuf.WriteRune(r) 458 } 459 case kindLetter: 460 w.lineBuf.WriteRune(r) 461 default: 462 panic(fmt.Errorf("textutil: bufferRune unhandled kind %d", kind)) 463 } 464 }