github.com/btwiuse/jiri@v0.0.0-20191125065820-53353bcfef54/textutil/wrap_writer.go (about)

     1  // Copyright 2015 The Vanadium Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package textutil
     6  
     7  import (
     8  	"fmt"
     9  	"io"
    10  	"unicode"
    11  )
    12  
    13  // WrapWriter implements an io.Writer filter that formats input text into output
    14  // lines with a given target width in runes.
    15  //
    16  // Each input rune is classified into one of three kinds:
    17  //   EOL:    end-of-line, consisting of \f, \n, \r, \v, U+2028 or U+2029
    18  //   Space:  defined by unicode.IsSpace
    19  //   Letter: everything else
    20  //
    21  // The input text is expected to consist of words, defined as sequences of
    22  // letters.  Sequences of words form paragraphs, where paragraphs are separated
    23  // by either blank lines (that contain no letters), or an explicit U+2029
    24  // ParagraphSeparator.  Input lines with leading spaces are treated verbatim.
    25  //
    26  // Paragraphs are output as word-wrapped lines; line breaks only occur at word
    27  // boundaries.  Output lines are usually no longer than the target width.  The
    28  // exceptions are single words longer than the target width, which are output on
    29  // their own line, and verbatim lines, which may be arbitrarily longer or
    30  // shorter than the width.
    31  //
    32  // Output lines never contain trailing spaces.  Only verbatim output lines may
    33  // contain leading spaces.  Spaces separating input words are output verbatim,
    34  // unless it would result in a line with leading or trailing spaces.
    35  //
    36  // EOL runes within the input text are never written to the output; the output
    37  // line terminator and paragraph separator may be configured, and some EOL may
    38  // be output as a single space ' ' to maintain word separation.
    39  //
    40  // The algorithm greedily fills each output line with as many words as it can,
    41  // assuming that all Unicode code points have the same width.  Invalid UTF-8 is
    42  // silently transformed to the replacement character U+FFFD and treated as a
    43  // single rune.
    44  //
    45  // Flush must be called after the last call to Write; the input is buffered.
    46  //
    47  //   Implementation note: line breaking is a complicated topic.  This approach
    48  //   attempts to be simple and useful; a full implementation conforming to
    49  //   Unicode Standard Annex #14 would be complicated, and is not implemented.
    50  //   Languages that don't use spaces to separate words (e.g. CJK) won't work
    51  //   well under the current approach.
    52  //
    53  //   http://www.unicode.org/reports/tr14 [Unicode Line Breaking Algorithm]
    54  //   http://www.unicode.org/versions/Unicode4.0.0/ch05.pdf [5.8 Newline Guidelines]
type WrapWriter struct {
	// State configured by the user: the underlying writer that receives output
	// lines, the decoder applied to bytes passed to Write, the target width in
	// runes (a negative width means unlimited), the terminator written after
	// every output line, the separator emitted before the first line that
	// follows a terminated paragraph, the indents indexed by paragraph line,
	// and the flag that pins the writer in verbatim mode.
	w             io.Writer
	runeDecoder   RuneChunkDecoder
	width         runePos
	lineTerm      []byte
	paragraphSep  string
	indents       []string
	forceVerbatim bool

	// The buffer contains a single output line, including any paragraph
	// separator and indent that resetLine placed at its front.
	lineBuf byteRuneBuffer

	// Keep track of the previous state and rune.  Both are updated at the end of
	// every addRune call.
	prevState state
	prevRune  rune

	// Keep track of blank input lines.  Set whenever a letter is seen and
	// cleared on every EOL, so an EOL that finds it false marks a blank line.
	inputLineHasLetter bool

	// lineBuf positions where the line starts (after separators and indents), a
	// new word has started and the last word has ended.  newWordStart is -1 when
	// no word is currently open; lastWordEnd is -1 when no word has ended on the
	// current line, in which case writeLine writes nothing.
	lineStart    bytePos
	newWordStart bytePos
	lastWordEnd  bytePos

	// Keep track of paragraph terminations and line indices, so we can output the
	// paragraph separator and indents correctly.  wroteFirstLine becomes true
	// once any line has been written, which keeps a separator from preceding the
	// very first output line.
	terminateParagraph bool
	paragraphLineIndex int
	wroteFirstLine     bool
}
    87  
// state is the mode of the line-writing state machine.  nextState computes the
// state for each incoming rune, and addRune records it in prevState.
type state int

const (
	stateWordWrap  state = iota // Perform word-wrapping [start state]
	stateVerbatim               // Verbatim output-line, no word-wrapping
	stateSkipSpace              // Skip spaces in input line.
)
    95  
    96  // NewWrapWriter returns a new WrapWriter with the given target width in runes,
    97  // producing output on the underlying writer w.  The dec and enc are used to
    98  // respectively decode runes from Write calls, and encode runes to w.
    99  func NewWrapWriter(w io.Writer, width int, dec RuneChunkDecoder, enc RuneEncoder) *WrapWriter {
   100  	ret := &WrapWriter{
   101  		w:            w,
   102  		runeDecoder:  dec,
   103  		width:        runePos(width),
   104  		lineTerm:     []byte("\n"),
   105  		paragraphSep: "\n",
   106  		prevState:    stateWordWrap,
   107  		prevRune:     LineSeparator,
   108  		lineBuf:      byteRuneBuffer{enc: enc},
   109  	}
   110  	ret.resetLine()
   111  	return ret
   112  }
   113  
   114  // NewUTF8WrapWriter returns a new WrapWriter filter that implements io.Writer,
   115  // and decodes and encodes runes in UTF-8.
   116  func NewUTF8WrapWriter(w io.Writer, width int) *WrapWriter {
   117  	return NewWrapWriter(w, width, &UTF8ChunkDecoder{}, UTF8Encoder{})
   118  }
   119  
   120  // Width returns the target width in runes.  If width < 0 the width is
   121  // unlimited; each paragraph is output as a single line.
   122  func (w *WrapWriter) Width() int { return int(w.width) }
   123  
   124  // SetLineTerminator sets the line terminator for subsequent Write calls.  Every
   125  // output line is terminated with term; EOL runes from the input are never
   126  // written to the output.  A new WrapWriter instance uses "\n" as the default
   127  // line terminator.
   128  //
   129  // Calls Flush internally, and returns any Flush error.
   130  func (w *WrapWriter) SetLineTerminator(term string) error {
   131  	if err := w.Flush(); err != nil {
   132  		return err
   133  	}
   134  	w.lineTerm = []byte(term)
   135  	w.resetLine()
   136  	return nil
   137  }
   138  
   139  // SetParagraphSeparator sets the paragraph separator for subsequent Write
   140  // calls.  Every consecutive pair of non-empty paragraphs is separated with sep;
   141  // EOL runes from the input are never written to the output.  A new WrapWriter
   142  // instance uses "\n" as the default paragraph separator.
   143  //
   144  // Calls Flush internally, and returns any Flush error.
   145  func (w *WrapWriter) SetParagraphSeparator(sep string) error {
   146  	if err := w.Flush(); err != nil {
   147  		return err
   148  	}
   149  	w.paragraphSep = sep
   150  	w.resetLine()
   151  	return nil
   152  }
   153  
   154  // SetIndents sets the indentation for subsequent Write calls.  Multiple indents
   155  // may be set, corresponding to the indent to use for the corresponding
   156  // paragraph line.  E.g. SetIndents("AA", "BBB", C") means the first line in
   157  // each paragraph is indented with "AA", the second line in each paragraph is
   158  // indented with "BBB", and all subsequent lines in each paragraph are indented
   159  // with "C".
   160  //
   161  // SetIndents() is equivalent to SetIndents(""), SetIndents("", ""), etc.
   162  //
   163  // A new WrapWriter instance has no indents by default.
   164  //
   165  // Calls Flush internally, and returns any Flush error.
   166  func (w *WrapWriter) SetIndents(indents ...string) error {
   167  	if err := w.Flush(); err != nil {
   168  		return err
   169  	}
   170  	// Copy indents in case the user passed the slice via SetIndents(p...), and
   171  	// canonicalize the all empty case to nil.
   172  	allEmpty := true
   173  	w.indents = make([]string, len(indents))
   174  	for ix, indent := range indents {
   175  		w.indents[ix] = indent
   176  		if indent != "" {
   177  			allEmpty = false
   178  		}
   179  	}
   180  	if allEmpty {
   181  		w.indents = nil
   182  	}
   183  	w.resetLine()
   184  	return nil
   185  }
   186  
// ForceVerbatim forces w to stay in verbatim mode if v is true, or lets w
// perform its regular line writing algorithm if v is false.  This is useful if
// there is a sequence of lines that should be written verbatim, even if the
// lines don't start with spaces.
//
// Calls Flush internally, and returns any Flush error.  Unlike the other
// setters, the new mode is stored before Flush runs, so it stays in effect
// even when Flush returns an error.
func (w *WrapWriter) ForceVerbatim(v bool) error {
	// Set the flag first: Flush feeds a U+2028 through addRune, and nextState
	// consults w.forceVerbatim when choosing the state recorded in prevState,
	// which is the state the next Write starts from.
	w.forceVerbatim = v
	return w.Flush()
}
   197  
// Write implements io.Writer by buffering data into the WrapWriter w.  Actual
// writes to the underlying writer may occur, and may include data buffered in
// either this Write call or previous Write calls.
//
// The bytes are handed to WriteRuneChunk together with w.runeDecoder and
// w.addRune, and its results are returned unchanged.
//
// Flush must be called after the last call to Write.
func (w *WrapWriter) Write(data []byte) (int, error) {
	return WriteRuneChunk(w.runeDecoder, w.addRune, data)
}
   206  
   207  // Flush flushes any remaining buffered text, and resets the paragraph line
   208  // count back to 0, so that indents will be applied starting from the first
   209  // line.  It does not imply a paragraph separator; repeated calls to Flush with
   210  // no intervening calls to other methods is equivalent to a single Flush.
   211  //
   212  // Flush must be called after the last call to Write, and may be called an
   213  // arbitrary number of times before the last Write.
   214  func (w *WrapWriter) Flush() error {
   215  	if err := FlushRuneChunk(w.runeDecoder, w.addRune); err != nil {
   216  		return err
   217  	}
   218  	// Add U+2028 to force the last line (if any) to be written.
   219  	if err := w.addRune(LineSeparator); err != nil {
   220  		return err
   221  	}
   222  	// Reset the paragraph line count.
   223  	w.paragraphLineIndex = 0
   224  	w.resetLine()
   225  	return nil
   226  }
   227  
   228  // addRune is called every time w.runeDecoder decodes a full rune.
   229  func (w *WrapWriter) addRune(r rune) error {
   230  	state, lineBreak := w.nextState(r, w.updateRune(r))
   231  	if lineBreak {
   232  		if err := w.writeLine(); err != nil {
   233  			return err
   234  		}
   235  	}
   236  	w.bufferRune(r, state, lineBreak)
   237  	w.prevState = state
   238  	w.prevRune = r
   239  	return nil
   240  }
   241  
// kind is the classification of an input rune.  We classify each incoming rune
// into three kinds for easier handling; runeKind performs the mapping.
type kind int

const (
	// kindEOL is an end-of-line rune: \f, \n, \r, \v, U+2028 or U+2029.
	kindEOL kind = iota
	// kindSpace is a rune that unicode.IsSpace reports as space and that is not
	// one of the EOL runes.
	kindSpace
	// kindLetter is every other rune.
	kindLetter
)
   250  
   251  func runeKind(r rune) kind {
   252  	switch r {
   253  	case '\f', '\n', '\r', '\v', LineSeparator, ParagraphSeparator:
   254  		return kindEOL
   255  	}
   256  	if unicode.IsSpace(r) {
   257  		return kindSpace
   258  	}
   259  	return kindLetter
   260  }
   261  
   262  func (w *WrapWriter) updateRune(r rune) bool {
   263  	forceLineBreak := false
   264  	switch kind := runeKind(r); kind {
   265  	case kindEOL:
   266  		// Update lastWordEnd if the last word just ended.
   267  		if w.newWordStart != -1 {
   268  			w.newWordStart = -1
   269  			w.lastWordEnd = w.lineBuf.ByteLen()
   270  		}
   271  		switch {
   272  		case w.prevRune == '\r' && r == '\n':
   273  			// Treat "\r\n" as a single EOL; we've already handled the logic for '\r',
   274  			// so there's nothing to do when we see '\n'.
   275  		case r == LineSeparator:
   276  			// Treat U+2028 as a pure line break; it's never a paragraph break.
   277  			forceLineBreak = true
   278  		case r == ParagraphSeparator || !w.inputLineHasLetter:
   279  			// The paragraph has just been terminated if we see an explicit U+2029, or
   280  			// if we see a blank line, which may contain spaces.
   281  			forceLineBreak = true
   282  			w.terminateParagraph = true
   283  		}
   284  		w.inputLineHasLetter = false
   285  	case kindSpace:
   286  		// Update lastWordEnd if the last word just ended.
   287  		if w.newWordStart != -1 {
   288  			w.newWordStart = -1
   289  			w.lastWordEnd = w.lineBuf.ByteLen()
   290  		}
   291  	case kindLetter:
   292  		// Update newWordStart if a new word just started.
   293  		if w.newWordStart == -1 {
   294  			w.newWordStart = w.lineBuf.ByteLen()
   295  		}
   296  		w.inputLineHasLetter = true
   297  		w.terminateParagraph = false
   298  	default:
   299  		panic(fmt.Errorf("textutil: updateRune unhandled kind %d", kind))
   300  	}
   301  	return forceLineBreak
   302  }
   303  
   304  // nextState returns the next state and whether we should break the line.
   305  //
   306  // Here's a handy table that describes all the scenarios in which we will line
   307  // break input text, grouped by the reason for the break.  The current position
   308  // is the last non-* rune in each pattern, which is where we decide to break.
   309  //
   310  //              w.prevState   Next state   Buffer reset
   311  //              -----------   ----------   ------------
   312  //   ===== Force line break (U+2028 / U+2029, blank line) =====
   313  //   a..*|***   *             wordWrap     empty
   314  //   a._.|***   *             wordWrap     empty
   315  //   a+**|***   *             wordWrap     empty
   316  //
   317  //   ===== verbatim: wait for any EOL =====
   318  //   _*.*|***   verbatim      wordWrap     empty
   319  //
   320  //   ===== wordWrap: switch to verbatim =====
   321  //   a._*|***   wordWrap      verbatim     empty
   322  //
   323  //   ===== wordWrap: line is too wide =====
   324  //   abc.|***   wordWrap      wordWrap     empty
   325  //   abcd|.**   wordWrap      wordWrap     empty
   326  //   abcd|e.*   wordWrap      wordWrap     empty
   327  //   a_cd|.**   wordWrap      wordWrap     empty
   328  //
   329  //   abc_|***   wordWrap      skipSpace    empty
   330  //   abcd|_**   wordWrap      skipSpace    empty
   331  //   abcd|e_*   wordWrap      skipSpace    empty
   332  //   a_cd|_**   wordWrap      skipSpace    empty
   333  //
   334  //   a_cd|e**   wordWrap      start        newWordStart
   335  //
   336  //   LEGEND
   337  //     abcde  Letter
   338  //     .      End-of-line
   339  //     +      End-of-line (only U+2028 / U+2029)
   340  //     _      Space
   341  //     *      Any rune (letter, line-end or space)
   342  //     |      Visual indication of width=4, has no width itself.
   343  //
   344  // Note that Flush calls behave exactly as if an explicit U+2028 line separator
   345  // were added to the end of all buffered data.
   346  func (w *WrapWriter) nextState(r rune, forceLineBreak bool) (state, bool) {
   347  	kind := runeKind(r)
   348  	if w.forceVerbatim {
   349  		return stateVerbatim, forceLineBreak || kind == kindEOL
   350  	}
   351  	if forceLineBreak {
   352  		return stateWordWrap, true
   353  	}
   354  	// Handle non word-wrap states, which are easy.
   355  	switch w.prevState {
   356  	case stateVerbatim:
   357  		if kind == kindEOL {
   358  			return stateWordWrap, true
   359  		}
   360  		return stateVerbatim, false
   361  	case stateSkipSpace:
   362  		if kind == kindSpace {
   363  			return stateSkipSpace, false
   364  		}
   365  		return stateWordWrap, false
   366  	}
   367  	// Handle stateWordWrap, which is more complicated.
   368  
   369  	// Switch to the verbatim state when we see a space right after an EOL.
   370  	if runeKind(w.prevRune) == kindEOL && kind == kindSpace {
   371  		return stateVerbatim, true
   372  	}
   373  	// Break on EOL or space when the line is too wide.  See above table.
   374  	if w.width >= 0 && w.width <= w.lineBuf.RuneLen()+1 {
   375  		switch kind {
   376  		case kindEOL:
   377  			return stateWordWrap, true
   378  		case kindSpace:
   379  			return stateSkipSpace, true
   380  		}
   381  		// case kindLetter falls through
   382  	}
   383  	// Handle the newWordStart case in the above table.
   384  	if w.width >= 0 && w.width < w.lineBuf.RuneLen()+1 && w.newWordStart != w.lineStart {
   385  		return stateWordWrap, true
   386  	}
   387  	// Stay in the wordWrap state and don't break the line.
   388  	return stateWordWrap, false
   389  }
   390  
   391  func (w *WrapWriter) writeLine() error {
   392  	if w.lastWordEnd == -1 {
   393  		// Don't write blank lines, but we must reset the line in case the paragraph
   394  		// has just been terminated.
   395  		w.resetLine()
   396  		return nil
   397  	}
   398  	// Write the line (without trailing spaces) followed by the line terminator.
   399  	line := w.lineBuf.Bytes()[:w.lastWordEnd]
   400  	if _, err := w.w.Write(line); err != nil {
   401  		return err
   402  	}
   403  	if _, err := w.w.Write(w.lineTerm); err != nil {
   404  		return err
   405  	}
   406  	// Reset the line buffer.
   407  	w.wroteFirstLine = true
   408  	w.paragraphLineIndex++
   409  	if w.newWordStart != -1 {
   410  		// If we have an unterminated new word, we must be in the newWordStart case
   411  		// in the table above.  Handle the special buffer reset here.
   412  		newWord := string(w.lineBuf.Bytes()[w.newWordStart:])
   413  		w.resetLine()
   414  		w.newWordStart = w.lineBuf.ByteLen()
   415  		w.lineBuf.WriteString(newWord)
   416  	} else {
   417  		w.resetLine()
   418  	}
   419  	return nil
   420  }
   421  
   422  func (w *WrapWriter) resetLine() {
   423  	w.lineBuf.Reset()
   424  	w.newWordStart = -1
   425  	w.lastWordEnd = -1
   426  	// Write the paragraph separator if the previous paragraph has terminated.
   427  	// This consumes no runes from the line width.
   428  	if w.wroteFirstLine && w.terminateParagraph {
   429  		w.lineBuf.WriteString0Runes(w.paragraphSep)
   430  		w.paragraphLineIndex = 0
   431  	}
   432  	// Add indent; a non-empty indent consumes runes from the line width.
   433  	var indent string
   434  	switch {
   435  	case w.paragraphLineIndex < len(w.indents):
   436  		indent = w.indents[w.paragraphLineIndex]
   437  	case len(w.indents) > 0:
   438  		indent = w.indents[len(w.indents)-1]
   439  	}
   440  	w.lineBuf.WriteString(indent)
   441  	w.lineStart = w.lineBuf.ByteLen()
   442  }
   443  
   444  func (w *WrapWriter) bufferRune(r rune, state state, lineBreak bool) {
   445  	// Never add leading spaces to the buffer in the wordWrap state.
   446  	wordWrapNoLeadingSpaces := state == stateWordWrap && !lineBreak
   447  	switch kind := runeKind(r); kind {
   448  	case kindEOL:
   449  		// When we're word-wrapping and we see a letter followed by EOL, we convert
   450  		// the EOL into a single space in the buffer, to break the previous word
   451  		// from the next word.
   452  		if wordWrapNoLeadingSpaces && runeKind(w.prevRune) == kindLetter {
   453  			w.lineBuf.WriteRune(' ')
   454  		}
   455  	case kindSpace:
   456  		if wordWrapNoLeadingSpaces || state == stateVerbatim {
   457  			w.lineBuf.WriteRune(r)
   458  		}
   459  	case kindLetter:
   460  		w.lineBuf.WriteRune(r)
   461  	default:
   462  		panic(fmt.Errorf("textutil: bufferRune unhandled kind %d", kind))
   463  	}
   464  }