github.com/NeowayLabs/nash@v0.2.2-0.20200127205349-a227041ffd50/stdbin/strings/strings.go (about)

     1  package main
     2  
     3  import (
     4  	"bufio"
     5  	"fmt"
     6  	"io"
     7  	"unicode/utf8"
     8  )
     9  
    10  func Do(input io.Reader, minTextSize uint) *bufio.Scanner {
    11  	outputReader, outputWriter := io.Pipe()
    12  	go searchstrings(input, minTextSize, outputWriter)
    13  	scanner := bufio.NewScanner(outputReader)
    14  
    15  	const maxBufferSize = 1024 * 1024
    16  	b := make([]byte, maxBufferSize)
    17  	scanner.Buffer(b, maxBufferSize)
    18  	return scanner
    19  }
    20  
    21  func searchstrings(input io.Reader, minTextSize uint, output *io.PipeWriter) {
    22  
    23  	newline := byte('\n')
    24  	searcher := wordSearcher{minTextSize: minTextSize}
    25  
    26  	write := func(data []byte) error {
    27  		data = append(data, newline)
    28  		n, err := output.Write(data)
    29  		if n != len(data) {
    30  			return fmt.Errorf(
    31  				"expected to write[%d] wrote[%d]",
    32  				len(data),
    33  				n,
    34  			)
    35  		}
    36  		return err
    37  	}
    38  
    39  	handleIOError := func(err error) bool {
    40  		if err != nil {
    41  			var finalwriteerr error
    42  
    43  			if text, ok := searcher.flushBuffer(); ok {
    44  				finalwriteerr = write(text)
    45  			}
    46  			if err == io.EOF {
    47  				if finalwriteerr == nil {
    48  					output.Close()
    49  				} else {
    50  					output.CloseWithError(fmt.Errorf(
    51  						"error[%s] writing last data",
    52  						finalwriteerr,
    53  					))
    54  				}
    55  			} else {
    56  				output.CloseWithError(err)
    57  			}
    58  			return true
    59  		}
    60  		return false
    61  	}
    62  
    63  	data := make([]byte, 1)
    64  	for {
    65  		// WHY: Don't see the point of checking N when reading a single byte
    66  		n, err := input.Read(data)
    67  
    68  		if n <= 0 {
    69  			if handleIOError(err) {
    70  				return
    71  			}
    72  			// WHY:
    73  			// Implementations of Read are discouraged from
    74  			// returning a zero byte count with a nil error,
    75  			// except when len(p) == 0.
    76  			// Callers should treat a return of 0 and nil as
    77  			// indicating that nothing happened; in particular it
    78  			// does not indicate EOF.
    79  			continue
    80  		}
    81  
    82  		if text, ok := searcher.next(data[0]); ok {
    83  			err = write(text)
    84  		}
    85  
    86  		if handleIOError(err) {
    87  			return
    88  		}
    89  	}
    90  }
    91  
    92  type byteType int
    93  
    94  type wordSearcher struct {
    95  	buffer         []byte
    96  	possibleRune   []byte
    97  	waitingForRune bool
    98  	minTextSize    uint
    99  }
   100  
   101  const (
   102  	binaryType byteType = iota
   103  	asciiType
   104  	runeStartType
   105  )
   106  
   107  func (w *wordSearcher) next(b byte) ([]byte, bool) {
   108  	if w.waitingForRune {
   109  		return w.nextRune(b)
   110  	}
   111  	return w.nextASCII(b)
   112  }
   113  
   114  func (w *wordSearcher) nextRune(b byte) ([]byte, bool) {
   115  
   116  	const maxUTFSize = 4
   117  
   118  	if b == 0 {
   119  		w.resetRuneSearch()
   120  		return w.flushBuffer()
   121  	}
   122  
   123  	if word := string([]byte{b}); utf8.ValidString(word) {
   124  		w.resetRuneSearch()
   125  		data, ok := w.flushBuffer()
   126  		w.writeOnBuffer(b)
   127  		return data, ok
   128  	}
   129  
   130  	if utf8.RuneStart(b) {
   131  		w.resetRuneSearch()
   132  		data, ok := w.flushBuffer()
   133  		w.startRuneSearch(b)
   134  		return data, ok
   135  	}
   136  
   137  	w.writeOnPossibleRune(b)
   138  	if utf8.ValidString(string(w.possibleRune)) {
   139  		w.writeOnBuffer(w.possibleRune...)
   140  		w.resetRuneSearch()
   141  		return nil, false
   142  	}
   143  
   144  	if len(w.possibleRune) == maxUTFSize {
   145  		w.resetRuneSearch()
   146  		return w.flushBuffer()
   147  	}
   148  
   149  	return nil, false
   150  }
   151  
   152  func (w *wordSearcher) resetRuneSearch() {
   153  	w.waitingForRune = false
   154  	w.possibleRune = nil
   155  }
   156  
   157  func (w *wordSearcher) nextASCII(b byte) ([]byte, bool) {
   158  	switch bytetype(b) {
   159  	case binaryType:
   160  		{
   161  			return w.flushBuffer()
   162  		}
   163  	case asciiType:
   164  		{
   165  			w.writeOnBuffer(b)
   166  		}
   167  	case runeStartType:
   168  		{
   169  			w.startRuneSearch(b)
   170  		}
   171  	}
   172  	return nil, false
   173  }
   174  
   175  func (w *wordSearcher) startRuneSearch(b byte) {
   176  	w.waitingForRune = true
   177  	w.writeOnPossibleRune(b)
   178  }
   179  
   180  func (w *wordSearcher) writeOnBuffer(b ...byte) {
   181  	w.buffer = append(w.buffer, b...)
   182  }
   183  
   184  func (w *wordSearcher) writeOnPossibleRune(b byte) {
   185  	w.possibleRune = append(w.possibleRune, b)
   186  }
   187  
   188  func (w *wordSearcher) bufferLenInRunes() uint {
   189  	return uint(len([]rune(string(w.buffer))))
   190  }
   191  
   192  func (w *wordSearcher) flushBuffer() ([]byte, bool) {
   193  	if len(w.buffer) == 0 {
   194  		return nil, false
   195  	}
   196  	if w.bufferLenInRunes() < w.minTextSize {
   197  		w.buffer = nil
   198  		return nil, false
   199  	}
   200  	b := w.buffer
   201  	w.buffer = nil
   202  	return b, true
   203  }
   204  
   205  func bytetype(b byte) byteType {
   206  	if b == 0 {
   207  		return binaryType
   208  	}
   209  	if word := string([]byte{b}); utf8.ValidString(word) {
   210  		return asciiType
   211  	}
   212  	if utf8.RuneStart(b) {
   213  		return runeStartType
   214  	}
   215  	return binaryType
   216  }