package main

import (
	"bufio"
	"fmt"
	"io"
	"unicode/utf8"
)

// Do extracts the runs of text found in input and returns a scanner that
// yields them one per Scan call.
//
// A run is a sequence of non-NUL ASCII bytes and valid multi-byte UTF-8
// sequences; it is reported only when it is at least minTextSize runes long.
// The search runs in a separate goroutine that writes each run, followed by
// a newline, into a pipe read by the returned scanner. Because the scanner
// splits on lines, a run that itself contains a newline byte is seen as
// several tokens.
//
// A read error on input other than io.EOF is reported by the scanner's Err
// method. The goroutine blocks on the pipe until its output is consumed, so
// callers should scan until Scan returns false.
func Do(input io.Reader, minTextSize uint) *bufio.Scanner {
	outputReader, outputWriter := io.Pipe()
	go searchstrings(input, minTextSize, outputWriter)

	// The default scanner token limit is too small for long runs of text.
	const maxBufferSize = 1024 * 1024
	scanner := bufio.NewScanner(outputReader)
	scanner.Buffer(make([]byte, maxBufferSize), maxBufferSize)
	return scanner
}

// searchstrings reads input until it fails or is exhausted, feeding every
// byte to a wordSearcher and writing each run of text it reports to output,
// terminated by a newline. It always closes output before returning: cleanly
// on io.EOF, with the offending error otherwise.
func searchstrings(input io.Reader, minTextSize uint, output *io.PipeWriter) {
	searcher := wordSearcher{minTextSize: minTextSize}

	// write sends one run of text, newline terminated, down the pipe.
	write := func(data []byte) error {
		data = append(data, '\n')
		n, err := output.Write(data)
		if err != nil {
			// Report the real cause instead of hiding it behind the
			// short write message below.
			return err
		}
		if n != len(data) {
			return fmt.Errorf(
				"expected to write[%d] wrote[%d]",
				len(data),
				n,
			)
		}
		return nil
	}

	// finish flushes whatever text is still buffered and closes the pipe,
	// propagating err unless it is io.EOF.
	finish := func(err error) {
		var flushErr error
		if text, ok := searcher.flushBuffer(); ok {
			flushErr = write(text)
		}

		switch {
		case err != io.EOF:
			output.CloseWithError(err)
		case flushErr != nil:
			output.CloseWithError(fmt.Errorf(
				"error[%s] writing last data",
				flushErr,
			))
		default:
			output.Close()
		}
	}

	// WHY: the searcher consumes one byte at a time, but asking the
	// underlying reader for single bytes costs one Read call per byte.
	// bufio.Reader batches the reads, hands back an error that arrived
	// together with data only after that data has been consumed, and gives
	// up with io.ErrNoProgress on readers that keep returning (0, nil).
	reader := bufio.NewReader(input)
	for {
		b, err := reader.ReadByte()
		if err != nil {
			finish(err)
			return
		}

		text, ok := searcher.next(b)
		if !ok {
			continue
		}
		if err := write(text); err != nil {
			finish(err)
			return
		}
	}
}

// byteType classifies a single input byte for the wordSearcher.
type byteType int

// wordSearcher is a byte-at-a-time state machine that accumulates runs of
// text and hands them back once they are interrupted.
type wordSearcher struct {
	// buffer holds the bytes of the run of text collected so far.
	buffer []byte
	// possibleRune holds the bytes of a multi-byte sequence that has not
	// been validated as UTF-8 yet.
	possibleRune []byte
	// waitingForRune is true while possibleRune is being filled.
	waitingForRune bool
	// minTextSize is the minimum length, in runes, of a reported run.
	minTextSize uint
}

const (
	// binaryType is a byte that cannot be part of text: NUL or a stray
	// UTF-8 continuation byte.
	binaryType byteType = iota
	// asciiType is a non-NUL single-byte (ASCII) character.
	asciiType
	// runeStartType is a byte that may start a multi-byte UTF-8 sequence.
	runeStartType
)

// next feeds b to the searcher. When b ends a run of text that is long
// enough, next returns that run and true; otherwise it returns nil and false.
func (w *wordSearcher) next(b byte) ([]byte, bool) {
	if w.waitingForRune {
		return w.nextRune(b)
	}
	return w.nextASCII(b)
}

// nextRune handles b while a multi-byte sequence is pending. Any byte that
// is not a continuation byte abandons the pending sequence and ends the
// current run; continuation bytes are collected until the sequence becomes
// valid UTF-8 or grows to utf8.UTFMax bytes without doing so.
func (w *wordSearcher) nextRune(b byte) ([]byte, bool) {
	switch {
	case b == 0:
		w.resetRuneSearch()
		return w.flushBuffer()
	case b < utf8.RuneSelf:
		// An ASCII byte interrupts the sequence but starts the next run.
		w.resetRuneSearch()
		data, ok := w.flushBuffer()
		w.writeOnBuffer(b)
		return data, ok
	case utf8.RuneStart(b):
		// Another start byte: drop the pending sequence, begin a new one.
		w.resetRuneSearch()
		data, ok := w.flushBuffer()
		w.startRuneSearch(b)
		return data, ok
	}

	w.writeOnPossibleRune(b)
	if utf8.Valid(w.possibleRune) {
		// The sequence is complete; it becomes part of the current run.
		w.writeOnBuffer(w.possibleRune...)
		w.resetRuneSearch()
		return nil, false
	}

	if len(w.possibleRune) == utf8.UTFMax {
		// No UTF-8 encoding is longer than this, so it is not text.
		w.resetRuneSearch()
		return w.flushBuffer()
	}

	return nil, false
}

// resetRuneSearch discards the pending multi-byte sequence.
func (w *wordSearcher) resetRuneSearch() {
	w.waitingForRune = false
	// Keep the backing array: its bytes are copied, never shared, when a
	// sequence is accepted into buffer.
	w.possibleRune = w.possibleRune[:0]
}

// nextASCII handles b while no multi-byte sequence is pending.
func (w *wordSearcher) nextASCII(b byte) ([]byte, bool) {
	switch bytetype(b) {
	case binaryType:
		return w.flushBuffer()
	case asciiType:
		w.writeOnBuffer(b)
	case runeStartType:
		w.startRuneSearch(b)
	}
	return nil, false
}

// startRuneSearch begins collecting a multi-byte sequence starting with b.
func (w *wordSearcher) startRuneSearch(b byte) {
	w.waitingForRune = true
	w.writeOnPossibleRune(b)
}

// writeOnBuffer appends b to the current run of text.
func (w *wordSearcher) writeOnBuffer(b ...byte) {
	w.buffer = append(w.buffer, b...)
}

// writeOnPossibleRune appends b to the pending multi-byte sequence.
func (w *wordSearcher) writeOnPossibleRune(b byte) {
	w.possibleRune = append(w.possibleRune, b)
}

// bufferLenInRunes returns the length of the current run in runes.
func (w *wordSearcher) bufferLenInRunes() uint {
	return uint(utf8.RuneCount(w.buffer))
}

// flushBuffer ends the current run. It returns the run and true when the run
// has at least minTextSize runes; shorter (or empty) runs are dropped and
// nil, false is returned. Ownership of the returned slice passes to the
// caller.
func (w *wordSearcher) flushBuffer() ([]byte, bool) {
	if len(w.buffer) == 0 {
		return nil, false
	}
	if w.bufferLenInRunes() < w.minTextSize {
		// The run was never handed out, so its storage can be reused.
		w.buffer = w.buffer[:0]
		return nil, false
	}
	b := w.buffer
	w.buffer = nil
	return b, true
}

// bytetype classifies b as binary, ASCII text or the possible start of a
// multi-byte UTF-8 sequence.
func bytetype(b byte) byteType {
	switch {
	case b == 0:
		return binaryType
	case b < utf8.RuneSelf:
		return asciiType
	case utf8.RuneStart(b):
		return runeStartType
	default:
		return binaryType
	}
}