// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// This file implements source, a buffered rune reader
// which is specialized for the needs of the Go scanner:
// Contiguous sequences of runes (literals) are extracted
// directly as []byte without the need to re-encode the
// runes in UTF-8 (as would be necessary with bufio.Reader).
//
// This file is self-contained (go tool compile source.go
// compiles) and thus could be made into its own package.

package syntax

import (
	"io"
	"unicode/utf8"
)

// starting points for line and column numbers
const linebase = 1
const colbase = 1

// buf [...read...|...|...unread...|s|...free...]
//         ^      ^   ^             ^
//         |      |   |             |
//        suf     r0  r             w

// source is a buffered rune reader over src. It tracks the line and
// column of the current read position, remembers the previous position
// so that reads can be undone (see ungetr and ungetr2), and can capture
// the raw bytes of a literal while it is being read (see startLit and
// stopLit).
type source struct {
	src  io.Reader
	errh func(line, pos uint, msg string)

	// source buffer
	buf         [4 << 10]byte
	offs        int   // source offset of buf
	r0, r, w    int   // previous/current read and write buf positions, excluding sentinel
	line0, line uint  // previous/current line
	col0, col   uint  // previous/current column (byte offsets from line start)
	ioerr       error // pending io error

	// literal buffer
	lit []byte // literal prefix
	suf int    // literal suffix; suf >= 0 means we are scanning a literal
}

// init initializes source to read from src and to report errors via errh.
// errh must not be nil.
//
// init resets all reading state, so a source may be reused for a new
// reader; the literal prefix buffer keeps its capacity across calls.
func (s *source) init(src io.Reader, errh func(line, pos uint, msg string)) {
	s.src = src
	s.errh = errh

	s.buf[0] = utf8.RuneSelf // terminate with sentinel
	s.offs = 0
	s.r0, s.r, s.w = 0, 0, 0
	s.line0, s.line = 0, linebase
	s.col0, s.col = 0, colbase
	s.ioerr = nil

	s.lit = s.lit[:0]
	s.suf = -1
}

// ungetr ungets the most recently read rune by restoring the read
// position, line, and column recorded at the start of the last getr.
func (s *source) ungetr() {
	s.r, s.line, s.col = s.r0, s.line0, s.col0
}

// ungetr2 is like ungetr but enables a 2nd ungetr.
// It must not be called if one of the runes seen
// was a newline.
//
// It moves the saved previous position back by one byte and one
// column, so it is only correct if the rune before the one being
// ungot is a single byte wide.
func (s *source) ungetr2() {
	s.ungetr()
	// line must not have changed
	s.r0--
	s.col0--
}

// error reports msg through the error handler installed with init.
// The reported position is (s.line0, s.col0), i.e., the position at
// which the most recent rune read by getr started.
func (s *source) error(msg string) {
	s.errh(s.line0, s.col0, msg)
}

// getr reads and returns the next rune.
// It returns -1 once the end of the source (or a read error) is reached.
//
// If a read or source encoding error occurs, getr
// calls the error handler installed with init.
// The handler must exist.
//
// NUL bytes, invalid UTF-8 sequences, and byte order marks are not
// returned: getr reports them (a BOM only if it is not the very first
// character) and continues with the following rune.
//
// The (line, col) position passed to the error handler is the
// position at which the offending rune starts; for I/O errors
// that is the current source reading position.
func (s *source) getr() rune {
redo:
	s.r0, s.line0, s.col0 = s.r, s.line, s.col

	// We could avoid at least one test that is always taken in the
	// for loop below by duplicating the common case code (ASCII)
	// here since we always have at least the sentinel (utf8.RuneSelf)
	// in the buffer. Measure and optimize if necessary.

	// make sure we have at least one rune in buffer, or we are at EOF
	for s.r+utf8.UTFMax > s.w && !utf8.FullRune(s.buf[s.r:s.w]) && s.ioerr == nil && s.w-s.r < len(s.buf) {
		s.fill() // s.w-s.r < len(s.buf) => buffer is not full
	}

	// common case: ASCII and enough bytes
	// (invariant: s.buf[s.w] == utf8.RuneSelf)
	if b := s.buf[s.r]; b < utf8.RuneSelf {
		s.r++
		// TODO(gri) Optimization: Instead of adjusting s.col for each character,
		// remember the line offset instead and then compute the offset as needed
		// (which is less often).
		s.col++
		if b == 0 {
			s.error("invalid NUL character")
			goto redo
		}
		if b == '\n' {
			s.line++
			s.col = colbase
		}
		return rune(b)
	}

	// EOF
	// (The loop above only stops with s.r == s.w once s.ioerr is set,
	// so s.ioerr is non-nil here.)
	if s.r == s.w {
		if s.ioerr != io.EOF {
			// ensure we never start with a '/' (e.g., rooted path) in the error message
			s.error("I/O error: " + s.ioerr.Error())
		}
		return -1
	}

	// uncommon case: not ASCII
	r, w := utf8.DecodeRune(s.buf[s.r:s.w])
	s.r += w
	s.col += uint(w)

	if r == utf8.RuneError && w == 1 {
		s.error("invalid UTF-8 encoding")
		goto redo
	}

	// BOM's are only allowed as the first character in a file
	const BOM = 0xfeff
	if r == BOM {
		if s.r0 > 0 { // s.r0 is always > 0 after 1st character (fill will set it to 1)
			s.error("invalid BOM in the middle of the file")
		}
		goto redo
	}

	return r
}

// fill reads more data from s.src into the buffer.
//
// Before reading, it makes room by sliding the unread bytes to the
// beginning of the buffer; bytes of a pending literal that would be
// lost by the slide are first saved in s.lit. After reading, the
// sentinel is re-established at s.buf[s.w]. Any error returned by the
// reader is recorded in s.ioerr; if the reader repeatedly returns no
// data and no error, s.ioerr is set to io.ErrNoProgress.
//
// fill panics if the reader returns a byte count that is negative or
// larger than the slice it was given, since either indicates an
// incorrect io.Reader implementation.
func (s *source) fill() {
	// Slide unread bytes to beginning but preserve last read char
	// (for one ungetr call) plus one extra byte (for a 2nd ungetr
	// call, only for ".." character sequence and float literals
	// starting with ".").
	if s.r0 > 1 {
		// save literal prefix, if any
		// (We see at most one ungetr call while reading
		// a literal, so make sure s.r0 remains in buf.)
		if s.suf >= 0 {
			s.lit = append(s.lit, s.buf[s.suf:s.r0]...)
			s.suf = 1 // == s.r0 after slide below
		}
		n := s.r0 - 1
		copy(s.buf[:], s.buf[n:s.w])
		s.offs += n
		s.r0 = 1 // eqv: s.r0 -= n
		s.r -= n
		s.w -= n
	}

	// read more data: try a limited number of times
	for i := 100; i > 0; i-- {
		dst := s.buf[s.w : len(s.buf)-1] // -1 to leave space for sentinel
		n, err := s.src.Read(dst)
		if n < 0 {
			panic("negative read") // incorrect underlying io.Reader implementation
		}
		if n > len(dst) {
			// Also an incorrect io.Reader implementation: accepting the
			// count would move s.w past the slot reserved for the sentinel.
			panic("read count exceeds buffer size")
		}
		s.w += n
		if n > 0 || err != nil {
			s.buf[s.w] = utf8.RuneSelf // sentinel
			if err != nil {
				s.ioerr = err
			}
			return
		}
	}

	s.ioerr = io.ErrNoProgress
}

// startLit starts recording a literal. The literal begins with the
// most recently read rune (the one starting at s.r0).
func (s *source) startLit() {
	s.suf = s.r0
	s.lit = s.lit[:0] // reuse lit
}

// stopLit stops recording and returns the bytes of the literal, from
// the rune at which startLit was called up to the current read position.
//
// The result may share memory with s.buf or s.lit, both of which are
// overwritten by later reads and literals; callers that need to keep
// the bytes should copy them before reading further.
func (s *source) stopLit() []byte {
	lit := s.buf[s.suf:s.r]
	if len(s.lit) > 0 {
		lit = append(s.lit, lit...)
	}
	s.killLit()
	return lit
}

// killLit stops recording a literal and discards what was recorded.
func (s *source) killLit() {
	s.suf = -1 // no pending literal
}