// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// This file implements source, a buffered rune reader
// which is specialized for the needs of the Go scanner:
// Contiguous sequences of runes (literals) are extracted
// directly as []byte without the need to re-encode the
// runes in UTF-8 (as would be necessary with bufio.Reader).
//
// This file is self-contained (go tool compile source.go
// compiles) and thus could be made into its own package.

package syntax

import (
	"io"
	"unicode/utf8"
)

// starting points for line and column numbers
const linebase = 1
const colbase = 1

// Layout of the source buffer:
//
// buf [...read...|...|...unread...|s|...free...]
//         ^      ^   ^            ^
//         |      |   |            |
//        suf     r0  r            w
//
// suf is the buffer position where the pending literal (if any) starts,
// r0 is the position of the most recently read rune (the target of ungetr),
// r is the position of the next rune to read, and w is the position just
// past the unread data. s is the sentinel byte (utf8.RuneSelf) stored at
// buf[w]; getr relies on it to detect the end of the buffered data without
// an explicit bounds test in the ASCII fast path.

// source is a buffered rune reader that tracks line and column positions
// and can hand out the bytes of a scanned literal without re-encoding them.
// It must be set up with init before use.
type source struct {
	src  io.Reader                        // underlying reader supplying the source bytes
	errh func(line, pos uint, msg string) // error handler; invoked by the error method with (line0, col0, msg)

	// source buffer
	buf         [4 << 10]byte // fixed-size (4 KiB) window onto the source, including room for the sentinel
	offs        int           // source offset of buf
	r0, r, w    int           // previous/current read and write buf positions, excluding sentinel
	line0, line uint          // previous/current line
	col0, col   uint          // previous/current column (byte offsets from line start)
	ioerr       error         // pending io error

	// literal buffer
	lit []byte // literal prefix
	suf int    // literal suffix; suf >= 0 means we are scanning a literal
}

// init initializes source to read from src and to report errors via errh.
// errh must not be nil.
49 func (s *source) init(src io.Reader, errh func(line, pos uint, msg string)) { 50 s.src = src 51 s.errh = errh 52 53 s.buf[0] = utf8.RuneSelf // terminate with sentinel 54 s.offs = 0 55 s.r0, s.r, s.w = 0, 0, 0 56 s.line0, s.line = 0, linebase 57 s.col0, s.col = 0, colbase 58 s.ioerr = nil 59 60 s.lit = s.lit[:0] 61 s.suf = -1 62 } 63 64 // ungetr ungets the most recently read rune. 65 func (s *source) ungetr() { 66 s.r, s.line, s.col = s.r0, s.line0, s.col0 67 } 68 69 // ungetr2 is like ungetr but enables a 2nd ungetr. 70 // It must not be called if one of the runes seen 71 // was a newline. 72 func (s *source) ungetr2() { 73 s.ungetr() 74 // line must not have changed 75 s.r0-- 76 s.col0-- 77 } 78 79 func (s *source) error(msg string) { 80 s.errh(s.line0, s.col0, msg) 81 } 82 83 // getr reads and returns the next rune. 84 // 85 // If a read or source encoding error occurs, getr 86 // calls the error handler installed with init. 87 // The handler must exist. 88 // 89 // The (line, col) position passed to the error handler 90 // is always at the current source reading position. 91 func (s *source) getr() rune { 92 redo: 93 s.r0, s.line0, s.col0 = s.r, s.line, s.col 94 95 // We could avoid at least one test that is always taken in the 96 // for loop below by duplicating the common case code (ASCII) 97 // here since we always have at least the sentinel (utf8.RuneSelf) 98 // in the buffer. Measure and optimize if necessary. 
99 100 // make sure we have at least one rune in buffer, or we are at EOF 101 for s.r+utf8.UTFMax > s.w && !utf8.FullRune(s.buf[s.r:s.w]) && s.ioerr == nil && s.w-s.r < len(s.buf) { 102 s.fill() // s.w-s.r < len(s.buf) => buffer is not full 103 } 104 105 // common case: ASCII and enough bytes 106 // (invariant: s.buf[s.w] == utf8.RuneSelf) 107 if b := s.buf[s.r]; b < utf8.RuneSelf { 108 s.r++ 109 // TODO(gri) Optimization: Instead of adjusting s.col for each character, 110 // remember the line offset instead and then compute the offset as needed 111 // (which is less often). 112 s.col++ 113 if b == 0 { 114 s.error("invalid NUL character") 115 goto redo 116 } 117 if b == '\n' { 118 s.line++ 119 s.col = colbase 120 } 121 return rune(b) 122 } 123 124 // EOF 125 if s.r == s.w { 126 if s.ioerr != io.EOF { 127 s.error(s.ioerr.Error()) 128 } 129 return -1 130 } 131 132 // uncommon case: not ASCII 133 r, w := utf8.DecodeRune(s.buf[s.r:s.w]) 134 s.r += w 135 s.col += uint(w) 136 137 if r == utf8.RuneError && w == 1 { 138 s.error("invalid UTF-8 encoding") 139 goto redo 140 } 141 142 // BOM's are only allowed as the first character in a file 143 const BOM = 0xfeff 144 if r == BOM { 145 if s.r0 > 0 { // s.r0 is always > 0 after 1st character (fill will set it to 1) 146 s.error("invalid BOM in the middle of the file") 147 } 148 goto redo 149 } 150 151 return r 152 } 153 154 func (s *source) fill() { 155 // Slide unread bytes to beginning but preserve last read char 156 // (for one ungetr call) plus one extra byte (for a 2nd ungetr 157 // call, only for ".." character sequence and float literals 158 // starting with "."). 159 if s.r0 > 1 { 160 // save literal prefix, if any 161 // (We see at most one ungetr call while reading 162 // a literal, so make sure s.r0 remains in buf.) 163 if s.suf >= 0 { 164 s.lit = append(s.lit, s.buf[s.suf:s.r0]...) 
165 s.suf = 1 // == s.r0 after slide below 166 } 167 n := s.r0 - 1 168 copy(s.buf[:], s.buf[n:s.w]) 169 s.offs += n 170 s.r0 = 1 // eqv: s.r0 -= n 171 s.r -= n 172 s.w -= n 173 } 174 175 // read more data: try a limited number of times 176 for i := 100; i > 0; i-- { 177 n, err := s.src.Read(s.buf[s.w : len(s.buf)-1]) // -1 to leave space for sentinel 178 if n < 0 { 179 panic("negative read") // incorrect underlying io.Reader implementation 180 } 181 s.w += n 182 if n > 0 || err != nil { 183 s.buf[s.w] = utf8.RuneSelf // sentinel 184 if err != nil { 185 s.ioerr = err 186 } 187 return 188 } 189 } 190 191 s.ioerr = io.ErrNoProgress 192 } 193 194 func (s *source) startLit() { 195 s.suf = s.r0 196 s.lit = s.lit[:0] // reuse lit 197 } 198 199 func (s *source) stopLit() []byte { 200 lit := s.buf[s.suf:s.r] 201 if len(s.lit) > 0 { 202 lit = append(s.lit, lit...) 203 } 204 s.suf = -1 // no pending literal 205 return lit 206 }