// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// This file implements source, a buffered rune reader
// which is specialized for the needs of the Go scanner:
// Contiguous sequences of runes (literals) are extracted
// directly as []byte without the need to re-encode the
// runes in UTF-8 (as would be necessary with bufio.Reader).
//
// This file is self-contained (go tool compile source.go
// compiles) and thus could be made into its own package.

package syntax

import (
	"io"
	"unicode/utf8"
)

// Buffer layout. The byte marked s is the sentinel (utf8.RuneSelf)
// stored at buf[w], directly after the last valid byte:
//
// buf [...read...|...|...unread...|s|...free...]
//         ^      ^   ^            ^
//         |      |   |            |
//        suf     r0  r            w

// source is a buffered rune reader over src. It tracks the current
// and the previous read position (buffer index, line, column) so that
// the most recently read rune can be unread, and it can collect the
// bytes of a literal that is being scanned.
type source struct {
	src  io.Reader                         // underlying input
	errh func(line, pos uint, msg string) // error handler; see init

	// source buffer
	buf         [4 << 10]byte
	offs        int   // source offset of buf
	r0, r, w    int   // previous/current read and write buf positions, excluding sentinel
	line0, line uint  // previous/current line
	col0, col   uint  // previous/current column (byte offsets from line start)
	ioerr       error // pending io error

	// literal buffer
	lit []byte // literal prefix
	suf int    // literal suffix; suf >= 0 means we are scanning a literal
}

// init initializes source to read from src and to report errors via errh.
// errh must not be nil.
//
// init resets all positions: reading starts at line 1, column 0, with an
// empty buffer and no literal in progress. An existing literal buffer is
// truncated rather than released.
func (s *source) init(src io.Reader, errh func(line, pos uint, msg string)) {
	s.src = src
	s.errh = errh

	// The buffer is empty (w == 0), so the sentinel goes at index 0.
	s.buf[0] = utf8.RuneSelf // terminate with sentinel
	s.offs = 0
	s.r0, s.r, s.w = 0, 0, 0
	s.line0, s.line = 1, 1
	s.col0, s.col = 0, 0
	s.ioerr = nil

	s.lit = s.lit[:0]
	s.suf = -1 // no pending literal
}

// ungetr ungets the most recently read rune.
61 func (s *source) ungetr() { 62 s.r, s.line, s.col = s.r0, s.line0, s.col0 63 } 64 65 // ungetr2 is like ungetr but enables a 2nd ungetr. 66 // It must not be called if one of the runes seen 67 // was a newline. 68 func (s *source) ungetr2() { 69 s.ungetr() 70 // line must not have changed 71 s.r0-- 72 s.col0-- 73 } 74 75 func (s *source) error(msg string) { 76 s.errh(s.line0, s.col0, msg) 77 } 78 79 // getr reads and returns the next rune. 80 // 81 // If a read or source encoding error occurs, getr 82 // calls the error handler installed with init. 83 // The handler must exist. 84 // 85 // The (line, col) position passed to the error handler 86 // is always at the current source reading position. 87 func (s *source) getr() rune { 88 redo: 89 s.r0, s.line0, s.col0 = s.r, s.line, s.col 90 91 // We could avoid at least one test that is always taken in the 92 // for loop below by duplicating the common case code (ASCII) 93 // here since we always have at least the sentinel (utf8.RuneSelf) 94 // in the buffer. Measure and optimize if necessary. 95 96 // make sure we have at least one rune in buffer, or we are at EOF 97 for s.r+utf8.UTFMax > s.w && !utf8.FullRune(s.buf[s.r:s.w]) && s.ioerr == nil && s.w-s.r < len(s.buf) { 98 s.fill() // s.w-s.r < len(s.buf) => buffer is not full 99 } 100 101 // common case: ASCII and enough bytes 102 // (invariant: s.buf[s.w] == utf8.RuneSelf) 103 if b := s.buf[s.r]; b < utf8.RuneSelf { 104 s.r++ 105 // TODO(gri) Optimization: Instead of adjusting s.col for each character, 106 // remember the line offset instead and then compute the offset as needed 107 // (which is less often). 
108 s.col++ 109 if b == 0 { 110 s.error("invalid NUL character") 111 goto redo 112 } 113 if b == '\n' { 114 s.line++ 115 s.col = 0 116 } 117 return rune(b) 118 } 119 120 // EOF 121 if s.r == s.w { 122 if s.ioerr != io.EOF { 123 s.error(s.ioerr.Error()) 124 } 125 return -1 126 } 127 128 // uncommon case: not ASCII 129 r, w := utf8.DecodeRune(s.buf[s.r:s.w]) 130 s.r += w 131 s.col += uint(w) 132 133 if r == utf8.RuneError && w == 1 { 134 s.error("invalid UTF-8 encoding") 135 goto redo 136 } 137 138 // BOM's are only allowed as the first character in a file 139 const BOM = 0xfeff 140 if r == BOM { 141 if s.r0 > 0 { // s.r0 is always > 0 after 1st character (fill will set it to 1) 142 s.error("invalid BOM in the middle of the file") 143 } 144 goto redo 145 } 146 147 return r 148 } 149 150 func (s *source) fill() { 151 // Slide unread bytes to beginning but preserve last read char 152 // (for one ungetr call) plus one extra byte (for a 2nd ungetr 153 // call, only for ".." character sequence and float literals 154 // starting with "."). 155 if s.r0 > 1 { 156 // save literal prefix, if any 157 // (We see at most one ungetr call while reading 158 // a literal, so make sure s.r0 remains in buf.) 159 if s.suf >= 0 { 160 s.lit = append(s.lit, s.buf[s.suf:s.r0]...) 
161 s.suf = 1 // == s.r0 after slide below 162 } 163 s.offs += s.r0 - 1 164 r := s.r - s.r0 + 1 // last read char plus one byte 165 s.w = r + copy(s.buf[r:], s.buf[s.r:s.w]) 166 s.r = r 167 s.r0 = 1 168 } 169 170 // read more data: try a limited number of times 171 for i := 100; i > 0; i-- { 172 n, err := s.src.Read(s.buf[s.w : len(s.buf)-1]) // -1 to leave space for sentinel 173 if n < 0 { 174 panic("negative read") // incorrect underlying io.Reader implementation 175 } 176 s.w += n 177 if n > 0 || err != nil { 178 s.buf[s.w] = utf8.RuneSelf // sentinel 179 if err != nil { 180 s.ioerr = err 181 } 182 return 183 } 184 } 185 186 s.ioerr = io.ErrNoProgress 187 } 188 189 func (s *source) startLit() { 190 s.suf = s.r0 191 s.lit = s.lit[:0] // reuse lit 192 } 193 194 func (s *source) stopLit() []byte { 195 lit := s.buf[s.suf:s.r] 196 if len(s.lit) > 0 { 197 lit = append(s.lit, lit...) 198 } 199 s.suf = -1 // no pending literal 200 return lit 201 }