github.com/gagliardetto/golang-go@v0.0.0-20201020153340-53909ea70814/cmd/compile/internal/syntax/source.go (about)

     1  // Copyright 2016 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // This file implements source, a buffered rune reader
     6  // which is specialized for the needs of the Go scanner:
     7  // Contiguous sequences of runes (literals) are extracted
     8  // directly as []byte without the need to re-encode the
     9  // runes in UTF-8 (as would be necessary with bufio.Reader).
    10  //
    11  // This file is self-contained (go tool compile source.go
    12  // compiles) and thus could be made into its own package.
    13  
    14  package syntax
    15  
    16  import (
    17  	"io"
    18  	"unicode/utf8"
    19  )
    20  
    21  // starting points for line and column numbers
    22  const linebase = 1
    23  const colbase = 1
    24  
    25  // max. number of bytes to unread
    26  const maxunread = 10
    27  
    28  // buf [...read...|...|...unread...|s|...free...]
    29  //         ^      ^   ^            ^
    30  //         |      |   |            |
    31  //        suf     r0  r            w
    32  
    33  type source struct {
    34  	src  io.Reader
    35  	errh func(line, pos uint, msg string)
    36  
    37  	// source buffer
    38  	buf         [4 << 10]byte
    39  	r0, r, w    int   // previous/current read and write buf positions, excluding sentinel
    40  	line0, line uint  // previous/current line
    41  	col0, col   uint  // previous/current column (byte offsets from line start)
    42  	ioerr       error // pending io error
    43  
    44  	// literal buffer
    45  	lit []byte // literal prefix
    46  	suf int    // literal suffix; suf >= 0 means we are scanning a literal
    47  }
    48  
    49  // init initializes source to read from src and to report errors via errh.
    50  // errh must not be nil.
    51  func (s *source) init(src io.Reader, errh func(line, pos uint, msg string)) {
    52  	s.src = src
    53  	s.errh = errh
    54  
    55  	s.buf[0] = utf8.RuneSelf // terminate with sentinel
    56  	s.r0, s.r, s.w = 0, 0, 0
    57  	s.line0, s.line = 0, linebase
    58  	s.col0, s.col = 0, colbase
    59  	s.ioerr = nil
    60  
    61  	s.lit = s.lit[:0]
    62  	s.suf = -1
    63  }
    64  
    65  // ungetr sets the reading position to a previous reading
    66  // position, usually the one of the most recently read
    67  // rune, but possibly earlier (see unread below).
    68  func (s *source) ungetr() {
    69  	s.r, s.line, s.col = s.r0, s.line0, s.col0
    70  }
    71  
    72  // unread moves the previous reading position to a position
    73  // that is n bytes earlier in the source. The next ungetr
    74  // call will set the reading position to that moved position.
    75  // The "unread" runes must be single byte and not contain any
    76  // newlines; and 0 <= n <= maxunread must hold.
    77  func (s *source) unread(n int) {
    78  	s.r0 -= n
    79  	s.col0 -= uint(n)
    80  }
    81  
    82  func (s *source) error(msg string) {
    83  	s.errh(s.line0, s.col0, msg)
    84  }
    85  
    86  // getr reads and returns the next rune.
    87  //
    88  // If a read or source encoding error occurs, getr
    89  // calls the error handler installed with init.
    90  // The handler must exist.
    91  //
    92  // The (line, col) position passed to the error handler
    93  // is always at the current source reading position.
    94  func (s *source) getr() rune {
    95  redo:
    96  	s.r0, s.line0, s.col0 = s.r, s.line, s.col
    97  
    98  	// We could avoid at least one test that is always taken in the
    99  	// for loop below by duplicating the common case code (ASCII)
   100  	// here since we always have at least the sentinel (utf8.RuneSelf)
   101  	// in the buffer. Measure and optimize if necessary.
   102  
   103  	// make sure we have at least one rune in buffer, or we are at EOF
   104  	for s.r+utf8.UTFMax > s.w && !utf8.FullRune(s.buf[s.r:s.w]) && s.ioerr == nil && s.w-s.r < len(s.buf) {
   105  		s.fill() // s.w-s.r < len(s.buf) => buffer is not full
   106  	}
   107  
   108  	// common case: ASCII and enough bytes
   109  	// (invariant: s.buf[s.w] == utf8.RuneSelf)
   110  	if b := s.buf[s.r]; b < utf8.RuneSelf {
   111  		s.r++
   112  		// TODO(gri) Optimization: Instead of adjusting s.col for each character,
   113  		// remember the line offset instead and then compute the offset as needed
   114  		// (which is less often).
   115  		s.col++
   116  		if b == 0 {
   117  			s.error("invalid NUL character")
   118  			goto redo
   119  		}
   120  		if b == '\n' {
   121  			s.line++
   122  			s.col = colbase
   123  		}
   124  		return rune(b)
   125  	}
   126  
   127  	// EOF
   128  	if s.r == s.w {
   129  		if s.ioerr != io.EOF {
   130  			// ensure we never start with a '/' (e.g., rooted path) in the error message
   131  			s.error("I/O error: " + s.ioerr.Error())
   132  		}
   133  		return -1
   134  	}
   135  
   136  	// uncommon case: not ASCII
   137  	r, w := utf8.DecodeRune(s.buf[s.r:s.w])
   138  	s.r += w
   139  	s.col += uint(w)
   140  
   141  	if r == utf8.RuneError && w == 1 {
   142  		s.error("invalid UTF-8 encoding")
   143  		goto redo
   144  	}
   145  
   146  	// BOM's are only allowed as the first character in a file
   147  	const BOM = 0xfeff
   148  	if r == BOM {
   149  		if s.r0 > 0 { // s.r0 is always > 0 after 1st character (fill will set it to maxunread)
   150  			s.error("invalid BOM in the middle of the file")
   151  		}
   152  		goto redo
   153  	}
   154  
   155  	return r
   156  }
   157  
   158  func (s *source) fill() {
   159  	// Slide unread bytes to beginning but preserve last read char
   160  	// (for one ungetr call) plus maxunread extra bytes (for one
   161  	// unread call).
   162  	if s.r0 > maxunread {
   163  		n := s.r0 - maxunread // number of bytes to slide down
   164  		// save literal prefix, if any
   165  		// (make sure we keep maxunread bytes and the last
   166  		// read char in the buffer)
   167  		if s.suf >= 0 {
   168  			// we have a literal
   169  			if s.suf < n {
   170  				// save literal prefix
   171  				s.lit = append(s.lit, s.buf[s.suf:n]...)
   172  				s.suf = 0
   173  			} else {
   174  				s.suf -= n
   175  			}
   176  		}
   177  		copy(s.buf[:], s.buf[n:s.w])
   178  		s.r0 = maxunread // eqv: s.r0 -= n
   179  		s.r -= n
   180  		s.w -= n
   181  	}
   182  
   183  	// read more data: try a limited number of times
   184  	for i := 100; i > 0; i-- {
   185  		n, err := s.src.Read(s.buf[s.w : len(s.buf)-1]) // -1 to leave space for sentinel
   186  		if n < 0 {
   187  			panic("negative read") // incorrect underlying io.Reader implementation
   188  		}
   189  		s.w += n
   190  		if n > 0 || err != nil {
   191  			s.buf[s.w] = utf8.RuneSelf // sentinel
   192  			if err != nil {
   193  				s.ioerr = err
   194  			}
   195  			return
   196  		}
   197  	}
   198  
   199  	s.buf[s.w] = utf8.RuneSelf // sentinel
   200  	s.ioerr = io.ErrNoProgress
   201  }
   202  
   203  func (s *source) startLit() {
   204  	s.suf = s.r0
   205  	s.lit = s.lit[:0] // reuse lit
   206  }
   207  
   208  func (s *source) stopLit() []byte {
   209  	lit := s.buf[s.suf:s.r]
   210  	if len(s.lit) > 0 {
   211  		lit = append(s.lit, lit...)
   212  	}
   213  	s.killLit()
   214  	return lit
   215  }
   216  
   217  func (s *source) killLit() {
   218  	s.suf = -1 // no pending literal
   219  }