github.com/tidwall/go@v0.0.0-20170415222209-6694a6888b7d/src/cmd/compile/internal/syntax/source.go (about)

     1  // Copyright 2016 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // This file implements source, a buffered rune reader
     6  // which is specialized for the needs of the Go scanner:
     7  // Contiguous sequences of runes (literals) are extracted
     8  // directly as []byte without the need to re-encode the
     9  // runes in UTF-8 (as would be necessary with bufio.Reader).
    10  //
    11  // This file is self-contained (go tool compile source.go
    12  // compiles) and thus could be made into its own package.
    13  
    14  package syntax
    15  
    16  import (
    17  	"io"
    18  	"unicode/utf8"
    19  )
    20  
    21  // starting points for line and column numbers
    22  const linebase = 1
    23  const colbase = 1
    24  
    25  // buf [...read...|...|...unread...|s|...free...]
    26  //         ^      ^   ^            ^
    27  //         |      |   |            |
    28  //        suf     r0  r            w
    29  
    30  type source struct {
    31  	src  io.Reader
    32  	errh func(line, pos uint, msg string)
    33  
    34  	// source buffer
    35  	buf         [4 << 10]byte
    36  	offs        int   // source offset of buf
    37  	r0, r, w    int   // previous/current read and write buf positions, excluding sentinel
    38  	line0, line uint  // previous/current line
    39  	col0, col   uint  // previous/current column (byte offsets from line start)
    40  	ioerr       error // pending io error
    41  
    42  	// literal buffer
    43  	lit []byte // literal prefix
    44  	suf int    // literal suffix; suf >= 0 means we are scanning a literal
    45  }
    46  
    47  // init initializes source to read from src and to report errors via errh.
    48  // errh must not be nil.
    49  func (s *source) init(src io.Reader, errh func(line, pos uint, msg string)) {
    50  	s.src = src
    51  	s.errh = errh
    52  
    53  	s.buf[0] = utf8.RuneSelf // terminate with sentinel
    54  	s.offs = 0
    55  	s.r0, s.r, s.w = 0, 0, 0
    56  	s.line0, s.line = 0, linebase
    57  	s.col0, s.col = 0, colbase
    58  	s.ioerr = nil
    59  
    60  	s.lit = s.lit[:0]
    61  	s.suf = -1
    62  }
    63  
    64  // ungetr ungets the most recently read rune.
    65  func (s *source) ungetr() {
    66  	s.r, s.line, s.col = s.r0, s.line0, s.col0
    67  }
    68  
    69  // ungetr2 is like ungetr but enables a 2nd ungetr.
    70  // It must not be called if one of the runes seen
    71  // was a newline.
    72  func (s *source) ungetr2() {
    73  	s.ungetr()
    74  	// line must not have changed
    75  	s.r0--
    76  	s.col0--
    77  }
    78  
    79  func (s *source) error(msg string) {
    80  	s.errh(s.line0, s.col0, msg)
    81  }
    82  
    83  // getr reads and returns the next rune.
    84  //
    85  // If a read or source encoding error occurs, getr
    86  // calls the error handler installed with init.
    87  // The handler must exist.
    88  //
    89  // The (line, col) position passed to the error handler
    90  // is always at the current source reading position.
    91  func (s *source) getr() rune {
    92  redo:
    93  	s.r0, s.line0, s.col0 = s.r, s.line, s.col
    94  
    95  	// We could avoid at least one test that is always taken in the
    96  	// for loop below by duplicating the common case code (ASCII)
    97  	// here since we always have at least the sentinel (utf8.RuneSelf)
    98  	// in the buffer. Measure and optimize if necessary.
    99  
   100  	// make sure we have at least one rune in buffer, or we are at EOF
   101  	for s.r+utf8.UTFMax > s.w && !utf8.FullRune(s.buf[s.r:s.w]) && s.ioerr == nil && s.w-s.r < len(s.buf) {
   102  		s.fill() // s.w-s.r < len(s.buf) => buffer is not full
   103  	}
   104  
   105  	// common case: ASCII and enough bytes
   106  	// (invariant: s.buf[s.w] == utf8.RuneSelf)
   107  	if b := s.buf[s.r]; b < utf8.RuneSelf {
   108  		s.r++
   109  		// TODO(gri) Optimization: Instead of adjusting s.col for each character,
   110  		// remember the line offset instead and then compute the offset as needed
   111  		// (which is less often).
   112  		s.col++
   113  		if b == 0 {
   114  			s.error("invalid NUL character")
   115  			goto redo
   116  		}
   117  		if b == '\n' {
   118  			s.line++
   119  			s.col = colbase
   120  		}
   121  		return rune(b)
   122  	}
   123  
   124  	// EOF
   125  	if s.r == s.w {
   126  		if s.ioerr != io.EOF {
   127  			s.error(s.ioerr.Error())
   128  		}
   129  		return -1
   130  	}
   131  
   132  	// uncommon case: not ASCII
   133  	r, w := utf8.DecodeRune(s.buf[s.r:s.w])
   134  	s.r += w
   135  	s.col += uint(w)
   136  
   137  	if r == utf8.RuneError && w == 1 {
   138  		s.error("invalid UTF-8 encoding")
   139  		goto redo
   140  	}
   141  
   142  	// BOM's are only allowed as the first character in a file
   143  	const BOM = 0xfeff
   144  	if r == BOM {
   145  		if s.r0 > 0 { // s.r0 is always > 0 after 1st character (fill will set it to 1)
   146  			s.error("invalid BOM in the middle of the file")
   147  		}
   148  		goto redo
   149  	}
   150  
   151  	return r
   152  }
   153  
   154  func (s *source) fill() {
   155  	// Slide unread bytes to beginning but preserve last read char
   156  	// (for one ungetr call) plus one extra byte (for a 2nd ungetr
   157  	// call, only for ".." character sequence and float literals
   158  	// starting with ".").
   159  	if s.r0 > 1 {
   160  		// save literal prefix, if any
   161  		// (We see at most one ungetr call while reading
   162  		// a literal, so make sure s.r0 remains in buf.)
   163  		if s.suf >= 0 {
   164  			s.lit = append(s.lit, s.buf[s.suf:s.r0]...)
   165  			s.suf = 1 // == s.r0 after slide below
   166  		}
   167  		s.offs += s.r0 - 1
   168  		r := s.r - s.r0 + 1 // last read char plus one byte
   169  		s.w = r + copy(s.buf[r:], s.buf[s.r:s.w])
   170  		s.r = r
   171  		s.r0 = 1
   172  	}
   173  
   174  	// read more data: try a limited number of times
   175  	for i := 100; i > 0; i-- {
   176  		n, err := s.src.Read(s.buf[s.w : len(s.buf)-1]) // -1 to leave space for sentinel
   177  		if n < 0 {
   178  			panic("negative read") // incorrect underlying io.Reader implementation
   179  		}
   180  		s.w += n
   181  		if n > 0 || err != nil {
   182  			s.buf[s.w] = utf8.RuneSelf // sentinel
   183  			if err != nil {
   184  				s.ioerr = err
   185  			}
   186  			return
   187  		}
   188  	}
   189  
   190  	s.ioerr = io.ErrNoProgress
   191  }
   192  
   193  func (s *source) startLit() {
   194  	s.suf = s.r0
   195  	s.lit = s.lit[:0] // reuse lit
   196  }
   197  
   198  func (s *source) stopLit() []byte {
   199  	lit := s.buf[s.suf:s.r]
   200  	if len(s.lit) > 0 {
   201  		lit = append(s.lit, lit...)
   202  	}
   203  	s.suf = -1 // no pending literal
   204  	return lit
   205  }