github.com/coinstack/gopher-lua@v0.0.0-20180626044619-c9c62d4ee45e/parse/lexer.go (about)

     1  package parse
     2  
     3  import (
     4  	"bufio"
     5  	"bytes"
     6  	"fmt"
     7  	"github.com/coinstack/gopher-lua/ast"
     8  	"io"
     9  	"reflect"
    10  	"strconv"
    11  	"strings"
    12  )
    13  
    14  const EOF = -1
    15  const whitespace1 = 1<<'\t' | 1<<' '
    16  const whitespace2 = 1<<'\t' | 1<<'\n' | 1<<'\r' | 1<<' '
    17  
// Error describes a problem found while scanning or parsing, together with
// the position it is attributed to.
type Error struct {
	Pos     ast.Position // location the error is reported at
	Message string       // description of what went wrong
	Token   string       // token text shown after "near" in the formatted message
}
    23  
    24  func (e *Error) Error() string {
    25  	pos := e.Pos
    26  	if pos.Line == EOF {
    27  		return fmt.Sprintf("%v at EOF:   %s\n", pos.Source, e.Message)
    28  	} else {
    29  		return fmt.Sprintf("%v line:%d(column:%d) near '%v':   %s\n", pos.Source, pos.Line, pos.Column, e.Token, e.Message)
    30  	}
    31  }
    32  
    33  func writeChar(buf *bytes.Buffer, c int) { buf.WriteByte(byte(c)) }
    34  
    35  func isDecimal(ch int) bool { return '0' <= ch && ch <= '9' }
    36  
    37  func isIdent(ch int, pos int) bool {
    38  	return ch == '_' || 'A' <= ch && ch <= 'Z' || 'a' <= ch && ch <= 'z' || isDecimal(ch) && pos > 0
    39  }
    40  
    41  func isDigit(ch int) bool {
    42  	return '0' <= ch && ch <= '9' || 'a' <= ch && ch <= 'f' || 'A' <= ch && ch <= 'F'
    43  }
    44  
// Scanner reads source text one byte at a time and tracks the position of
// the most recently consumed character.
type Scanner struct {
	Pos    ast.Position  // current position; Line is set to EOF once input is exhausted
	reader *bufio.Reader // buffered input the bytes are read from
}
    49  
    50  func NewScanner(reader io.Reader, source string) *Scanner {
    51  	return &Scanner{
    52  		Pos:    ast.Position{source, 1, 0},
    53  		reader: bufio.NewReaderSize(reader, 4096),
    54  	}
    55  }
    56  
    57  func (sc *Scanner) Error(tok string, msg string) *Error { return &Error{sc.Pos, msg, tok} }
    58  
    59  func (sc *Scanner) TokenError(tok ast.Token, msg string) *Error { return &Error{tok.Pos, msg, tok.Str} }
    60  
    61  func (sc *Scanner) readNext() int {
    62  	ch, err := sc.reader.ReadByte()
    63  	if err == io.EOF {
    64  		return EOF
    65  	}
    66  	return int(ch)
    67  }
    68  
    69  func (sc *Scanner) Newline(ch int) {
    70  	if ch < 0 {
    71  		return
    72  	}
    73  	sc.Pos.Line += 1
    74  	sc.Pos.Column = 0
    75  	next := sc.Peek()
    76  	if ch == '\n' && next == '\r' || ch == '\r' && next == '\n' {
    77  		sc.reader.ReadByte()
    78  	}
    79  }
    80  
    81  func (sc *Scanner) Next() int {
    82  	ch := sc.readNext()
    83  	switch ch {
    84  	case '\n', '\r':
    85  		sc.Newline(ch)
    86  		ch = int('\n')
    87  	case EOF:
    88  		sc.Pos.Line = EOF
    89  		sc.Pos.Column = 0
    90  	default:
    91  		sc.Pos.Column++
    92  	}
    93  	return ch
    94  }
    95  
    96  func (sc *Scanner) Peek() int {
    97  	ch := sc.readNext()
    98  	if ch != EOF {
    99  		sc.reader.UnreadByte()
   100  	}
   101  	return ch
   102  }
   103  
   104  func (sc *Scanner) skipWhiteSpace(whitespace int64) int {
   105  	ch := sc.Next()
   106  	for ; whitespace&(1<<uint(ch)) != 0; ch = sc.Next() {
   107  	}
   108  	return ch
   109  }
   110  
   111  func (sc *Scanner) skipComments(ch int) error {
   112  	// multiline comment
   113  	if sc.Peek() == '[' {
   114  		ch = sc.Next()
   115  		if sc.Peek() == '[' || sc.Peek() == '=' {
   116  			var buf bytes.Buffer
   117  			if err := sc.scanMultilineString(sc.Next(), &buf); err != nil {
   118  				return sc.Error(buf.String(), "invalid multiline comment")
   119  			}
   120  			return nil
   121  		}
   122  	}
   123  	for {
   124  		if ch == '\n' || ch == '\r' || ch < 0 {
   125  			break
   126  		}
   127  		ch = sc.Next()
   128  	}
   129  	return nil
   130  }
   131  
   132  func (sc *Scanner) scanIdent(ch int, buf *bytes.Buffer) error {
   133  	writeChar(buf, ch)
   134  	for isIdent(sc.Peek(), 1) {
   135  		writeChar(buf, sc.Next())
   136  	}
   137  	return nil
   138  }
   139  
   140  func (sc *Scanner) scanDecimal(ch int, buf *bytes.Buffer) error {
   141  	writeChar(buf, ch)
   142  	for isDecimal(sc.Peek()) {
   143  		writeChar(buf, sc.Next())
   144  	}
   145  	return nil
   146  }
   147  
   148  func (sc *Scanner) scanNumber(ch int, buf *bytes.Buffer) error {
   149  	if ch == '0' { // octal
   150  		if sc.Peek() == 'x' || sc.Peek() == 'X' {
   151  			writeChar(buf, ch)
   152  			writeChar(buf, sc.Next())
   153  			hasvalue := false
   154  			for isDigit(sc.Peek()) {
   155  				writeChar(buf, sc.Next())
   156  				hasvalue = true
   157  			}
   158  			if !hasvalue {
   159  				return sc.Error(buf.String(), "illegal hexadecimal number")
   160  			}
   161  			return nil
   162  		} else if sc.Peek() != '.' && isDecimal(sc.Peek()) {
   163  			ch = sc.Next()
   164  		}
   165  	}
   166  	sc.scanDecimal(ch, buf)
   167  	if sc.Peek() == '.' {
   168  		sc.scanDecimal(sc.Next(), buf)
   169  	}
   170  	if ch = sc.Peek(); ch == 'e' || ch == 'E' {
   171  		writeChar(buf, sc.Next())
   172  		if ch = sc.Peek(); ch == '-' || ch == '+' {
   173  			writeChar(buf, sc.Next())
   174  		}
   175  		sc.scanDecimal(sc.Next(), buf)
   176  	}
   177  
   178  	return nil
   179  }
   180  
   181  func (sc *Scanner) scanString(quote int, buf *bytes.Buffer) error {
   182  	ch := sc.Next()
   183  	for ch != quote {
   184  		if ch == '\n' || ch == '\r' || ch < 0 {
   185  			return sc.Error(buf.String(), "unterminated string")
   186  		}
   187  		if ch == '\\' {
   188  			if err := sc.scanEscape(ch, buf); err != nil {
   189  				return err
   190  			}
   191  		} else {
   192  			writeChar(buf, ch)
   193  		}
   194  		ch = sc.Next()
   195  	}
   196  	return nil
   197  }
   198  
   199  func (sc *Scanner) scanEscape(ch int, buf *bytes.Buffer) error {
   200  	ch = sc.Next()
   201  	switch ch {
   202  	case 'a':
   203  		buf.WriteByte('\a')
   204  	case 'b':
   205  		buf.WriteByte('\b')
   206  	case 'f':
   207  		buf.WriteByte('\f')
   208  	case 'n':
   209  		buf.WriteByte('\n')
   210  	case 'r':
   211  		buf.WriteByte('\r')
   212  	case 't':
   213  		buf.WriteByte('\t')
   214  	case 'v':
   215  		buf.WriteByte('\v')
   216  	case '\\':
   217  		buf.WriteByte('\\')
   218  	case '"':
   219  		buf.WriteByte('"')
   220  	case '\'':
   221  		buf.WriteByte('\'')
   222  	case '\n':
   223  		buf.WriteByte('\n')
   224  	case '\r':
   225  		buf.WriteByte('\n')
   226  		sc.Newline('\r')
   227  	default:
   228  		if '0' <= ch && ch <= '9' {
   229  			bytes := []byte{byte(ch)}
   230  			for i := 0; i < 2 && isDecimal(sc.Peek()); i++ {
   231  				bytes = append(bytes, byte(sc.Next()))
   232  			}
   233  			val, _ := strconv.ParseInt(string(bytes), 10, 32)
   234  			writeChar(buf, int(val))
   235  		} else {
   236  			buf.WriteByte('\\')
   237  			writeChar(buf, ch)
   238  			return sc.Error(buf.String(), "Invalid escape sequence")
   239  		}
   240  	}
   241  	return nil
   242  }
   243  
   244  func (sc *Scanner) countSep(ch int) (int, int) {
   245  	count := 0
   246  	for ; ch == '='; count = count + 1 {
   247  		ch = sc.Next()
   248  	}
   249  	return count, ch
   250  }
   251  
   252  func (sc *Scanner) scanMultilineString(ch int, buf *bytes.Buffer) error {
   253  	var count1, count2 int
   254  	count1, ch = sc.countSep(ch)
   255  	if ch != '[' {
   256  		return sc.Error(string(ch), "invalid multiline string")
   257  	}
   258  	ch = sc.Next()
   259  	if ch == '\n' || ch == '\r' {
   260  		ch = sc.Next()
   261  	}
   262  	for {
   263  		if ch < 0 {
   264  			return sc.Error(buf.String(), "unterminated multiline string")
   265  		} else if ch == ']' {
   266  			count2, ch = sc.countSep(sc.Next())
   267  			if count1 == count2 && ch == ']' {
   268  				goto finally
   269  			}
   270  			buf.WriteByte(']')
   271  			buf.WriteString(strings.Repeat("=", count2))
   272  			continue
   273  		}
   274  		writeChar(buf, ch)
   275  		ch = sc.Next()
   276  	}
   277  
   278  finally:
   279  	return nil
   280  }
   281  
   282  var reservedWords = map[string]int{
   283  	"and": TAnd, "break": TBreak, "do": TDo, "else": TElse, "elseif": TElseIf,
   284  	"end": TEnd, "false": TFalse, "for": TFor, "function": TFunction,
   285  	"if": TIf, "in": TIn, "local": TLocal, "nil": TNil, "not": TNot, "or": TOr,
   286  	"return": TReturn, "repeat": TRepeat, "then": TThen, "true": TTrue,
   287  	"until": TUntil, "while": TWhile}
   288  
   289  func (sc *Scanner) Scan(lexer *Lexer) (ast.Token, error) {
   290  redo:
   291  	var err error
   292  	tok := ast.Token{}
   293  	newline := false
   294  
   295  	ch := sc.skipWhiteSpace(whitespace1)
   296  	if ch == '\n' || ch == '\r' {
   297  		newline = true
   298  		ch = sc.skipWhiteSpace(whitespace2)
   299  	}
   300  
   301  	if ch == '(' && lexer.PrevTokenType == ')' {
   302  		lexer.PNewLine = newline
   303  	} else {
   304  		lexer.PNewLine = false
   305  	}
   306  
   307  	var _buf bytes.Buffer
   308  	buf := &_buf
   309  	tok.Pos = sc.Pos
   310  
   311  	switch {
   312  	case isIdent(ch, 0):
   313  		tok.Type = TIdent
   314  		err = sc.scanIdent(ch, buf)
   315  		tok.Str = buf.String()
   316  		if err != nil {
   317  			goto finally
   318  		}
   319  		if typ, ok := reservedWords[tok.Str]; ok {
   320  			tok.Type = typ
   321  		}
   322  	case isDecimal(ch):
   323  		tok.Type = TNumber
   324  		err = sc.scanNumber(ch, buf)
   325  		tok.Str = buf.String()
   326  	default:
   327  		switch ch {
   328  		case EOF:
   329  			tok.Type = EOF
   330  		case '-':
   331  			if sc.Peek() == '-' {
   332  				err = sc.skipComments(sc.Next())
   333  				if err != nil {
   334  					goto finally
   335  				}
   336  				goto redo
   337  			} else {
   338  				tok.Type = ch
   339  				tok.Str = string(ch)
   340  			}
   341  		case '"', '\'':
   342  			tok.Type = TString
   343  			err = sc.scanString(ch, buf)
   344  			tok.Str = buf.String()
   345  		case '[':
   346  			if c := sc.Peek(); c == '[' || c == '=' {
   347  				tok.Type = TString
   348  				err = sc.scanMultilineString(sc.Next(), buf)
   349  				tok.Str = buf.String()
   350  			} else {
   351  				tok.Type = ch
   352  				tok.Str = string(ch)
   353  			}
   354  		case '=':
   355  			if sc.Peek() == '=' {
   356  				tok.Type = TEqeq
   357  				tok.Str = "=="
   358  				sc.Next()
   359  			} else {
   360  				tok.Type = ch
   361  				tok.Str = string(ch)
   362  			}
   363  		case '~':
   364  			if sc.Peek() == '=' {
   365  				tok.Type = TNeq
   366  				tok.Str = "~="
   367  				sc.Next()
   368  			} else {
   369  				err = sc.Error("~", "Invalid '~' token")
   370  			}
   371  		case '<':
   372  			if sc.Peek() == '=' {
   373  				tok.Type = TLte
   374  				tok.Str = "<="
   375  				sc.Next()
   376  			} else {
   377  				tok.Type = ch
   378  				tok.Str = string(ch)
   379  			}
   380  		case '>':
   381  			if sc.Peek() == '=' {
   382  				tok.Type = TGte
   383  				tok.Str = ">="
   384  				sc.Next()
   385  			} else {
   386  				tok.Type = ch
   387  				tok.Str = string(ch)
   388  			}
   389  		case '.':
   390  			ch2 := sc.Peek()
   391  			switch {
   392  			case isDecimal(ch2):
   393  				tok.Type = TNumber
   394  				err = sc.scanNumber(ch, buf)
   395  				tok.Str = buf.String()
   396  			case ch2 == '.':
   397  				writeChar(buf, ch)
   398  				writeChar(buf, sc.Next())
   399  				if sc.Peek() == '.' {
   400  					writeChar(buf, sc.Next())
   401  					tok.Type = T3Comma
   402  				} else {
   403  					tok.Type = T2Comma
   404  				}
   405  			default:
   406  				tok.Type = '.'
   407  			}
   408  			tok.Str = buf.String()
   409  		case '+', '*', '/', '%', '^', '#', '(', ')', '{', '}', ']', ';', ':', ',':
   410  			tok.Type = ch
   411  			tok.Str = string(ch)
   412  		default:
   413  			writeChar(buf, ch)
   414  			err = sc.Error(buf.String(), "Invalid token")
   415  			goto finally
   416  		}
   417  	}
   418  
   419  finally:
   420  	tok.Name = TokenName(int(tok.Type))
   421  	return tok, err
   422  }
   423  
   424  // yacc interface {{{
   425  
// Lexer couples a Scanner with the state shared with the generated parser
// (it is the value handed to yyParse).
type Lexer struct {
	scanner       *Scanner   // source of tokens
	Stmts         []ast.Stmt // parse result; Parse returns it after yyParse (presumably filled in by the generated parser)
	PNewLine      bool       // set by Scan: a line break separates a ')' token from the following '('
	Token         ast.Token  // most recent token handed out by Lex
	PrevTokenType int        // type of the token handed out before the one being scanned
}
   433  
   434  func (lx *Lexer) Lex(lval *yySymType) int {
   435  	lx.PrevTokenType = lx.Token.Type
   436  	tok, err := lx.scanner.Scan(lx)
   437  	if err != nil {
   438  		panic(err)
   439  	}
   440  	if tok.Type < 0 {
   441  		return 0
   442  	}
   443  	lval.token = tok
   444  	lx.Token = tok
   445  	return int(tok.Type)
   446  }
   447  
   448  func (lx *Lexer) Error(message string) {
   449  	panic(lx.scanner.Error(lx.Token.Str, message))
   450  }
   451  
   452  func (lx *Lexer) TokenError(tok ast.Token, message string) {
   453  	panic(lx.scanner.TokenError(tok, message))
   454  }
   455  
   456  func Parse(reader io.Reader, name string) (chunk []ast.Stmt, err error) {
   457  	lexer := &Lexer{NewScanner(reader, name), nil, false, ast.Token{Str: ""}, TNil}
   458  	chunk = nil
   459  	defer func() {
   460  		if e := recover(); e != nil {
   461  			err, _ = e.(error)
   462  		}
   463  	}()
   464  	yyParse(lexer)
   465  	chunk = lexer.Stmts
   466  	return
   467  }
   468  
   469  // }}}
   470  
   471  // Dump {{{
   472  
   473  func isInlineDumpNode(rv reflect.Value) bool {
   474  	switch rv.Kind() {
   475  	case reflect.Struct, reflect.Slice, reflect.Interface, reflect.Ptr:
   476  		return false
   477  	default:
   478  		return true
   479  	}
   480  }
   481  
   482  func dump(node interface{}, level int, s string) string {
   483  	rt := reflect.TypeOf(node)
   484  	if fmt.Sprint(rt) == "<nil>" {
   485  		return strings.Repeat(s, level) + "<nil>"
   486  	}
   487  
   488  	rv := reflect.ValueOf(node)
   489  	buf := []string{}
   490  	switch rt.Kind() {
   491  	case reflect.Slice:
   492  		if rv.Len() == 0 {
   493  			return strings.Repeat(s, level) + "<empty>"
   494  		}
   495  		for i := 0; i < rv.Len(); i++ {
   496  			buf = append(buf, dump(rv.Index(i).Interface(), level, s))
   497  		}
   498  	case reflect.Ptr:
   499  		vt := rv.Elem()
   500  		tt := rt.Elem()
   501  		indicies := []int{}
   502  		for i := 0; i < tt.NumField(); i++ {
   503  			if strings.Index(tt.Field(i).Name, "Base") > -1 {
   504  				continue
   505  			}
   506  			indicies = append(indicies, i)
   507  		}
   508  		switch {
   509  		case len(indicies) == 0:
   510  			return strings.Repeat(s, level) + "<empty>"
   511  		case len(indicies) == 1 && isInlineDumpNode(vt.Field(indicies[0])):
   512  			for _, i := range indicies {
   513  				buf = append(buf, strings.Repeat(s, level)+"- Node$"+tt.Name()+": "+dump(vt.Field(i).Interface(), 0, s))
   514  			}
   515  		default:
   516  			buf = append(buf, strings.Repeat(s, level)+"- Node$"+tt.Name())
   517  			for _, i := range indicies {
   518  				if isInlineDumpNode(vt.Field(i)) {
   519  					inf := dump(vt.Field(i).Interface(), 0, s)
   520  					buf = append(buf, strings.Repeat(s, level+1)+tt.Field(i).Name+": "+inf)
   521  				} else {
   522  					buf = append(buf, strings.Repeat(s, level+1)+tt.Field(i).Name+": ")
   523  					buf = append(buf, dump(vt.Field(i).Interface(), level+2, s))
   524  				}
   525  			}
   526  		}
   527  	default:
   528  		buf = append(buf, strings.Repeat(s, level)+fmt.Sprint(node))
   529  	}
   530  	return strings.Join(buf, "\n")
   531  }
   532  
   533  func Dump(chunk []ast.Stmt) string {
   534  	return dump(chunk, 0, "   ")
   535  }
   536  
// }}}