github.com/NeowayLabs/nash@v0.2.2-0.20200127205349-a227041ffd50/scanner/lex.go

// Package scanner is the lexical parser.
package scanner

import (
	"fmt"
	"strings"
	"unicode"
	"unicode/utf8"

	"github.com/madlambda/nash/token"
)

type (
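	// Token is a single lexical item handed to the parser: its type,
	// its position in the source (token.FileInfo) and its literal value.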
	Token struct {
		typ token.Token
		token.FileInfo

		val string
	}

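	// stateFn is one state of the scanner, expressed as a function that
	// returns the next state; run drives the machine until a state
	// returns nil. A hypothetical additional state would have the same
	// shape:
	//
	//	func lexFoo(l *Lexer) stateFn {
	//		// consume runes with l.next()/l.backup(), emit tokens...
	//		return lexStart
	//	}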
	stateFn func(*Lexer) stateFn

	// Lexer holds the state of the scanner
	Lexer struct {
		name  string // identifies the source; used only in error reports
		input string // the string being scanned
		start int    // start position of current token

		width  int        // width of last rune read
		Tokens chan Token // channel of scanned tokens

		// file positions
		pos         int // file offset
		line        int // current line position
		lineStart   int // line of the symbol's start
		prevColumn  int // previous column value
		column      int // current column position
		columnStart int // column of the symbol's start

		openParens int

		addSemicolon bool
	}
)

const (
	eof = -1
)

func (i Token) Type() token.Token { return i.typ }
func (i Token) Value() string     { return i.val }

func (i Token) String() string {
	switch i.typ {
	case token.Illegal:
		return "ERROR: " + i.val
	case token.EOF:
		return "EOF"
	}

	if len(i.typ.String()) > 10 {
		return fmt.Sprintf("%s...", i.typ.String()[0:10])
	}

	return i.typ.String()
}

// run lexes the input by executing state functions until the state is nil
func (l *Lexer) run() {
	l.line, l.lineStart, l.column, l.columnStart = 1, 1, 0, 0

	for state := lexStart; state != nil; {
		state = state(l)
	}

	l.emit(token.EOF)
	close(l.Tokens) // No more tokens will be delivered
}

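// emitVal sends a token whose value is given explicitly; this is how the
// decoded quoted strings and the implicitly inserted semicolons are
// delivered. emit (below) instead slices the value directly from the input.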
func (l *Lexer) emitVal(t token.Token, val string, line, column int) {
	l.Tokens <- Token{
		FileInfo: token.NewFileInfo(line, column),

		typ: t,
		val: val,
	}

	l.start = l.pos
	l.lineStart = l.line
	l.columnStart = l.column
}

func (l *Lexer) emit(t token.Token) {
	l.Tokens <- Token{
		FileInfo: token.NewFileInfo(l.lineStart, l.columnStart),

		typ: t,
		val: l.input[l.start:l.pos],
	}

	l.start = l.pos
	l.lineStart = l.line
	l.columnStart = l.column
}

// peek returns but does not consume the next rune from input
func (l *Lexer) peek() rune {
	r := l.next()
	l.backup()
	return r
}

// next consumes the next rune from input
func (l *Lexer) next() rune {
	var r rune

	if l.pos >= len(l.input) {
		l.width = 0
		return eof
	}

	r, l.width = utf8.DecodeRuneInString(l.input[l.pos:])

	l.pos += l.width
	l.prevColumn = l.column

	if r == '\n' {
		l.line++
		l.column = 0
	} else {
		l.column++
	}

	return r
}

// ignore skips over the pending input before this point
func (l *Lexer) ignore() {
	l.start = l.pos
	l.lineStart = l.line
	l.columnStart = l.column
}

// backup steps back one rune. It may only be called once per call of next,
// since only the previous width and column are remembered.
func (l *Lexer) backup() {
	l.pos -= l.width

	r, _ := utf8.DecodeRuneInString(l.input[l.pos:])

	l.column = l.prevColumn

	if r == '\n' {
		l.line--
	}
}

// acceptRun consumes a run of runes from the valid set
func (l *Lexer) acceptRun(valid string) {
	for strings.ContainsRune(valid, l.next()) {
	}

	l.backup()
}
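
// For example, lexStart absorbs the digits of a number literal with:
//
//	l.acceptRun("0123456789")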

// errorf emits an error token and terminates the state machine
func (l *Lexer) errorf(format string, args ...interface{}) stateFn {
	fname := l.name

	if fname == "" {
		fname = "<none>"
	}

	errMsg := fmt.Sprintf(format, args...)

	l.Tokens <- Token{
		FileInfo: token.NewFileInfo(l.line, l.column),

		typ: token.Illegal,
		val: fmt.Sprintf("%s:%d:%d: %s", fname, l.line, l.column, errMsg),
	}

	l.start = len(l.input)
	l.lineStart = l.line
	l.columnStart = l.column
	l.pos = l.start

	return nil // finish the state machine
}

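// Lex creates a Lexer for input, using name only in error reports, and
// starts the state machine in its own goroutine. Callers drain l.Tokens
// until it is closed, which happens after the final token.EOF. A minimal
// usage sketch:
//
//	l := Lex("<stdin>", `echo hello`)
//	for tok := range l.Tokens {
//		fmt.Println(tok.Type(), tok.Value())
//	}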
func Lex(name, input string) *Lexer {
	l := &Lexer{
		name:   name,
		input:  input,
		Tokens: make(chan Token),
	}

	go l.run() // concurrently run state machine

	return l
}

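// lexStart is the initial and central state: it dispatches on the next rune
// to the more specific states below. It also implements implicit semicolon
// insertion: a newline emits a Semicolon token only when the previous token
// allows one (addSemicolon) and no parentheses are open.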
func lexStart(l *Lexer) stateFn {
	r := l.next()

	switch {
	case r == eof:
		if l.addSemicolon {
			l.emitVal(token.Semicolon, ";", l.line, l.column)
		}

		l.addSemicolon = false

		return nil
	case '0' <= r && r <= '9':
		digits := "0123456789"

		l.acceptRun(digits)

		next := l.peek()

		// >[2=]
		// cmd[2]
		if next == '=' || next == ']' || (!isIdentifier(next) && !isArgument(next)) {
			l.emit(token.Number)
		} else if isIdentifier(next) {
			absorbIdentifier(l)

			if isArgument(l.peek()) {
				absorbArgument(l)

				l.emit(token.Arg)
			} else {
				l.emit(token.Ident)
			}
		} else if isArgument(next) {
			absorbArgument(l)
			l.emit(token.Arg)
		}

		return lexStart
	case r == ';':
		l.emit(token.Semicolon)
		return lexStart
	case isSpace(r):
		return lexSpace

	case isEndOfLine(r):
		l.ignore()

		if l.addSemicolon && l.openParens == 0 {
			l.emitVal(token.Semicolon, ";", l.line, l.column)
		}

		l.addSemicolon = false

		return lexStart
	case r == '"':
		l.ignore()

		return lexQuote
	case r == '#':
		return lexComment
	case r == '+':
		l.emit(token.Plus)
		return lexStart
	case r == '>':
		l.emit(token.Gt)
		return lexStart
	case r == '|':
		l.emit(token.Pipe)
		return lexStart
	case r == '$':
		r = l.next()

		if !isIdentifier(r) {
			return l.errorf("Expected identifier, but found %q", r)
		}

		absorbIdentifier(l)

		next := l.peek()
		if next != eof && !isSpace(next) &&
			!isEndOfLine(next) && next != ';' &&
			next != ')' && next != ',' && next != '+' &&
			next != '[' && next != ']' && next != '(' &&
			next != '.' {
			return l.errorf("Unrecognized character in action: %#U", next)
		}

		l.emit(token.Variable)
		return lexStart
	case r == '=':
		if l.peek() == '=' {
			l.next()
			l.emit(token.Equal)
		} else {
			l.emit(token.Assign)
		}

		return lexStart
	case r == '!':
		if l.peek() == '=' {
			l.next()
			l.emit(token.NotEqual)
		} else {
			l.emit(token.Arg)
		}

		return lexStart
	case r == '<':
		if l.peek() == '=' {
			l.next()
			l.emit(token.AssignCmd)
		} else {
			l.emit(token.Lt)
		}

		return lexStart
	case r == '{':
		l.addSemicolon = false
		l.emit(token.LBrace)
		return lexStart
	case r == '}':
		l.emit(token.RBrace)
		l.addSemicolon = false
		return lexStart
	case r == '[':
		l.emit(token.LBrack)
		return lexStart
	case r == ']':
		l.emit(token.RBrack)
		return lexStart
	case r == '(':
		l.openParens++

		l.emit(token.LParen)
		l.addSemicolon = false
		return lexStart
	case r == ')':
		l.openParens--

		l.emit(token.RParen)
		l.addSemicolon = true
		return lexStart
	case r == ',':
		l.emit(token.Comma)
		return lexStart
	case r == '.':
		dotLine, dotColumn := l.line, l.column
		next := l.peek()
		if next == '.' {
			l.next()
			next = l.peek()
			if next == '.' {
				l.next()
				l.emitVal(token.Dotdotdot, "...", dotLine, dotColumn)
				return lexStart
			}
		}
		absorbArgument(l)
		l.emit(token.Arg)
		if next == eof && l.openParens > 0 {
			l.addSemicolon = false
		} else {
			l.addSemicolon = true
		}
		return lexStart
	case isIdentifier(r):
		// nash literals are lowercase
		absorbIdentifier(l)

		next := l.peek()

		if isEndOfLine(next) || isSpace(next) ||
			next == '=' || next == '(' ||
			next == ')' || next == ',' ||
			next == '[' || next == eof {
			lit := scanIdentifier(l)

			if len(lit) > 1 && r >= 'a' && r <= 'z' {
				l.emit(token.Lookup(lit))
			} else {
				l.emit(token.Ident)
			}
		} else if next == '.' {
			// because of shell idiosyncrasies we have to replicate
			// almost the same dotdotdot lexing here...
			ident := l.input[l.start:l.pos]
			identLine, identCol := l.lineStart, l.columnStart
			dotLine, dotColumn := l.line, l.column
			l.next()
			next = l.peek()
			if next == '.' {
				l.next()
				next = l.peek()
				if next == '.' {
					l.next()
					l.emitVal(token.Ident, ident, identLine, identCol)
					l.emitVal(token.Dotdotdot, "...", dotLine, dotColumn)
					return lexStart
				}
			}
			absorbArgument(l)
			l.emit(token.Arg)
		} else {
			absorbArgument(l)
			l.emit(token.Arg)
		}

		if next == eof && l.openParens > 0 {
			l.addSemicolon = false
		} else {
			l.addSemicolon = true
		}

		return lexStart
	case isArgument(r):
		absorbArgument(l)
		l.emit(token.Arg)
		l.addSemicolon = true
		return lexStart
	}

	return l.errorf("Unrecognized character in action: %#U", r)
}
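
// For illustration: assuming token.Lookup falls back to Ident for words
// that are not keywords, the input "echo hello\n" lexes roughly to
//
//	Ident("echo") Ident("hello") Semicolon(";") EOF
//
// with the Semicolon supplied by the newline rule in lexStart.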

func absorbIdentifier(l *Lexer) {
	for isIdentifier(l.next()) {
		// absorb
	}

	l.backup() // pos now points just past the identifier
}

func absorbArgument(l *Lexer) {
	for isArgument(l.next()) {
		// absorb
	}

	l.backup() // pos now points just past the argument
}

func scanIdentifier(l *Lexer) string {
	absorbIdentifier(l)

	return l.input[l.start:l.pos]
}

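// lexQuote scans a double-quoted string, decoding \n, \t, \\, \" and octal
// escapes on the fly, so the emitted token holds the decoded text rather
// than the raw input. For example, "\101" decodes to 'A' (octal 101 = 65).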
func lexQuote(l *Lexer) stateFn {
	data := make([]rune, 0, 256)

	for {
		r := l.next()

		if r != '"' && r != eof {
			if r == '\\' {
				r = l.next()

				switch r {
				case 'n':
					data = append(data, '\n')
				case 't':
					data = append(data, '\t')
				case '\\':
					data = append(data, '\\')
				case '"':
					data = append(data, '"')
				case 'x', 'u', 'U':
					return l.errorf("Escape types 'x', 'u' and 'U' aren't implemented yet")
				case '0', '1', '2', '3', '4', '5', '6', '7':
					x := r - '0'

					for i := 2; i > 0; i-- {
						r = l.next()

						if r >= '0' && r <= '7' {
							x = x*8 + r - '0'
							continue
						}

						return l.errorf("non-octal character in escape sequence: %c", r)
					}

					if x > 255 {
						return l.errorf("octal escape value > 255: %d", x)
					}

					data = append(data, x)
				default:
					// any other escape sequence is silently dropped
				}
			} else {
				data = append(data, r)
			}

			continue
		}

		if r == eof {
			return l.errorf("Quoted string not finished: %s", l.input[l.start:])
		}

		l.emitVal(token.String, string(data), l.lineStart, l.columnStart)

		l.ignore() // ignore the closing quote
		break
	}

	return lexStart
}

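// lexComment consumes everything up to the end of line (or EOF) and emits
// it, including the leading '#', as a single Comment token.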
func lexComment(l *Lexer) stateFn {
	for {
		r := l.next()

		if isEndOfLine(r) || r == eof {
			l.backup()
			l.emit(token.Comment)
			break
		}
	}

	return lexStart
}

func lexSpace(l *Lexer) stateFn {
	ignoreSpaces(l)
	return lexStart
}

func ignoreSpaces(l *Lexer) {
	for isSpace(l.next()) {
	}

	l.backup()
	l.ignore()
}

// isSpace reports whether r is a space character.
func isSpace(r rune) bool {
	return r == ' ' || r == '\t'
}

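// isArgument reports whether r may appear in an unquoted command argument:
// any rune except whitespace, end of line and the metacharacters that
// lexStart handles itself.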
func isArgument(r rune) bool {
	isID := isAlpha(r)

	return isID || (r != eof && !isEndOfLine(r) && !isSpace(r) &&
		r != '$' && r != '{' && r != '}' && r != '(' && r != ']' && r != '[' &&
		r != ')' && r != '>' && r != '"' && r != ',' && r != ';' && r != '|')
}

// isIdentifier reports whether r is valid in an identifier.
func isIdentifier(r rune) bool {
	return isAlpha(r) || r == '_'
}

// isAlpha reports whether r is a letter or a digit.
func isAlpha(r rune) bool {
	return unicode.IsLetter(r) || unicode.IsDigit(r)
}

// isEndOfLine reports whether r is an end-of-line character.
func isEndOfLine(r rune) bool {
	return r == '\r' || r == '\n'
}