github.com/benhoyt/goawk@v1.8.1/lexer/lexer.go (about)

     1  // Package lexer is an AWK lexer (tokenizer).
     2  //
     3  // The lexer turns a string of AWK source code into a stream of
     4  // tokens for parsing.
     5  //
     6  // To tokenize some source, create a new lexer with NewLexer(src) and
     7  // then call Scan() until the token type is EOF or ILLEGAL.
     8  //
     9  package lexer
    10  
    11  import (
    12  	"fmt"
    13  )
    14  
// Lexer tokenizes a byte string of AWK source code. Use NewLexer to
// actually create a lexer, and Scan() or ScanRegex() to get tokens.
type Lexer struct {
	src      []byte   // entire source being tokenized
	offset   int      // index into src one past the position of ch
	ch       byte     // current character (0 means end of input)
	pos      Position // position of ch
	nextPos  Position // position of the character after ch
	hadSpace bool     // whether the last token scanned had whitespace before it
	lastTok  Token    // last token scanned; used by scanRegex to handle '/' vs '/='
}
    26  
// Position stores the source line and column where a token starts.
// Positions are maintained by the lexer as it reads bytes (see
// Lexer.next) and returned by Scan and ScanRegex.
type Position struct {
	// Line number of the token (starts at 1).
	Line int
	// Column on the line (starts at 1). Note that this is the byte
	// offset into the line, not rune offset.
	Column int
}
    35  
    36  // NewLexer creates a new lexer that will tokenize the given source
    37  // code. See the module-level example for a working example.
    38  func NewLexer(src []byte) *Lexer {
    39  	l := &Lexer{src: src}
    40  	l.nextPos.Line = 1
    41  	l.nextPos.Column = 1
    42  	l.next()
    43  	return l
    44  }
    45  
// HadSpace returns true if the previously-scanned token had
// whitespace before it. Used by the parser because when calling a
// user-defined function the grammar doesn't allow a space between
// the function name and the left parenthesis. (hadSpace is set by
// the whitespace-skipping loop at the top of scan.)
func (l *Lexer) HadSpace() bool {
	return l.hadSpace
}
    53  
    54  // Scan scans the next token and returns its position (line/column),
    55  // token value (one of the uppercased token constants), and the
    56  // string value of the token. For most tokens, the token value is
    57  // empty. For NAME, NUMBER, STRING, and REGEX tokens, it's the
    58  // token's value. For an ILLEGAL token, it's the error message.
    59  func (l *Lexer) Scan() (Position, Token, string) {
    60  	pos, tok, val := l.scan()
    61  	l.lastTok = tok
    62  	return pos, tok, val
    63  }
    64  
// Does the real work of scanning. Scan() wraps this to more easily
// set lastTok.
func (l *Lexer) scan() (Position, Token, string) {
	// Skip whitespace (except newline, which is a token). Backslash
	// is handled here too, as it's only valid as a line continuation.
	l.hadSpace = false
	for l.ch == ' ' || l.ch == '\t' || l.ch == '\r' || l.ch == '\\' {
		l.hadSpace = true
		if l.ch == '\\' {
			l.next()
			// Allow an optional \r before the \n (Windows line endings)
			if l.ch == '\r' {
				l.next()
			}
			if l.ch != '\n' {
				return l.pos, ILLEGAL, "expected \\n after \\ line continuation"
			}
		}
		l.next()
	}
	if l.ch == '#' {
		// Skip comment till end of line (the \n itself is left to be
		// scanned as a NEWLINE token by the switch below)
		l.next()
		for l.ch != '\n' && l.ch != 0 {
			l.next()
		}
	}
	if l.ch == 0 {
		// l.next() reached end of input
		return l.pos, EOF, ""
	}

	pos := l.pos
	tok := ILLEGAL
	val := ""

	// Consume the first character of the token: after this, l.ch is
	// the character after ch, and l.offset is one past l.ch.
	ch := l.ch
	l.next()

	// Names: keywords and functions
	if isNameStart(ch) {
		// ch sits at index l.offset-2 (l.offset is one past l.ch,
		// which itself is one past ch)
		start := l.offset - 2
		for isNameStart(l.ch) || (l.ch >= '0' && l.ch <= '9') {
			l.next()
		}
		// l.offset-1 is the index of l.ch, the first non-name char
		name := string(l.src[start : l.offset-1])
		tok := KeywordToken(name)
		if tok == ILLEGAL {
			// Not a keyword or builtin function, so it's a plain name
			tok = NAME
			val = name
		}
		return pos, tok, val
	}

	// These are ordered by my guess at frequency of use. Should run
	// through a corpus of real AWK programs to determine actual
	// frequency.
	switch ch {
	case '$':
		tok = DOLLAR
	case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '.':
		// Avoid make/append and use l.offset directly for performance
		start := l.offset - 2
		gotDigit := false
		if ch != '.' {
			// Integer part, then an optional decimal point
			gotDigit = true
			for l.ch >= '0' && l.ch <= '9' {
				l.next()
			}
			if l.ch == '.' {
				l.next()
			}
		}
		// Fractional digits (or all the digits, if it began with '.')
		for l.ch >= '0' && l.ch <= '9' {
			gotDigit = true
			l.next()
		}
		if !gotDigit {
			// A lone '.' with no digits on either side
			return l.pos, ILLEGAL, "expected digits"
		}
		// Optional exponent: e or E, optional sign, then digits
		if l.ch == 'e' || l.ch == 'E' {
			l.next()
			gotSign := false
			if l.ch == '+' || l.ch == '-' {
				gotSign = true
				l.next()
			}
			gotDigit = false
			for l.ch >= '0' && l.ch <= '9' {
				l.next()
				gotDigit = true
			}
			// Per awk/gawk, "1e" is allowed, but not "1e+"
			if gotSign && !gotDigit {
				return l.pos, ILLEGAL, "expected digits"
			}
		}
		tok = NUMBER
		// Slice out the raw text of the number (l.offset-1 is the
		// index of l.ch, the first char past the number)
		val = string(l.src[start : l.offset-1])
	case '{':
		tok = LBRACE
	case '}':
		tok = RBRACE
	case '=':
		tok = l.choice('=', ASSIGN, EQUALS)
	case '<':
		tok = l.choice('=', LESS, LTE)
	case '>':
		// '>' can be GREATER, '>=' (GTE), or '>>' (APPEND redirection)
		switch l.ch {
		case '=':
			l.next()
			tok = GTE
		case '>':
			l.next()
			tok = APPEND
		default:
			tok = GREATER
		}
	case '"', '\'':
		// Note: POSIX awk spec doesn't allow single-quoted strings,
		// but this helps without quoting, especially on Windows
		// where the shell quote character is " (double quote).
		chars := make([]byte, 0, 32) // most won't require heap allocation
		for l.ch != ch {
			c := l.ch
			if c == 0 {
				return l.pos, ILLEGAL, "didn't find end quote in string"
			}
			if c == '\r' || c == '\n' {
				return l.pos, ILLEGAL, "can't have newline in string"
			}
			if c != '\\' {
				// Normal, non-escaped character
				chars = append(chars, c)
				l.next()
				continue
			}
			// Escape sequence, skip over \ and process
			l.next()
			switch l.ch {
			case 'n':
				c = '\n'
				l.next()
			case 't':
				c = '\t'
				l.next()
			case 'r':
				c = '\r'
				l.next()
			case 'a':
				c = '\a'
				l.next()
			case 'b':
				c = '\b'
				l.next()
			case 'f':
				c = '\f'
				l.next()
			case 'v':
				c = '\v'
				l.next()
			case 'x':
				// Hex byte of one of two hex digits
				l.next()
				digit := hexDigit(l.ch)
				if digit < 0 {
					return l.pos, ILLEGAL, "1 or 2 hex digits expected"
				}
				c = byte(digit)
				l.next()
				// Second hex digit is optional
				digit = hexDigit(l.ch)
				if digit >= 0 {
					c = c*16 + byte(digit)
					l.next()
				}
			case '0', '1', '2', '3', '4', '5', '6', '7':
				// Octal byte of 1-3 octal digits
				c = l.ch - '0'
				l.next()
				for i := 0; i < 2 && l.ch >= '0' && l.ch <= '7'; i++ {
					c = c*8 + l.ch - '0'
					l.next()
				}
			default:
				// Any other escape character is just the char
				// itself, eg: "\z" is just "z"
				c = l.ch
				l.next()
			}
			chars = append(chars, c)
		}
		// Skip the closing quote
		l.next()
		tok = STRING
		val = string(chars)
	case '(':
		tok = LPAREN
	case ')':
		tok = RPAREN
	case ',':
		tok = COMMA
	case ';':
		tok = SEMICOLON
	case '+':
		// '+' can be ADD, '++' (INCR), or '+=' (ADD_ASSIGN)
		switch l.ch {
		case '+':
			l.next()
			tok = INCR
		case '=':
			l.next()
			tok = ADD_ASSIGN
		default:
			tok = ADD
		}
	case '-':
		// '-' can be SUB, '--' (DECR), or '-=' (SUB_ASSIGN)
		switch l.ch {
		case '-':
			l.next()
			tok = DECR
		case '=':
			l.next()
			tok = SUB_ASSIGN
		default:
			tok = SUB
		}
	case '*':
		// '*' can be MUL, '**' (POW), '**=' (POW_ASSIGN), or '*=' (MUL_ASSIGN)
		switch l.ch {
		case '*':
			l.next()
			tok = l.choice('=', POW, POW_ASSIGN)
		case '=':
			l.next()
			tok = MUL_ASSIGN
		default:
			tok = MUL
		}
	case '/':
		// Note: a regex after '/' is handled separately via ScanRegex,
		// driven by the parser
		tok = l.choice('=', DIV, DIV_ASSIGN)
	case '%':
		tok = l.choice('=', MOD, MOD_ASSIGN)
	case '[':
		tok = LBRACKET
	case ']':
		tok = RBRACKET
	case '\n':
		tok = NEWLINE
	case '^':
		tok = l.choice('=', POW, POW_ASSIGN)
	case '!':
		// '!' can be NOT, '!=' (NOT_EQUALS), or '!~' (NOT_MATCH)
		switch l.ch {
		case '=':
			l.next()
			tok = NOT_EQUALS
		case '~':
			l.next()
			tok = NOT_MATCH
		default:
			tok = NOT
		}
	case '~':
		tok = MATCH
	case '?':
		tok = QUESTION
	case ':':
		tok = COLON
	case '&':
		// Only '&&' is valid; a lone '&' is an error
		tok = l.choice('&', ILLEGAL, AND)
		if tok == ILLEGAL {
			return l.pos, ILLEGAL, "unexpected char after '&'"
		}
	case '|':
		tok = l.choice('|', PIPE, OR)
	default:
		tok = ILLEGAL
		val = "unexpected char"
	}
	return pos, tok, val
}
   340  
   341  // ScanRegex parses an AWK regular expression in /slash/ syntax. The
   342  // AWK grammar has somewhat special handling of regex tokens, so the
   343  // parser can only call this after a DIV or DIV_ASSIGN token has just
   344  // been scanned.
   345  func (l *Lexer) ScanRegex() (Position, Token, string) {
   346  	pos, tok, val := l.scanRegex()
   347  	l.lastTok = tok
   348  	return pos, tok, val
   349  }
   350  
   351  // Does the real work of scanning a regex. ScanRegex() wraps this to
   352  // more easily set lastTok.
   353  func (l *Lexer) scanRegex() (Position, Token, string) {
   354  	pos := l.pos
   355  	chars := make([]byte, 0, 32) // most won't require heap allocation
   356  	switch l.lastTok {
   357  	case DIV:
   358  		// Regex after '/' (the usual case)
   359  		pos.Column -= 1
   360  	case DIV_ASSIGN:
   361  		// Regex after '/=' (happens when regex starts with '=')
   362  		pos.Column -= 2
   363  		chars = append(chars, '=')
   364  	default:
   365  		return l.pos, ILLEGAL, fmt.Sprintf("unexpected %s preceding regex", l.lastTok)
   366  	}
   367  	for l.ch != '/' {
   368  		c := l.ch
   369  		if c == 0 {
   370  			return l.pos, ILLEGAL, "didn't find end slash in regex"
   371  		}
   372  		if c == '\r' || c == '\n' {
   373  			return l.pos, ILLEGAL, "can't have newline in regex"
   374  		}
   375  		if c == '\\' {
   376  			l.next()
   377  			if l.ch != '/' {
   378  				chars = append(chars, '\\')
   379  			}
   380  			c = l.ch
   381  		}
   382  		chars = append(chars, c)
   383  		l.next()
   384  	}
   385  	l.next()
   386  	return pos, REGEX, string(chars)
   387  }
   388  
   389  // Load the next character into l.ch (or 0 on end of input) and update
   390  // line and column position.
   391  func (l *Lexer) next() {
   392  	l.pos = l.nextPos
   393  	if l.offset >= len(l.src) {
   394  		// For last character, move offset 1 past the end as it
   395  		// simplifies offset calculations in NAME and NUMBER
   396  		if l.ch != 0 {
   397  			l.ch = 0
   398  			l.offset++
   399  		}
   400  		return
   401  	}
   402  	ch := l.src[l.offset]
   403  	if ch == '\n' {
   404  		l.nextPos.Line++
   405  		l.nextPos.Column = 1
   406  	} else {
   407  		l.nextPos.Column++
   408  	}
   409  	l.ch = ch
   410  	l.offset++
   411  }
   412  
   413  func isNameStart(ch byte) bool {
   414  	return ch == '_' || (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z')
   415  }
   416  
   417  // Return the hex digit 0-15 corresponding to the given ASCII byte,
   418  // or -1 if it's not a valid hex digit.
   419  func hexDigit(ch byte) int {
   420  	switch {
   421  	case ch >= '0' && ch <= '9':
   422  		return int(ch - '0')
   423  	case ch >= 'a' && ch <= 'f':
   424  		return int(ch - 'a' + 10)
   425  	case ch >= 'A' && ch <= 'F':
   426  		return int(ch - 'A' + 10)
   427  	default:
   428  		return -1
   429  	}
   430  }
   431  
   432  func (l *Lexer) choice(ch byte, one, two Token) Token {
   433  	if l.ch == ch {
   434  		l.next()
   435  		return two
   436  	}
   437  	return one
   438  }