// Package lexer is an AWK lexer (tokenizer).
//
// The lexer turns a string of AWK source code into a stream of
// tokens for parsing.
//
// To tokenize some source, create a new lexer with NewLexer(src) and
// then call Scan() until the token type is EOF or ILLEGAL.
package lexer

import (
	"fmt"
)

// Lexer tokenizes a byte string of AWK source code. Use NewLexer to
// actually create a lexer, and Scan() or ScanRegex() to get tokens.
type Lexer struct {
	src      []byte   // the entire source being tokenized
	offset   int      // byte offset one past the current char l.ch
	ch       byte     // current character (0 means end of input)
	pos      Position // position of the current character
	nextPos  Position // position of the character after l.ch
	hadSpace bool     // whether the last-scanned token had whitespace before it
	lastTok  Token    // most recently scanned token (consulted by scanRegex)
}

// Position stores the source line and column where a token starts.
type Position struct {
	// Line number of the token (starts at 1).
	Line int
	// Column on the line (starts at 1). Note that this is the byte
	// offset into the line, not rune offset.
	Column int
}

// NewLexer creates a new lexer that will tokenize the given source
// code. See the module-level example for a working example.
func NewLexer(src []byte) *Lexer {
	l := &Lexer{src: src}
	l.nextPos.Line = 1
	l.nextPos.Column = 1
	// Prime l.ch with the first character so scanning can begin.
	l.next()
	return l
}

// HadSpace returns true if the previously-scanned token had
// whitespace before it. Used by the parser because when calling a
// user-defined function the grammar doesn't allow a space between
// the function name and the left parenthesis.
func (l *Lexer) HadSpace() bool {
	return l.hadSpace
}

// Scan scans the next token and returns its position (line/column),
// token value (one of the uppercased token constants), and the
// string value of the token. For most tokens, the token value is
// empty. For NAME, NUMBER, STRING, and REGEX tokens, it's the
// token's value. For an ILLEGAL token, it's the error message.
func (l *Lexer) Scan() (Position, Token, string) {
	pos, tok, val := l.scan()
	l.lastTok = tok
	return pos, tok, val
}

// Does the real work of scanning. Scan() wraps this to more easily
// set lastTok.
func (l *Lexer) scan() (Position, Token, string) {
	// Skip whitespace (except newline, which is a token)
	l.hadSpace = false
	for l.ch == ' ' || l.ch == '\t' || l.ch == '\r' || l.ch == '\\' {
		l.hadSpace = true
		if l.ch == '\\' {
			// Backslash is only valid here as a line continuation:
			// it must be followed by \n (optionally preceded by \r).
			l.next()
			if l.ch == '\r' {
				l.next()
			}
			if l.ch != '\n' {
				return l.pos, ILLEGAL, "expected \\n after \\ line continuation"
			}
		}
		l.next()
	}
	if l.ch == '#' {
		// Skip comment till end of line
		l.next()
		for l.ch != '\n' && l.ch != 0 {
			l.next()
		}
	}
	if l.ch == 0 {
		// l.next() reached end of input
		return l.pos, EOF, ""
	}

	pos := l.pos
	tok := ILLEGAL
	val := ""

	ch := l.ch
	l.next()

	// Names: keywords and functions
	if isNameStart(ch) {
		// ch was already consumed and l.offset is one past l.ch, so
		// the name started 2 bytes before the current offset.
		start := l.offset - 2
		for isNameStart(l.ch) || (l.ch >= '0' && l.ch <= '9') {
			l.next()
		}
		name := string(l.src[start : l.offset-1])
		tok := KeywordToken(name)
		if tok == ILLEGAL {
			tok = NAME
			val = name
		}
		return pos, tok, val
	}

	// These are ordered by my guess at frequency of use. Should run
	// through a corpus of real AWK programs to determine actual
	// frequency.
	switch ch {
	case '$':
		tok = DOLLAR
	case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '.':
		// Avoid make/append and use l.offset directly for performance
		start := l.offset - 2
		gotDigit := false
		if ch != '.' {
			gotDigit = true
			for l.ch >= '0' && l.ch <= '9' {
				l.next()
			}
			if l.ch == '.' {
				l.next()
			}
		}
		for l.ch >= '0' && l.ch <= '9' {
			gotDigit = true
			l.next()
		}
		if !gotDigit {
			return l.pos, ILLEGAL, "expected digits"
		}
		if l.ch == 'e' || l.ch == 'E' {
			// Optional exponent part, eg: 1e5 or 1.2e-3
			l.next()
			gotSign := false
			if l.ch == '+' || l.ch == '-' {
				gotSign = true
				l.next()
			}
			gotDigit = false
			for l.ch >= '0' && l.ch <= '9' {
				l.next()
				gotDigit = true
			}
			// Per awk/gawk, "1e" is allowed, but not "1e+"
			if gotSign && !gotDigit {
				return l.pos, ILLEGAL, "expected digits"
			}
		}
		tok = NUMBER
		val = string(l.src[start : l.offset-1])
	case '{':
		tok = LBRACE
	case '}':
		tok = RBRACE
	case '=':
		tok = l.choice('=', ASSIGN, EQUALS)
	case '<':
		tok = l.choice('=', LESS, LTE)
	case '>':
		switch l.ch {
		case '=':
			l.next()
			tok = GTE
		case '>':
			l.next()
			tok = APPEND
		default:
			tok = GREATER
		}
	case '"', '\'':
		// Note: POSIX awk spec doesn't allow single-quoted strings,
		// but this helps without quoting, especially on Windows
		// where the shell quote character is " (double quote).
		chars := make([]byte, 0, 32) // most won't require heap allocation
		for l.ch != ch {
			c := l.ch
			if c == 0 {
				return l.pos, ILLEGAL, "didn't find end quote in string"
			}
			if c == '\r' || c == '\n' {
				return l.pos, ILLEGAL, "can't have newline in string"
			}
			if c != '\\' {
				// Normal, non-escaped character
				chars = append(chars, c)
				l.next()
				continue
			}
			// Escape sequence, skip over \ and process
			l.next()
			switch l.ch {
			case 'n':
				c = '\n'
				l.next()
			case 't':
				c = '\t'
				l.next()
			case 'r':
				c = '\r'
				l.next()
			case 'a':
				c = '\a'
				l.next()
			case 'b':
				c = '\b'
				l.next()
			case 'f':
				c = '\f'
				l.next()
			case 'v':
				c = '\v'
				l.next()
			case 'x':
				// Hex byte of one or two hex digits
				l.next()
				digit := hexDigit(l.ch)
				if digit < 0 {
					return l.pos, ILLEGAL, "1 or 2 hex digits expected"
				}
				c = byte(digit)
				l.next()
				digit = hexDigit(l.ch)
				if digit >= 0 {
					c = c*16 + byte(digit)
					l.next()
				}
			case '0', '1', '2', '3', '4', '5', '6', '7':
				// Octal byte of 1-3 octal digits
				c = l.ch - '0'
				l.next()
				for i := 0; i < 2 && l.ch >= '0' && l.ch <= '7'; i++ {
					c = c*8 + l.ch - '0'
					l.next()
				}
			default:
				// Any other escape character is just the char
				// itself, eg: "\z" is just "z"
				c = l.ch
				l.next()
			}
			chars = append(chars, c)
		}
		l.next()
		tok = STRING
		val = string(chars)
	case '(':
		tok = LPAREN
	case ')':
		tok = RPAREN
	case ',':
		tok = COMMA
	case ';':
		tok = SEMICOLON
	case '+':
		switch l.ch {
		case '+':
			l.next()
			tok = INCR
		case '=':
			l.next()
			tok = ADD_ASSIGN
		default:
			tok = ADD
		}
	case '-':
		switch l.ch {
		case '-':
			l.next()
			tok = DECR
		case '=':
			l.next()
			tok = SUB_ASSIGN
		default:
			tok = SUB
		}
	case '*':
		switch l.ch {
		case '*':
			// "**" and "**=" are non-POSIX aliases for "^" and "^="
			l.next()
			tok = l.choice('=', POW, POW_ASSIGN)
		case '=':
			l.next()
			tok = MUL_ASSIGN
		default:
			tok = MUL
		}
	case '/':
		tok = l.choice('=', DIV, DIV_ASSIGN)
	case '%':
		tok = l.choice('=', MOD, MOD_ASSIGN)
	case '[':
		tok = LBRACKET
	case ']':
		tok = RBRACKET
	case '\n':
		tok = NEWLINE
	case '^':
		tok = l.choice('=', POW, POW_ASSIGN)
	case '!':
		switch l.ch {
		case '=':
			l.next()
			tok = NOT_EQUALS
		case '~':
			l.next()
			tok = NOT_MATCH
		default:
			tok = NOT
		}
	case '~':
		tok = MATCH
	case '?':
		tok = QUESTION
	case ':':
		tok = COLON
	case '&':
		tok = l.choice('&', ILLEGAL, AND)
		if tok == ILLEGAL {
			return l.pos, ILLEGAL, "unexpected char after '&'"
		}
	case '|':
		tok = l.choice('|', PIPE, OR)
	default:
		tok = ILLEGAL
		val = "unexpected char"
	}
	return pos, tok, val
}

// ScanRegex parses an AWK regular expression in /slash/ syntax. The
// AWK grammar has somewhat special handling of regex tokens, so the
// parser can only call this after a DIV or DIV_ASSIGN token has just
// been scanned.
func (l *Lexer) ScanRegex() (Position, Token, string) {
	pos, tok, val := l.scanRegex()
	l.lastTok = tok
	return pos, tok, val
}

// Does the real work of scanning a regex. ScanRegex() wraps this to
// more easily set lastTok.
func (l *Lexer) scanRegex() (Position, Token, string) {
	pos := l.pos
	chars := make([]byte, 0, 32) // most won't require heap allocation
	switch l.lastTok {
	case DIV:
		// Regex after '/' (the usual case); back up the reported
		// position so it points at the opening slash.
		pos.Column -= 1
	case DIV_ASSIGN:
		// Regex after '/=' (happens when regex starts with '=')
		pos.Column -= 2
		chars = append(chars, '=')
	default:
		return l.pos, ILLEGAL, fmt.Sprintf("unexpected %s preceding regex", l.lastTok)
	}
	for l.ch != '/' {
		c := l.ch
		if c == 0 {
			return l.pos, ILLEGAL, "didn't find end slash in regex"
		}
		if c == '\r' || c == '\n' {
			return l.pos, ILLEGAL, "can't have newline in regex"
		}
		if c == '\\' {
			// \/ becomes a literal slash; any other backslash
			// sequence is kept as-is for the regex engine.
			l.next()
			if l.ch != '/' {
				chars = append(chars, '\\')
			}
			c = l.ch
		}
		chars = append(chars, c)
		l.next()
	}
	l.next()
	return pos, REGEX, string(chars)
}

// Load the next character into l.ch (or 0 on end of input) and update
// line and column position.
func (l *Lexer) next() {
	l.pos = l.nextPos
	if l.offset >= len(l.src) {
		// For last character, move offset 1 past the end as it
		// simplifies offset calculations in NAME and NUMBER
		if l.ch != 0 {
			l.ch = 0
			l.offset++
		}
		return
	}
	ch := l.src[l.offset]
	if ch == '\n' {
		l.nextPos.Line++
		l.nextPos.Column = 1
	} else {
		l.nextPos.Column++
	}
	l.ch = ch
	l.offset++
}

// isNameStart reports whether ch can start an AWK name (identifier):
// an underscore or ASCII letter.
func isNameStart(ch byte) bool {
	return ch == '_' || (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z')
}

// Return the hex digit 0-15 corresponding to the given ASCII byte,
// or -1 if it's not a valid hex digit.
419 func hexDigit(ch byte) int { 420 switch { 421 case ch >= '0' && ch <= '9': 422 return int(ch - '0') 423 case ch >= 'a' && ch <= 'f': 424 return int(ch - 'a' + 10) 425 case ch >= 'A' && ch <= 'F': 426 return int(ch - 'A' + 10) 427 default: 428 return -1 429 } 430 } 431 432 func (l *Lexer) choice(ch byte, one, two Token) Token { 433 if l.ch == ch { 434 l.next() 435 return two 436 } 437 return one 438 }