// github.com/tiagovtristao/plz@v13.4.0+incompatible/src/parse/asp/lexer.go

package asp

import (
	"io"
	"io/ioutil"
	"unicode"
	"unicode/utf8"
)

// Token types.
const (
	EOF = -(iota + 1)
	Ident
	Int
	String
	LexOperator
	EOL
	Unindent
)

// A Token describes each individual lexical element emitted by the lexer.
type Token struct {
	// Type of token. If > 0 this is the literal character value; if < 0 it is one of the types above.
	Type rune
	// The literal text of the token. Strings are lightly normalised to always be surrounded by quotes (but only one).
	Value string
	// The position in the input that the token occurred at.
	Pos Position
}

// String implements the fmt.Stringer interface.
func (tok Token) String() string {
	if tok.Value != "" {
		return tok.Value
	}
	return reverseSymbol(tok.Type)
}

// EndPos returns the end position of a token.
func (tok Token) EndPos() Position {
	end := tok.Pos
	end.Offset += len(tok.Value)
	end.Column += len(tok.Value)

	return end
}

// A Position describes a position in a source file.
// All properties in Position are 1-indexed.
type Position struct {
	Filename string
	Offset   int
	Line     int
	Column   int
}

type namer interface {
	Name() string
}

// NameOfReader returns a name for the given reader, if one can be determined.
func NameOfReader(r io.Reader) string {
	if n, ok := r.(namer); ok {
		return n.Name()
	}
	return ""
}

// newLexer creates a new lex instance.
func newLexer(r io.Reader) *lex {
	// Read the entire file upfront to avoid bufio etc.
	// This should work OK as long as BUILD files are relatively small.
	b, err := ioutil.ReadAll(r)
	if err != nil {
		fail(Position{Filename: NameOfReader(r)}, err.Error())
	}
	// If the file doesn't end in a newline, we will reject it with an "unexpected end of file"
	// error. That's a bit crap so quietly fix it up here.
	if len(b) > 0 && b[len(b)-1] != '\n' {
		b = append(b, '\n')
	}
	l := &lex{
		b:        append(b, 0, 0), // Null-terminating the buffer makes things easier later.
		filename: NameOfReader(r),
		indents:  []int{0},
	}
	l.Next() // Initial value is zero, this forces it to populate itself.
	// Discard any leading newlines, they are just an annoyance.
	for l.Peek().Type == EOL {
		l.Next()
	}
	return l
}

// A lex is a lexer for a single BUILD file.
type lex struct {
	b      []byte
	i      int
	line   int
	col    int
	indent int
	// The next token. We always look one token ahead in order to facilitate both Peek() and Next().
	next     Token
	filename string
	// Used to track how many braces we're within.
	braces int
	// Pending unindent tokens. This is a bit yuck but means the parser doesn't need to
	// concern itself about indentation.
	unindents int
	// Current levels of indentation.
	indents []int
	// Remember whether the last token we output was an end-of-line so we don't emit multiple in sequence.
	lastEOL bool
}
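// A hedged sketch (not part of the original file) of what the token stream
// looks like in practice. Assuming fail() reports errors as above, lexing the
// single line `x = 42` would go roughly like this:
//
//	l := newLexer(strings.NewReader("x = 42"))
//	for tok := l.Next(); tok.Type != EOF; tok = l.Next() {
//		fmt.Printf("%d:%d %s\n", tok.Pos.Line, tok.Pos.Column, tok)
//	}
//
// printing something like:
//
//	1:1 x
//	1:3 =
//	1:5 42
//	1:7 end of line
//
// Note the trailing EOL: newLexer guarantees the input ends in a newline, and
// Unindent tokens would follow here if the line closed an indented block.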
// reverseSymbol looks up a symbol's name from the lexer.
func reverseSymbol(sym rune) string {
	switch sym {
	case EOF:
		return "end of file"
	case Ident:
		return "identifier"
	case Int:
		return "integer"
	case String:
		return "string"
	case LexOperator:
		return "operator"
	case EOL:
		return "end of line"
	case Unindent:
		return "unindent"
	}
	return string(sym) // literal character
}

// reverseSymbols looks up the names of a series of symbols from the lexer.
func reverseSymbols(syms []rune) []string {
	ret := make([]string, len(syms))
	for i, sym := range syms {
		ret[i] = reverseSymbol(sym)
	}
	return ret
}

// Peek at the next token.
func (l *lex) Peek() Token {
	return l.next
}

// Next consumes and returns the next token.
func (l *lex) Next() Token {
	ret := l.next
	l.next = l.nextToken()
	l.lastEOL = l.next.Type == EOL || l.next.Type == Unindent
	return ret
}

// AssignFollows is a hack to do extra lookahead which makes it easier to parse
// named call arguments. It returns true if the token after next is an assign operator.
func (l *lex) AssignFollows() bool {
	l.stripSpaces()
	return l.b[l.i] == '=' && l.b[l.i+1] != '='
}

func (l *lex) stripSpaces() {
	for l.b[l.i] == ' ' {
		l.i++
		l.col++
	}
}
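// A sketch (not in the original file) of why AssignFollows exists: when the
// parser has just read an Ident inside a call's parentheses, it must decide
// between a named argument and an expression before committing to either parse:
//
//	f(x = 1)   // AssignFollows() == true  → "x" is a named argument
//	f(x == 1)  // AssignFollows() == false → "x == 1" is a comparison
//
// Peeking at the raw bytes here is cheaper than buffering a second token of
// lookahead in the lexer itself.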
// nextToken consumes and returns the next token.
func (l *lex) nextToken() Token {
	l.stripSpaces()
	pos := Position{
		Filename: l.filename,
		// These are all 1-indexed for niceness.
		Offset: l.i + 1,
		Line:   l.line + 1,
		Column: l.col + 1,
	}
	if l.unindents > 0 {
		l.unindents--
		return Token{Type: Unindent, Pos: pos}
	}
	b := l.b[l.i]
	rawString := b == 'r' && (l.b[l.i+1] == '"' || l.b[l.i+1] == '\'')
	fString := b == 'f' && (l.b[l.i+1] == '"' || l.b[l.i+1] == '\'')
	if rawString || fString {
		l.i++
		l.col++
		b = l.b[l.i]
	} else if (b >= 'a' && b <= 'z') || (b >= 'A' && b <= 'Z') || b == '_' || b >= utf8.RuneSelf {
		return l.consumeIdent(pos)
	}
	l.i++
	l.col++
	switch b {
	case 0:
		// End of file (we null terminate it above so this is easy to spot)
		return Token{Type: EOF, Pos: pos}
	case '\n':
		// End of line, read indent to next non-space character
		lastIndent := l.indent
		l.line++
		l.col = 0
		indent := 0
		for l.b[l.i] == ' ' {
			l.i++
			l.col++
			indent++
		}
		if l.b[l.i] == '\n' {
			return l.nextToken()
		}
		if l.braces == 0 {
			l.indent = indent
		}
		if lastIndent > l.indent && l.braces == 0 {
			pos.Line++ // Works better if it's at the new position
			pos.Column = l.col + 1
			for l.indents[len(l.indents)-1] > l.indent {
				l.unindents++
				l.indents = l.indents[:len(l.indents)-1]
			}
			if l.indent != l.indents[len(l.indents)-1] {
				fail(pos, "Unexpected indent")
			}
		} else if lastIndent != l.indent {
			l.indents = append(l.indents, l.indent)
		}
		if l.braces == 0 && !l.lastEOL {
			return Token{Type: EOL, Pos: pos}
		}
		return l.nextToken()
	case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9':
		return l.consumeInteger(b, pos)
	case '"', '\'':
		// String literal, consume to end.
		return l.consumePossiblyTripleQuotedString(b, pos, rawString, fString)
	case '(', '[', '{':
		l.braces++
		return Token{Type: rune(b), Value: string(b), Pos: pos}
	case ')', ']', '}':
		if l.braces > 0 { // Don't let it go negative, it fouls things up
			l.braces--
		}
		return Token{Type: rune(b), Value: string(b), Pos: pos}
	case '=', '!', '+', '<', '>':
		// Look ahead one byte to see if this is an augmented assignment or comparison.
		if l.b[l.i] == '=' {
			l.i++
			l.col++
			return Token{Type: LexOperator, Value: string([]byte{b, l.b[l.i-1]}), Pos: pos}
		}
		fallthrough
	case ',', '.', '%', '*', '|', '&', ':':
		return Token{Type: rune(b), Value: string(b), Pos: pos}
	case '#':
		// Comment character, consume to end of line.
		for l.b[l.i] != '\n' && l.b[l.i] != 0 {
			l.i++
			l.col++
		}
		return l.nextToken() // Comments aren't tokens themselves.
	case '-':
		// We lex unary - with the integer if possible.
		if l.b[l.i] >= '0' && l.b[l.i] <= '9' {
			return l.consumeInteger(b, pos)
		}
		return Token{Type: rune(b), Value: string(b), Pos: pos}
	case '\t':
		fail(pos, "Tabs are not permitted in BUILD files, use space-based indentation instead")
	default:
		fail(pos, "Unknown symbol %c", b)
	}
	panic("unreachable")
}

// consumeInteger consumes all characters until the end of an integer literal is reached.
func (l *lex) consumeInteger(initial byte, pos Position) Token {
	s := make([]byte, 1, 10)
	s[0] = initial
	for c := l.b[l.i]; c >= '0' && c <= '9'; c = l.b[l.i] {
		l.i++
		l.col++
		s = append(s, c)
	}
	return Token{Type: Int, Value: string(s), Pos: pos}
}

// consumePossiblyTripleQuotedString consumes all characters until the end of a string token.
func (l *lex) consumePossiblyTripleQuotedString(quote byte, pos Position, raw, fString bool) Token {
	if l.b[l.i] == quote && l.b[l.i+1] == quote {
		l.i += 2 // Jump over initial quote
		l.col += 2
		return l.consumeString(quote, pos, true, raw, fString)
	}
	return l.consumeString(quote, pos, false, raw, fString)
}
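// A hedged sketch (not in the original file) of the normalisation performed
// below: whatever quoting the source used, Token.Value always carries one pair
// of double quotes, with the 'f' prefix preserved for format strings:
//
//	'hello'      → "hello"
//	"""multi"""  → "multi"
//	r'a\nb'      → "a\nb"   (raw: the backslash is kept literally)
//	f"x = {x}"   → f"x = {x}"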
// consumeString consumes all characters until the end of a string literal is reached.
func (l *lex) consumeString(quote byte, pos Position, multiline, raw, fString bool) Token {
	s := make([]byte, 1, 100) // 100 chars is typically enough for a single string literal.
	s[0] = '"'
	escaped := false
	for {
		c := l.b[l.i]
		l.i++
		l.col++
		if escaped {
			if c == 'n' {
				s = append(s, '\n')
			} else if c == '\n' && multiline {
				l.line++
				l.col = 0
			} else if c == '\\' || c == '\'' || c == '"' {
				s = append(s, c)
			} else {
				s = append(s, '\\', c)
			}
			escaped = false
			continue
		}
		switch c {
		case quote:
			s = append(s, '"')
			if !multiline || (l.b[l.i] == quote && l.b[l.i+1] == quote) {
				if multiline {
					l.i += 2
					l.col += 2
				}
				token := Token{Type: String, Value: string(s), Pos: pos}
				if fString {
					token.Value = "f" + token.Value
				}
				if l.braces > 0 {
					return l.handleImplicitStringConcatenation(token)
				}
				return token
			}
		case '\n':
			if multiline {
				l.line++
				l.col = 0
				s = append(s, c)
				continue
			}
			fallthrough
		case 0:
			fail(pos, "Unterminated string literal")
		case '\\':
			if !raw {
				escaped = true
				continue
			}
			fallthrough
		default:
			s = append(s, c)
		}
	}
}

// handleImplicitStringConcatenation looks ahead after a string token and checks if the next
// token will be a string; if so we collapse them both into one string now.
func (l *lex) handleImplicitStringConcatenation(token Token) Token {
	col := l.col
	line := l.line
	for i, b := range l.b[l.i:] {
		switch b {
		case '\n':
			col = 0
			line++
			continue
		case ' ':
			col++
			continue
		case '"', '\'':
			l.i += i + 1
			l.col = col + 1
			l.line = line
			// Note that we don't handle raw or format strings here. Anecdotally, that seems relatively rare...
			tok := l.consumePossiblyTripleQuotedString(b, token.Pos, false, false)
			token.Value = token.Value[:len(token.Value)-1] + tok.Value[1:]
			return token
		default:
			return token
		}
	}
	return token
}

// consumeIdent consumes all characters of an identifier.
func (l *lex) consumeIdent(pos Position) Token {
	s := make([]rune, 0, 100)
	for {
		c := rune(l.b[l.i])
		if c >= utf8.RuneSelf {
			// Multi-byte encoded in utf-8.
			r, n := utf8.DecodeRune(l.b[l.i:])
			c = r
			l.i += n
			l.col += n
			if !unicode.IsLetter(c) && !unicode.IsDigit(c) {
				fail(pos, "Illegal Unicode identifier %c", c)
			}
			s = append(s, c)
			continue
		}
		l.i++
		l.col++
		switch c {
		case ' ':
			// End of identifier, but no unconsuming needed.
			return Token{Type: Ident, Value: string(s), Pos: pos}
		case '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
			'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
			'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
			'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
			'0', '1', '2', '3', '4', '5', '6', '7', '8', '9':
			s = append(s, c)
		default:
			// End of identifier. Unconsume the last character so it gets handled next time.
			l.i--
			l.col--
			return Token{Type: Ident, Value: string(s), Pos: pos}
		}
	}
}
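// A hedged sketch (not in the original file) of the implicit string
// concatenation handled above. It only fires inside brackets (l.braces > 0),
// mirroring Python's adjacent-literal rule:
//
//	x = ["foo" "bar"]    // lexes as one String token: "foobar"
//	x = ["foo", "bar"]   // the comma intervenes: two String tokens
//	x = "foo" "bar"      // outside brackets: two String tokens
//
// The lookahead skips only spaces and newlines, so any other byte (including
// a '#' comment) between the two literals also prevents concatenation.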