// github.phpd.cn/thought-machine/please@v12.2.0+incompatible/src/parse/asp/lexer.go

package asp

import (
	"io"
	"io/ioutil"
	"unicode"
	"unicode/utf8"
)

// Token types.
const (
	EOF = -(iota + 1)
	Ident
	Int
	String
	LexOperator
	EOL
	Unindent
)

// A Token describes each individual lexical element emitted by the lexer.
type Token struct {
	// Type of token. If > 0 this is the literal character value; if < 0 it is one of the types above.
	Type rune
	// The literal text of the token. Strings are lightly normalised to always be surrounded by quotes (but only one).
	Value string
	// The position in the input that the token occurred at.
	Pos Position
}

// String implements the fmt.Stringer interface.
func (tok Token) String() string {
	if tok.Value != "" {
		return tok.Value
	}
	return reverseSymbol(tok.Type)
}

// A Position describes a position in a source file.
type Position struct {
	Filename string
	Offset   int
	Line     int
	Column   int
}

type namer interface {
	Name() string
}

// NameOfReader returns a name for the given reader, if one can be determined.
func NameOfReader(r io.Reader) string {
	if n, ok := r.(namer); ok {
		return n.Name()
	}
	return ""
}

// newLexer creates a new lex instance.
func newLexer(r io.Reader) *lex {
	// Read the entire file upfront to avoid bufio etc.
	// This should work OK as long as BUILD files are relatively small.
	b, err := ioutil.ReadAll(r)
	if err != nil {
		fail(Position{Filename: NameOfReader(r)}, err.Error())
	}
	// If the file doesn't end in a newline, we will reject it with an "unexpected end of file"
	// error. That's a bit crap so quietly fix it up here.
	if len(b) > 0 && b[len(b)-1] != '\n' {
		b = append(b, '\n')
	}
	l := &lex{
		b:        append(b, 0, 0), // Null-terminating the buffer makes things easier later.
		filename: NameOfReader(r),
		indents:  []int{0},
	}
	l.Next() // Initial value is zero, this forces it to populate itself.
	// Discard any leading newlines, they are just an annoyance.
	for l.Peek().Type == EOL {
		l.Next()
	}
	return l
}

// A lex is a lexer for a single BUILD file.
type lex struct {
	b      []byte
	i      int
	line   int
	col    int
	indent int
	// The next token. We always look one token ahead in order to facilitate both Peek() and Next().
	next     Token
	filename string
	// Used to track how many braces we're within.
	braces int
	// Pending unindent tokens. This is a bit yuck but means the parser doesn't need to
	// concern itself with indentation.
	unindents int
	// Current levels of indentation.
	indents []int
	// Remember whether the last token we output was an end-of-line so we don't emit multiple in sequence.
	lastEOL bool
}

// reverseSymbol looks up a symbol's name from the lexer.
func reverseSymbol(sym rune) string {
	switch sym {
	case EOF:
		return "end of file"
	case Ident:
		return "identifier"
	case Int:
		return "integer"
	case String:
		return "string"
	case LexOperator:
		return "operator"
	case EOL:
		return "end of line"
	case Unindent:
		return "unindent"
	}
	return string(sym) // literal character
}
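// A usage sketch (illustrative only; the input string and the strings import
// are assumptions, not part of this file): the parser drives the lexer one
// token at a time through Peek and Next until it reaches EOF.
//
//	l := newLexer(strings.NewReader("x = 42\n"))
//	for tok := l.Next(); tok.Type != EOF; tok = l.Next() {
//		fmt.Printf("%s %q at %d:%d\n", reverseSymbol(tok.Type), tok.Value, tok.Pos.Line, tok.Pos.Column)
//	}
//
// For the input above this yields an identifier ("x"), a literal '=', an
// integer ("42") and a final end-of-line token before EOF.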
// reverseSymbols looks up a series of symbols' names from the lexer.
func reverseSymbols(syms []rune) []string {
	ret := make([]string, len(syms))
	for i, sym := range syms {
		ret[i] = reverseSymbol(sym)
	}
	return ret
}

// Peek at the next token.
func (l *lex) Peek() Token {
	return l.next
}

// Next consumes and returns the next token.
func (l *lex) Next() Token {
	ret := l.next
	l.next = l.nextToken()
	l.lastEOL = l.next.Type == EOL || l.next.Type == Unindent
	return ret
}

// AssignFollows is a hack to do extra lookahead which makes it easier to parse
// named call arguments. It returns true if the token after next is an assign operator.
func (l *lex) AssignFollows() bool {
	l.stripSpaces()
	return l.b[l.i] == '=' && l.b[l.i+1] != '='
}

func (l *lex) stripSpaces() {
	for l.b[l.i] == ' ' {
		l.i++
		l.col++
	}
}

// nextToken consumes and returns the next token.
func (l *lex) nextToken() Token {
	l.stripSpaces()
	pos := Position{
		Filename: l.filename,
		// These are all 1-indexed for niceness.
		Offset: l.i + 1,
		Line:   l.line + 1,
		Column: l.col + 1,
	}
	if l.unindents > 0 {
		l.unindents--
		return Token{Type: Unindent, Pos: pos}
	}
	b := l.b[l.i]
	rawString := b == 'r' && (l.b[l.i+1] == '"' || l.b[l.i+1] == '\'')
	if rawString {
		l.i++
		l.col++
		b = l.b[l.i]
	} else if (b >= 'a' && b <= 'z') || (b >= 'A' && b <= 'Z') || b == '_' || b >= utf8.RuneSelf {
		return l.consumeIdent(pos)
	}
	l.i++
	l.col++
	switch b {
	case 0:
		// End of file (we null-terminate it above so this is easy to spot).
		return Token{Type: EOF, Pos: pos}
	case '\n':
		// End of line, read indent to next non-space character.
		lastIndent := l.indent
		l.line++
		l.col = 0
		indent := 0
		for l.b[l.i] == ' ' {
			l.i++
			l.col++
			indent++
		}
		if l.b[l.i] == '\n' {
			return l.nextToken()
		}
		if l.braces == 0 {
			l.indent = indent
		}
		if lastIndent > l.indent && l.braces == 0 {
			pos.Line++ // Works better if it's at the new position
			pos.Column = l.col + 1
			for l.indents[len(l.indents)-1] > l.indent {
				l.unindents++
				l.indents = l.indents[:len(l.indents)-1]
			}
			if l.indent != l.indents[len(l.indents)-1] {
				fail(pos, "Unexpected indent")
			}
		} else if lastIndent != l.indent {
			l.indents = append(l.indents, l.indent)
		}
		if l.braces == 0 && !l.lastEOL {
			return Token{Type: EOL, Pos: pos}
		}
		return l.nextToken()
	case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9':
		return l.consumeInteger(b, pos)
	case '"', '\'':
		// String literal, consume to end.
		return l.consumePossiblyTripleQuotedString(b, pos, rawString)
	case '(', '[', '{':
		l.braces++
		return Token{Type: rune(b), Value: string(b), Pos: pos}
	case ')', ']', '}':
		if l.braces > 0 { // Don't let it go negative, it fouls things up
			l.braces--
		}
		return Token{Type: rune(b), Value: string(b), Pos: pos}
	case '=', '!', '+', '<', '>':
		// Look ahead one byte to see if this is an augmented assignment or comparison.
		if l.b[l.i] == '=' {
			l.i++
			l.col++
			return Token{Type: LexOperator, Value: string([]byte{b, l.b[l.i-1]}), Pos: pos}
		}
		fallthrough
	case ',', '.', '%', '*', '|', '&', ':':
		return Token{Type: rune(b), Value: string(b), Pos: pos}
	case '#':
		// Comment character, consume to end of line.
		for l.b[l.i] != '\n' && l.b[l.i] != 0 {
			l.i++
			l.col++
		}
		return l.nextToken() // Comments aren't tokens themselves.
	case '-':
		// We lex unary - with the integer if possible.
		if l.b[l.i] >= '0' && l.b[l.i] <= '9' {
			return l.consumeInteger(b, pos)
		}
		return Token{Type: rune(b), Value: string(b), Pos: pos}
	case '\t':
		fail(pos, "Tabs are not permitted in BUILD files, use space-based indentation instead")
	default:
		fail(pos, "Unknown symbol %c", b)
	}
	panic("unreachable")
}
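// A worked example of the indentation handling in nextToken (illustrative;
// the snippet below is an assumed input, not from this file). Given
//
//	def f():
//	    if x:
//	        pass
//	    pass
//
// the indents stack grows [0] -> [0, 4] -> [0, 4, 8]; when the last line
// drops back to column 4, one Unindent token is queued and the stack is
// popped to [0, 4]. An indent of, say, 6 at that point would match no entry
// on the stack and fail with "Unexpected indent". While l.braces > 0 all of
// this (and EOL emission) is suppressed, which is how bracketed expressions
// can span multiple lines.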
// consumeInteger consumes all characters until the end of an integer literal is reached.
func (l *lex) consumeInteger(initial byte, pos Position) Token {
	s := make([]byte, 1, 10)
	s[0] = initial
	for c := l.b[l.i]; c >= '0' && c <= '9'; c = l.b[l.i] {
		l.i++
		l.col++
		s = append(s, c)
	}
	return Token{Type: Int, Value: string(s), Pos: pos}
}

// consumePossiblyTripleQuotedString consumes all characters until the end of a string token.
func (l *lex) consumePossiblyTripleQuotedString(quote byte, pos Position, raw bool) Token {
	if l.b[l.i] == quote && l.b[l.i+1] == quote {
		l.i += 2 // Jump over initial quote
		l.col += 2
		return l.consumeString(quote, pos, true, raw)
	}
	return l.consumeString(quote, pos, false, raw)
}

// consumeString consumes all characters until the end of a string literal is reached.
func (l *lex) consumeString(quote byte, pos Position, multiline, raw bool) Token {
	s := make([]byte, 1, 100) // 100 chars is typically enough for a single string literal.
	s[0] = '"'
	escaped := false
	for {
		c := l.b[l.i]
		l.i++
		l.col++
		if escaped {
			if c == 'n' {
				s = append(s, '\n')
			} else if c == '\n' && multiline {
				l.line++
				l.col = 0
			} else if c == '\\' || c == '\'' || c == '"' {
				s = append(s, c)
			} else {
				s = append(s, '\\', c)
			}
			escaped = false
			continue
		}
		switch c {
		case quote:
			s = append(s, '"')
			if !multiline || (l.b[l.i] == quote && l.b[l.i+1] == quote) {
				if multiline {
					l.i += 2
					l.col += 2
				}
				token := Token{Type: String, Value: string(s), Pos: pos}
				if l.braces > 0 {
					return l.handleImplicitStringConcatenation(token)
				}
				return token
			}
		case '\n':
			if multiline {
				l.line++
				l.col = 0
				s = append(s, c)
				continue
			}
			fallthrough
		case 0:
			fail(pos, "Unterminated string literal")
		case '\\':
			if !raw {
				escaped = true
				continue
			}
			fallthrough
		default:
			s = append(s, c)
		}
	}
}

// handleImplicitStringConcatenation looks ahead after a string token and checks if the next token
// will be a string; if so we collapse them both into one string now.
func (l *lex) handleImplicitStringConcatenation(token Token) Token {
	col := l.col
	line := l.line
	for i, b := range l.b[l.i:] {
		switch b {
		case '\n':
			col = 0
			line++
			continue
		case ' ':
			col++
			continue
		case '"', '\'':
			l.i += i + 1
			l.col = col + 1
			l.line = line
			// Note that we don't handle raw strings here. Anecdotally, that seems relatively rare...
			tok := l.consumePossiblyTripleQuotedString(b, token.Pos, false)
			token.Value = token.Value[:len(token.Value)-1] + tok.Value[1:]
			return token
		default:
			return token
		}
	}
	return token
}
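// Illustrative behaviour of the implicit concatenation above (the snippet is
// an assumed input, not from this file): within braces, adjacent literals
// such as
//
//	["//foo" "//bar"]
//
// collapse into the single String token `"//foo//bar"`; the closing quote of
// the first value and the opening quote of the second are trimmed before
// joining. Spaces and newlines between the two literals are skipped by the
// lookahead; any other character ends it and the first string is returned
// unchanged.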
// consumeIdent consumes all characters of an identifier.
func (l *lex) consumeIdent(pos Position) Token {
	s := make([]rune, 0, 100)
	for {
		c := rune(l.b[l.i])
		if c >= utf8.RuneSelf {
			// Multi-byte encoded in utf-8.
			r, n := utf8.DecodeRune(l.b[l.i:])
			c = r
			l.i += n
			l.col += n
			if !unicode.IsLetter(c) && !unicode.IsDigit(c) {
				fail(pos, "Illegal Unicode identifier %c", c)
			}
			s = append(s, c)
			continue
		}
		l.i++
		l.col++
		switch c {
		case ' ':
			// End of identifier, but no unconsuming needed.
			return Token{Type: Ident, Value: string(s), Pos: pos}
		case '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9':
			s = append(s, c)
		default:
			// End of identifier. Unconsume the last character so it gets handled next time.
			l.i--
			l.col--
			return Token{Type: Ident, Value: string(s), Pos: pos}
		}
	}
}
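// Illustrative examples of consumeIdent (assumed inputs, not from this file):
// "café=1" lexes as the Ident "café" (multi-byte runes must satisfy
// unicode.IsLetter or unicode.IsDigit, otherwise the lexer fails), then a
// literal '=', then the Int "1"; the '=' that terminates the identifier is
// unconsumed above so the next nextToken call sees it afresh. A plain space
// terminator is simply swallowed along with the identifier.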