github.com/golang/gofrontend@v0.0.0-20240429183944-60f985a78526/go/lex.h (about) 1 // lex.h -- Go frontend lexer. -*- C++ -*- 2 3 // Copyright 2009 The Go Authors. All rights reserved. 4 // Use of this source code is governed by a BSD-style 5 // license that can be found in the LICENSE file. 6 7 #ifndef GO_LEX_H 8 #define GO_LEX_H 9 10 #include <mpfr.h> 11 12 #include "operator.h" 13 #include "go-linemap.h" 14 15 struct Unicode_range; 16 17 // The keywords. These must be in sorted order, other than 18 // KEYWORD_INVALID. They must match the Keywords::mapping_ array in 19 // lex.cc. 20 21 enum Keyword 22 { 23 KEYWORD_INVALID, // Not a keyword. 24 KEYWORD_ASM, 25 KEYWORD_BREAK, 26 KEYWORD_CASE, 27 KEYWORD_CHAN, 28 KEYWORD_CONST, 29 KEYWORD_CONTINUE, 30 KEYWORD_DEFAULT, 31 KEYWORD_DEFER, 32 KEYWORD_ELSE, 33 KEYWORD_FALLTHROUGH, 34 KEYWORD_FOR, 35 KEYWORD_FUNC, 36 KEYWORD_GO, 37 KEYWORD_GOTO, 38 KEYWORD_IF, 39 KEYWORD_IMPORT, 40 KEYWORD_INTERFACE, 41 KEYWORD_MAP, 42 KEYWORD_PACKAGE, 43 KEYWORD_RANGE, 44 KEYWORD_RETURN, 45 KEYWORD_SELECT, 46 KEYWORD_STRUCT, 47 KEYWORD_SWITCH, 48 KEYWORD_TYPE, 49 KEYWORD_VAR 50 }; 51 52 // Pragmas built from magic comments and recorded for functions. 53 // These are used as bits in a bitmask. 54 // The set of values is intended to be the same as the gc compiler. 55 56 enum GoPragma 57 { 58 GOPRAGMA_NOINTERFACE = 1 << 0, // Method not in type descriptor. 59 GOPRAGMA_NOESCAPE = 1 << 1, // Args do not escape. 60 GOPRAGMA_NORACE = 1 << 2, // No race detector. 61 GOPRAGMA_NOSPLIT = 1 << 3, // Do not split stack. 62 GOPRAGMA_NOINLINE = 1 << 4, // Do not inline. 63 GOPRAGMA_SYSTEMSTACK = 1 << 5, // Must run on system stack. 64 GOPRAGMA_NOWRITEBARRIER = 1 << 6, // No write barriers. 65 GOPRAGMA_NOWRITEBARRIERREC = 1 << 7, // No write barriers here or callees. 66 GOPRAGMA_YESWRITEBARRIERREC = 1 << 8, // Stops nowritebarrierrec. 67 GOPRAGMA_MARK = 1 << 9, // Marker for nowritebarrierrec. 68 GOPRAGMA_CGOUNSAFEARGS = 1 << 10, // Pointer to arg is pointer to all. 69 GOPRAGMA_UINTPTRESCAPES = 1 << 11, // uintptr(p) escapes. 70 GOPRAGMA_NOTINHEAP = 1 << 12 // type is not in heap. 71 }; 72 73 // A token returned from the lexer. 74 75 class Token 76 { 77 public: 78 // Token classification. 79 enum Classification 80 { 81 // Token is invalid. 82 TOKEN_INVALID, 83 // Token indicates end of input. 84 TOKEN_EOF, 85 // Token is a keyword. 86 TOKEN_KEYWORD, 87 // Token is an identifier. 88 TOKEN_IDENTIFIER, 89 // Token is a string of characters. 90 TOKEN_STRING, 91 // Token is an operator. 92 TOKEN_OPERATOR, 93 // Token is a character constant. 94 TOKEN_CHARACTER, 95 // Token is an integer. 96 TOKEN_INTEGER, 97 // Token is a floating point number. 98 TOKEN_FLOAT, 99 // Token is an imaginary number. 100 TOKEN_IMAGINARY 101 }; 102 103 ~Token(); 104 Token(const Token&); 105 Token& operator=(const Token&); 106 107 // Get token classification. 108 Classification 109 classification() const 110 { return this->classification_; } 111 112 // Make a token for an invalid value. 113 static Token 114 make_invalid_token(Location location) 115 { return Token(TOKEN_INVALID, location); } 116 117 // Make a token representing end of file. 118 static Token 119 make_eof_token(Location location) 120 { return Token(TOKEN_EOF, location); } 121 122 // Make a keyword token. 123 static Token 124 make_keyword_token(Keyword keyword, Location location) 125 { 126 Token tok(TOKEN_KEYWORD, location); 127 tok.u_.keyword = keyword; 128 return tok; 129 } 130 131 // Make an identifier token. 132 static Token 133 make_identifier_token(const std::string& value, bool is_exported, 134 Location location) 135 { 136 Token tok(TOKEN_IDENTIFIER, location); 137 tok.u_.identifier_value.name = new std::string(value); 138 tok.u_.identifier_value.is_exported = is_exported; 139 return tok; 140 } 141 142 // Make a quoted string token. 143 static Token 144 make_string_token(const std::string& value, Location location) 145 { 146 Token tok(TOKEN_STRING, location); 147 tok.u_.string_value = new std::string(value); 148 return tok; 149 } 150 151 // Make an operator token. 152 static Token 153 make_operator_token(Operator op, Location location) 154 { 155 Token tok(TOKEN_OPERATOR, location); 156 tok.u_.op = op; 157 return tok; 158 } 159 160 // Make a character constant token. 161 static Token 162 make_character_token(mpz_t val, Location location) 163 { 164 Token tok(TOKEN_CHARACTER, location); 165 mpz_init(tok.u_.integer_value); 166 mpz_swap(tok.u_.integer_value, val); 167 return tok; 168 } 169 170 // Make an integer token. 171 static Token 172 make_integer_token(mpz_t val, Location location) 173 { 174 Token tok(TOKEN_INTEGER, location); 175 mpz_init(tok.u_.integer_value); 176 mpz_swap(tok.u_.integer_value, val); 177 return tok; 178 } 179 180 // Make a float token. 181 static Token 182 make_float_token(mpfr_t val, Location location) 183 { 184 Token tok(TOKEN_FLOAT, location); 185 mpfr_init(tok.u_.float_value); 186 mpfr_swap(tok.u_.float_value, val); 187 return tok; 188 } 189 190 // Make a token for an imaginary number. 191 static Token 192 make_imaginary_token(mpfr_t val, Location location) 193 { 194 Token tok(TOKEN_IMAGINARY, location); 195 mpfr_init(tok.u_.float_value); 196 mpfr_swap(tok.u_.float_value, val); 197 return tok; 198 } 199 200 // Get the location of the token. 201 Location 202 location() const 203 { return this->location_; } 204 205 // Return whether this is an invalid token. 206 bool 207 is_invalid() const 208 { return this->classification_ == TOKEN_INVALID; } 209 210 // Return whether this is the EOF token. 211 bool 212 is_eof() const 213 { return this->classification_ == TOKEN_EOF; } 214 215 // Return the keyword value for a keyword token. 216 Keyword 217 keyword() const 218 { 219 go_assert(this->classification_ == TOKEN_KEYWORD); 220 return this->u_.keyword; 221 } 222 223 // Return whether this is an identifier. 224 bool 225 is_identifier() const 226 { return this->classification_ == TOKEN_IDENTIFIER; } 227 228 // Return the identifier. 229 const std::string& 230 identifier() const 231 { 232 go_assert(this->classification_ == TOKEN_IDENTIFIER); 233 return *this->u_.identifier_value.name; 234 } 235 236 // Return whether the identifier is exported. 237 bool 238 is_identifier_exported() const 239 { 240 go_assert(this->classification_ == TOKEN_IDENTIFIER); 241 return this->u_.identifier_value.is_exported; 242 } 243 244 // Return whether this is a string. 245 bool 246 is_string() const 247 { 248 return this->classification_ == TOKEN_STRING; 249 } 250 251 // Return the value of a string. The returned value is a string of 252 // UTF-8 characters. 253 std::string 254 string_value() const 255 { 256 go_assert(this->classification_ == TOKEN_STRING); 257 return *this->u_.string_value; 258 } 259 260 // Return the value of a character constant. 261 const mpz_t* 262 character_value() const 263 { 264 go_assert(this->classification_ == TOKEN_CHARACTER); 265 return &this->u_.integer_value; 266 } 267 268 // Return the value of an integer. 269 const mpz_t* 270 integer_value() const 271 { 272 go_assert(this->classification_ == TOKEN_INTEGER); 273 return &this->u_.integer_value; 274 } 275 276 // Return the value of a float. 277 const mpfr_t* 278 float_value() const 279 { 280 go_assert(this->classification_ == TOKEN_FLOAT); 281 return &this->u_.float_value; 282 } 283 284 // Return the value of an imaginary number. 285 const mpfr_t* 286 imaginary_value() const 287 { 288 go_assert(this->classification_ == TOKEN_IMAGINARY); 289 return &this->u_.float_value; 290 } 291 292 // Return the operator value for an operator token. 293 Operator 294 op() const 295 { 296 go_assert(this->classification_ == TOKEN_OPERATOR); 297 return this->u_.op; 298 } 299 300 // Return whether this token is KEYWORD. 301 bool 302 is_keyword(Keyword keyword) const 303 { 304 return (this->classification_ == TOKEN_KEYWORD 305 && this->u_.keyword == keyword); 306 } 307 308 // Return whether this token is OP. 309 bool 310 is_op(Operator op) const 311 { return this->classification_ == TOKEN_OPERATOR && this->u_.op == op; } 312 313 // Print the token for debugging. 314 void 315 print(FILE*) const; 316 317 private: 318 // Private constructor used by make_..._token functions above. 319 Token(Classification, Location); 320 321 // Clear the token. 322 void 323 clear(); 324 325 // The token classification. 326 Classification classification_; 327 union 328 { 329 // The keyword value for TOKEN_KEYWORD. 330 Keyword keyword; 331 // The token value for TOKEN_IDENTIFIER. 332 struct 333 { 334 // The name of the identifier. This has been mangled to only 335 // include ASCII characters. 336 std::string* name; 337 // Whether this name should be exported. This is true if the 338 // first letter in the name is upper case. 339 bool is_exported; 340 } identifier_value; 341 // The string value for TOKEN_STRING. 342 std::string* string_value; 343 // The token value for TOKEN_CHARACTER or TOKEN_INTEGER. 344 mpz_t integer_value; 345 // The token value for TOKEN_FLOAT or TOKEN_IMAGINARY. 346 mpfr_t float_value; 347 // The token value for TOKEN_OPERATOR or the keyword value 348 Operator op; 349 } u_; 350 // The source location. 351 Location location_; 352 }; 353 354 // The lexer itself. 355 356 class Lex 357 { 358 public: 359 Lex(const char* input_file_name, FILE* input_file, Linemap *linemap); 360 361 ~Lex(); 362 363 // Return the next token. 364 Token 365 next_token(); 366 367 // Return the contents of any current //extern comment. 368 const std::string& 369 extern_name() const 370 { return this->extern_; } 371 372 // Return the current set of pragmas, and clear them. 373 unsigned int 374 get_and_clear_pragmas() 375 { 376 unsigned int ret = this->pragmas_; 377 this->pragmas_ = 0; 378 return ret; 379 } 380 381 struct Linkname 382 { 383 std::string ext_name; // External name; empty to just export. 384 bool is_exported; // Whether the internal name is exported. 385 Location loc; // Location of go:linkname directive. 386 387 Linkname() 388 : ext_name(), is_exported(false), loc() 389 { } 390 391 Linkname(const std::string& ext_name_a, bool is_exported_a, Location loc_a) 392 : ext_name(ext_name_a), is_exported(is_exported_a), loc(loc_a) 393 { } 394 }; 395 396 typedef std::map<std::string, Linkname> Linknames; 397 398 // Return the linknames seen so far, or NULL if none, and clear the 399 // set. These are from go:linkname compiler directives. 400 Linknames* 401 get_and_clear_linknames() 402 { 403 Linknames* ret = this->linknames_; 404 this->linknames_ = NULL; 405 return ret; 406 } 407 408 // Return whether there are any current go:embed patterns. 409 bool 410 has_embeds() const 411 { return !this->embeds_.empty(); } 412 413 // If there are any go:embed patterns seen so far, store them in 414 // *EMBEDS and clear the saved set. *EMBEDS must be an empty 415 // vector. 416 void 417 get_and_clear_embeds(std::vector<std::string>* embeds) 418 { 419 go_assert(embeds->empty()); 420 std::swap(*embeds, this->embeds_); 421 } 422 423 // Clear any go:embed patterns seen so far. This is used for 424 // erroneous cases. 425 void 426 clear_embeds() 427 { this->embeds_.clear(); } 428 429 // Return whether the identifier NAME should be exported. NAME is a 430 // mangled name which includes only ASCII characters. 431 static bool 432 is_exported_mangled_name(const std::string& name); 433 434 // Return whether the identifier NAME should be exported. NAME is 435 // an unmangled utf-8 string and may contain non-ASCII characters. 436 static bool 437 is_exported_name(const std::string& name); 438 439 // Return whether the identifier NAME is invalid. When we see an 440 // invalid character we still build an identifier, but we use a 441 // magic string to indicate that the identifier is invalid. We then 442 // use this to avoid knockon errors. 443 static bool 444 is_invalid_identifier(const std::string& name); 445 446 // A helper function. Append V to STR. IS_CHARACTER is true if V 447 // is a Unicode character which should be converted into UTF-8, 448 // false if it is a byte value to be appended directly. The 449 // location is used to warn about an out of range character. 450 static void 451 append_char(unsigned int v, bool is_charater, std::string* str, 452 Location); 453 454 // A helper function. Fetch a UTF-8 character from STR and store it 455 // in *VALUE. Return the number of bytes read from STR. Return 0 456 // if STR does not point to a valid UTF-8 character. 457 static int 458 fetch_char(const char* str, unsigned int *value); 459 460 // Return whether C is a Unicode or "C" locale space character. 461 static bool 462 is_unicode_space(unsigned int c); 463 464 // Convert the specified hex char into an unsigned integer value. 465 static unsigned 466 hex_val(char c); 467 468 private: 469 ssize_t 470 get_line(); 471 472 bool 473 require_line(); 474 475 // The current location. 476 Location 477 location() const; 478 479 // A position CHARS column positions before the current location. 480 Location 481 earlier_location(int chars) const; 482 483 static bool 484 is_hex_digit(char); 485 486 static bool 487 is_base_digit(int base, char); 488 489 static unsigned char 490 octal_value(char c) 491 { return c - '0'; } 492 493 Token 494 make_invalid_token() 495 { return Token::make_invalid_token(this->location()); } 496 497 Token 498 make_eof_token() 499 { return Token::make_eof_token(this->location()); } 500 501 Token 502 make_operator(Operator op, int chars) 503 { return Token::make_operator_token(op, this->earlier_location(chars)); } 504 505 Token 506 gather_identifier(); 507 508 static bool 509 could_be_exponent(int base, const char*, const char*); 510 511 Token 512 gather_number(); 513 514 void 515 skip_exponent(); 516 517 Token 518 gather_character(); 519 520 Token 521 gather_string(); 522 523 Token 524 gather_raw_string(); 525 526 const char* 527 advance_one_utf8_char(const char*, unsigned int*, bool*); 528 529 const char* 530 advance_one_char(const char*, bool, unsigned int*, bool*); 531 532 static bool 533 is_unicode_digit(unsigned int c); 534 535 static bool 536 is_unicode_letter(unsigned int c); 537 538 static bool 539 is_unicode_uppercase(unsigned int c); 540 541 static bool 542 is_in_unicode_range(unsigned int C, const Unicode_range* ranges, 543 size_t range_size); 544 545 Operator 546 three_character_operator(char, char, char); 547 548 Operator 549 two_character_operator(char, char); 550 551 Operator 552 one_character_operator(char); 553 554 bool 555 skip_c_comment(bool* found_newline); 556 557 void 558 skip_cpp_comment(); 559 560 void 561 gather_embed(const char*, const char*); 562 563 // The input file name. 564 const char* input_file_name_ ATTRIBUTE_UNUSED; 565 // The input file. 566 FILE* input_file_; 567 // The object used to keep track of file names and line numbers. 568 Linemap* linemap_; 569 // The line buffer. This holds the current line. 570 char* linebuf_; 571 // The size of the line buffer. 572 size_t linebufsize_; 573 // The nmber of characters in the current line. 574 size_t linesize_; 575 // The current offset in linebuf_. 576 size_t lineoff_; 577 // The current line number. 578 size_t lineno_; 579 // Whether to add a semicolon if we see a newline now. 580 bool add_semi_at_eol_; 581 // Pragmas for the next function, from magic comments. 582 unsigned int pragmas_; 583 // The external name to use for a function declaration, from a magic 584 // //extern comment. 585 std::string extern_; 586 // The list of //go:linkname comments, if any. 587 Linknames* linknames_; 588 // The list of //go:embed patterns, if any. 589 std::vector<std::string> embeds_; 590 }; 591 592 #endif // !defined(GO_LEX_H)