github.com/golang/gofrontend@v0.0.0-20240429183944-60f985a78526/go/lex.h (about)

     1  // lex.h -- Go frontend lexer.     -*- C++ -*-
     2  
     3  // Copyright 2009 The Go Authors. All rights reserved.
     4  // Use of this source code is governed by a BSD-style
     5  // license that can be found in the LICENSE file.
     6  
     7  #ifndef GO_LEX_H
     8  #define GO_LEX_H
     9  
    10  #include <mpfr.h>
    11  
    12  #include "operator.h"
    13  #include "go-linemap.h"
    14  
    15  struct Unicode_range;
    16  
    17  // The keywords.  These must be in sorted order, other than
    18  // KEYWORD_INVALID.  They must match the Keywords::mapping_ array in
    19  // lex.cc.
    20  
    21  enum Keyword
    22  {
    23    KEYWORD_INVALID,	// Not a keyword.
    24    KEYWORD_ASM,
    25    KEYWORD_BREAK,
    26    KEYWORD_CASE,
    27    KEYWORD_CHAN,
    28    KEYWORD_CONST,
    29    KEYWORD_CONTINUE,
    30    KEYWORD_DEFAULT,
    31    KEYWORD_DEFER,
    32    KEYWORD_ELSE,
    33    KEYWORD_FALLTHROUGH,
    34    KEYWORD_FOR,
    35    KEYWORD_FUNC,
    36    KEYWORD_GO,
    37    KEYWORD_GOTO,
    38    KEYWORD_IF,
    39    KEYWORD_IMPORT,
    40    KEYWORD_INTERFACE,
    41    KEYWORD_MAP,
    42    KEYWORD_PACKAGE,
    43    KEYWORD_RANGE,
    44    KEYWORD_RETURN,
    45    KEYWORD_SELECT,
    46    KEYWORD_STRUCT,
    47    KEYWORD_SWITCH,
    48    KEYWORD_TYPE,
    49    KEYWORD_VAR
    50  };
    51  
    52  // Pragmas built from magic comments and recorded for functions.
    53  // These are used as bits in a bitmask.
    54  // The set of values is intended to be the same as the gc compiler.
    55  
    56  enum GoPragma
    57  {
    58    GOPRAGMA_NOINTERFACE = 1 << 0,	// Method not in type descriptor.
    59    GOPRAGMA_NOESCAPE = 1 << 1,		// Args do not escape.
    60    GOPRAGMA_NORACE = 1 << 2,		// No race detector.
    61    GOPRAGMA_NOSPLIT = 1 << 3,		// Do not split stack.
    62    GOPRAGMA_NOINLINE = 1 << 4,		// Do not inline.
    63    GOPRAGMA_SYSTEMSTACK = 1 << 5,	// Must run on system stack.
    64    GOPRAGMA_NOWRITEBARRIER = 1 << 6,	// No write barriers.
    65    GOPRAGMA_NOWRITEBARRIERREC = 1 << 7,	// No write barriers here or callees.
    66    GOPRAGMA_YESWRITEBARRIERREC = 1 << 8,	// Stops nowritebarrierrec.
    67    GOPRAGMA_MARK = 1 << 9,		// Marker for nowritebarrierrec.
    68    GOPRAGMA_CGOUNSAFEARGS = 1 << 10,	// Pointer to arg is pointer to all.
    69    GOPRAGMA_UINTPTRESCAPES = 1 << 11,	// uintptr(p) escapes.
    70    GOPRAGMA_NOTINHEAP = 1 << 12		// type is not in heap.
    71  };
    72  
    73  // A token returned from the lexer.
    74  
    75  class Token
    76  {
    77   public:
    78    // Token classification.
    79    enum Classification
    80    {
    81      // Token is invalid.
    82      TOKEN_INVALID,
    83      // Token indicates end of input.
    84      TOKEN_EOF,
    85      // Token is a keyword.
    86      TOKEN_KEYWORD,
    87      // Token is an identifier.
    88      TOKEN_IDENTIFIER,
    89      // Token is a string of characters.
    90      TOKEN_STRING,
    91      // Token is an operator.
    92      TOKEN_OPERATOR,
    93      // Token is a character constant.
    94      TOKEN_CHARACTER,
    95      // Token is an integer.
    96      TOKEN_INTEGER,
    97      // Token is a floating point number.
    98      TOKEN_FLOAT,
    99      // Token is an imaginary number.
   100      TOKEN_IMAGINARY
   101    };
   102  
   103    ~Token();
   104    Token(const Token&);
   105    Token& operator=(const Token&);
   106  
   107    // Get token classification.
   108    Classification
   109    classification() const
   110    { return this->classification_; }
   111  
   112    // Make a token for an invalid value.
   113    static Token
   114    make_invalid_token(Location location)
   115    { return Token(TOKEN_INVALID, location); }
   116  
   117    // Make a token representing end of file.
   118    static Token
   119    make_eof_token(Location location)
   120    { return Token(TOKEN_EOF, location); }
   121  
   122    // Make a keyword token.
   123    static Token
   124    make_keyword_token(Keyword keyword, Location location)
   125    {
   126      Token tok(TOKEN_KEYWORD, location);
   127      tok.u_.keyword = keyword;
   128      return tok;
   129    }
   130  
   131    // Make an identifier token.
   132    static Token
   133    make_identifier_token(const std::string& value, bool is_exported,
   134  			Location location)
   135    {
   136      Token tok(TOKEN_IDENTIFIER, location);
   137      tok.u_.identifier_value.name = new std::string(value);
   138      tok.u_.identifier_value.is_exported = is_exported;
   139      return tok;
   140    }
   141  
   142    // Make a quoted string token.
   143    static Token
   144    make_string_token(const std::string& value, Location location)
   145    {
   146      Token tok(TOKEN_STRING, location);
   147      tok.u_.string_value = new std::string(value);
   148      return tok;
   149    }
   150  
   151    // Make an operator token.
   152    static Token
   153    make_operator_token(Operator op, Location location)
   154    {
   155      Token tok(TOKEN_OPERATOR, location);
   156      tok.u_.op = op;
   157      return tok;
   158    }
   159  
   160    // Make a character constant token.
   161    static Token
   162    make_character_token(mpz_t val, Location location)
   163    {
   164      Token tok(TOKEN_CHARACTER, location);
   165      mpz_init(tok.u_.integer_value);
   166      mpz_swap(tok.u_.integer_value, val);
   167      return tok;
   168    }
   169  
   170    // Make an integer token.
   171    static Token
   172    make_integer_token(mpz_t val, Location location)
   173    {
   174      Token tok(TOKEN_INTEGER, location);
   175      mpz_init(tok.u_.integer_value);
   176      mpz_swap(tok.u_.integer_value, val);
   177      return tok;
   178    }
   179  
   180    // Make a float token.
   181    static Token
   182    make_float_token(mpfr_t val, Location location)
   183    {
   184      Token tok(TOKEN_FLOAT, location);
   185      mpfr_init(tok.u_.float_value);
   186      mpfr_swap(tok.u_.float_value, val);
   187      return tok;
   188    }
   189  
   190    // Make a token for an imaginary number.
   191    static Token
   192    make_imaginary_token(mpfr_t val, Location location)
   193    {
   194      Token tok(TOKEN_IMAGINARY, location);
   195      mpfr_init(tok.u_.float_value);
   196      mpfr_swap(tok.u_.float_value, val);
   197      return tok;
   198    }
   199  
   200    // Get the location of the token.
   201    Location
   202    location() const
   203    { return this->location_; }
   204  
   205    // Return whether this is an invalid token.
   206    bool
   207    is_invalid() const
   208    { return this->classification_ == TOKEN_INVALID; }
   209  
   210    // Return whether this is the EOF token.
   211    bool
   212    is_eof() const
   213    { return this->classification_ == TOKEN_EOF; }
   214  
   215    // Return the keyword value for a keyword token.
   216    Keyword
   217    keyword() const
   218    {
   219      go_assert(this->classification_ == TOKEN_KEYWORD);
   220      return this->u_.keyword;
   221    }
   222  
   223    // Return whether this is an identifier.
   224    bool
   225    is_identifier() const
   226    { return this->classification_ == TOKEN_IDENTIFIER; }
   227  
   228    // Return the identifier.
   229    const std::string&
   230    identifier() const
   231    {
   232      go_assert(this->classification_ == TOKEN_IDENTIFIER);
   233      return *this->u_.identifier_value.name;
   234    }
   235  
   236    // Return whether the identifier is exported.
   237    bool
   238    is_identifier_exported() const
   239    {
   240      go_assert(this->classification_ == TOKEN_IDENTIFIER);
   241      return this->u_.identifier_value.is_exported;
   242    }
   243  
   244    // Return whether this is a string.
   245    bool
   246    is_string() const
   247    {
   248      return this->classification_ == TOKEN_STRING;
   249    }
   250  
   251    // Return the value of a string.  The returned value is a string of
   252    // UTF-8 characters.
   253    std::string
   254    string_value() const
   255    {
   256      go_assert(this->classification_ == TOKEN_STRING);
   257      return *this->u_.string_value;
   258    }
   259  
   260    // Return the value of a character constant.
   261    const mpz_t*
   262    character_value() const
   263    {
   264      go_assert(this->classification_ == TOKEN_CHARACTER);
   265      return &this->u_.integer_value;
   266    }
   267  
   268    // Return the value of an integer.
   269    const mpz_t*
   270    integer_value() const
   271    {
   272      go_assert(this->classification_ == TOKEN_INTEGER);
   273      return &this->u_.integer_value;
   274    }
   275  
   276    // Return the value of a float.
   277    const mpfr_t*
   278    float_value() const
   279    {
   280      go_assert(this->classification_ == TOKEN_FLOAT);
   281      return &this->u_.float_value;
   282    }
   283  
   284    // Return the value of an imaginary number.
   285    const mpfr_t*
   286    imaginary_value() const
   287    {
   288      go_assert(this->classification_ == TOKEN_IMAGINARY);
   289      return &this->u_.float_value;
   290    }
   291  
   292    // Return the operator value for an operator token.
   293    Operator
   294    op() const
   295    {
   296      go_assert(this->classification_ == TOKEN_OPERATOR);
   297      return this->u_.op;
   298    }
   299  
   300    // Return whether this token is KEYWORD.
   301    bool
   302    is_keyword(Keyword keyword) const
   303    {
   304      return (this->classification_ == TOKEN_KEYWORD
   305  	    && this->u_.keyword == keyword);
   306    }
   307  
   308    // Return whether this token is OP.
   309    bool
   310    is_op(Operator op) const
   311    { return this->classification_ == TOKEN_OPERATOR && this->u_.op == op; }
   312  
   313    // Print the token for debugging.
   314    void
   315    print(FILE*) const;
   316  
   317   private:
   318    // Private constructor used by make_..._token functions above.
   319    Token(Classification, Location);
   320  
   321    // Clear the token.
   322    void
   323    clear();
   324  
   325    // The token classification.
   326    Classification classification_;
   327    union
   328    {
   329      // The keyword value for TOKEN_KEYWORD.
   330      Keyword keyword;
   331      // The token value for TOKEN_IDENTIFIER.
   332      struct
   333      {
   334        // The name of the identifier.  This has been mangled to only
   335        // include ASCII characters.
   336        std::string* name;
   337        // Whether this name should be exported.  This is true if the
   338        // first letter in the name is upper case.
   339        bool is_exported;
   340      } identifier_value;
   341      // The string value for TOKEN_STRING.
   342      std::string* string_value;
   343      // The token value for TOKEN_CHARACTER or TOKEN_INTEGER.
   344      mpz_t integer_value;
   345      // The token value for TOKEN_FLOAT or TOKEN_IMAGINARY.
   346      mpfr_t float_value;
   347      // The token value for TOKEN_OPERATOR or the keyword value
   348      Operator op;
   349    } u_;
   350    // The source location.
   351    Location location_;
   352  };
   353  
   354  // The lexer itself.
   355  
   356  class Lex
   357  {
   358   public:
   359    Lex(const char* input_file_name, FILE* input_file, Linemap *linemap);
   360  
   361    ~Lex();
   362  
   363    // Return the next token.
   364    Token
   365    next_token();
   366  
   367    // Return the contents of any current //extern comment.
   368    const std::string&
   369    extern_name() const
   370    { return this->extern_; }
   371  
   372    // Return the current set of pragmas, and clear them.
   373    unsigned int
   374    get_and_clear_pragmas()
   375    {
   376      unsigned int ret = this->pragmas_;
   377      this->pragmas_ = 0;
   378      return ret;
   379    }
   380  
   381    struct Linkname
   382    {
   383      std::string ext_name;	// External name; empty to just export.
   384      bool is_exported;		// Whether the internal name is exported.
   385      Location loc;		// Location of go:linkname directive.
   386  
   387      Linkname()
   388        : ext_name(), is_exported(false), loc()
   389      { }
   390  
   391      Linkname(const std::string& ext_name_a, bool is_exported_a, Location loc_a)
   392        : ext_name(ext_name_a), is_exported(is_exported_a), loc(loc_a)
   393      { }
   394    };
   395  
   396    typedef std::map<std::string, Linkname> Linknames;
   397  
   398    // Return the linknames seen so far, or NULL if none, and clear the
   399    // set.  These are from go:linkname compiler directives.
   400    Linknames*
   401    get_and_clear_linknames()
   402    {
   403      Linknames* ret = this->linknames_;
   404      this->linknames_ = NULL;
   405      return ret;
   406    }
   407  
   408    // Return whether there are any current go:embed patterns.
   409    bool
   410    has_embeds() const
   411    { return !this->embeds_.empty(); }
   412  
   413    // If there are any go:embed patterns seen so far, store them in
   414    // *EMBEDS and clear the saved set.  *EMBEDS must be an empty
   415    // vector.
   416    void
   417    get_and_clear_embeds(std::vector<std::string>* embeds)
   418    {
   419      go_assert(embeds->empty());
   420      std::swap(*embeds, this->embeds_);
   421    }
   422  
   423    // Clear any go:embed patterns seen so far.  This is used for
   424    // erroneous cases.
   425    void
   426    clear_embeds()
   427    { this->embeds_.clear(); }
   428  
   429    // Return whether the identifier NAME should be exported.  NAME is a
   430    // mangled name which includes only ASCII characters.
   431    static bool
   432    is_exported_mangled_name(const std::string& name);
   433  
   434    // Return whether the identifier NAME should be exported.  NAME is
   435    // an unmangled utf-8 string and may contain non-ASCII characters.
   436    static bool
   437    is_exported_name(const std::string& name);
   438  
   439    // Return whether the identifier NAME is invalid.  When we see an
   440    // invalid character we still build an identifier, but we use a
   441    // magic string to indicate that the identifier is invalid.  We then
   442    // use this to avoid knockon errors.
   443    static bool
   444    is_invalid_identifier(const std::string& name);
   445  
   446    // A helper function.  Append V to STR.  IS_CHARACTER is true if V
   447    // is a Unicode character which should be converted into UTF-8,
   448    // false if it is a byte value to be appended directly.  The
   449    // location is used to warn about an out of range character.
   450    static void
   451    append_char(unsigned int v, bool is_charater, std::string* str,
   452  	      Location);
   453  
   454    // A helper function.  Fetch a UTF-8 character from STR and store it
   455    // in *VALUE.  Return the number of bytes read from STR.  Return 0
   456    // if STR does not point to a valid UTF-8 character.
   457    static int
   458    fetch_char(const char* str, unsigned int *value);
   459  
   460    // Return whether C is a Unicode or "C" locale space character.
   461    static bool
   462    is_unicode_space(unsigned int c);
   463  
   464    // Convert the specified hex char into an unsigned integer value.
   465    static unsigned
   466    hex_val(char c);
   467  
   468   private:
   469    ssize_t
   470    get_line();
   471  
   472    bool
   473    require_line();
   474  
   475    // The current location.
   476    Location
   477    location() const;
   478  
   479    // A position CHARS column positions before the current location.
   480    Location
   481    earlier_location(int chars) const;
   482  
   483    static bool
   484    is_hex_digit(char);
   485  
   486    static bool
   487    is_base_digit(int base, char);
   488  
   489    static unsigned char
   490    octal_value(char c)
   491    { return c - '0'; }
   492  
   493    Token
   494    make_invalid_token()
   495    { return Token::make_invalid_token(this->location()); }
   496  
   497    Token
   498    make_eof_token()
   499    { return Token::make_eof_token(this->location()); }
   500  
   501    Token
   502    make_operator(Operator op, int chars)
   503    { return Token::make_operator_token(op, this->earlier_location(chars)); }
   504  
   505    Token
   506    gather_identifier();
   507  
   508    static bool
   509    could_be_exponent(int base, const char*, const char*);
   510  
   511    Token
   512    gather_number();
   513  
   514    void
   515    skip_exponent();
   516  
   517    Token
   518    gather_character();
   519  
   520    Token
   521    gather_string();
   522  
   523    Token
   524    gather_raw_string();
   525  
   526    const char*
   527    advance_one_utf8_char(const char*, unsigned int*, bool*);
   528  
   529    const char*
   530    advance_one_char(const char*, bool, unsigned int*, bool*);
   531  
   532    static bool
   533    is_unicode_digit(unsigned int c);
   534  
   535    static bool
   536    is_unicode_letter(unsigned int c);
   537  
   538    static bool
   539    is_unicode_uppercase(unsigned int c);
   540  
   541    static bool
   542    is_in_unicode_range(unsigned int C, const Unicode_range* ranges,
   543  		      size_t range_size);
   544  
   545    Operator
   546    three_character_operator(char, char, char);
   547  
   548    Operator
   549    two_character_operator(char, char);
   550  
   551    Operator
   552    one_character_operator(char);
   553  
   554    bool
   555    skip_c_comment(bool* found_newline);
   556  
   557    void
   558    skip_cpp_comment();
   559  
   560    void
   561    gather_embed(const char*, const char*);
   562  
   563    // The input file name.
   564    const char* input_file_name_ ATTRIBUTE_UNUSED;
   565    // The input file.
   566    FILE* input_file_;
   567    // The object used to keep track of file names and line numbers.
   568    Linemap* linemap_;
   569    // The line buffer.  This holds the current line.
   570    char* linebuf_;
   571    // The size of the line buffer.
   572    size_t linebufsize_;
   573    // The nmber of characters in the current line.
   574    size_t linesize_;
   575    // The current offset in linebuf_.
   576    size_t lineoff_;
   577    // The current line number.
   578    size_t lineno_;
   579    // Whether to add a semicolon if we see a newline now.
   580    bool add_semi_at_eol_;
   581    // Pragmas for the next function, from magic comments.
   582    unsigned int pragmas_;
   583    // The external name to use for a function declaration, from a magic
   584    // //extern comment.
   585    std::string extern_;
   586    // The list of //go:linkname comments, if any.
   587    Linknames* linknames_;
   588    // The list of //go:embed patterns, if any.
   589    std::vector<std::string> embeds_;
   590  };
   591  
   592  #endif // !defined(GO_LEX_H)