modernc.org/cc@v1.0.1/v2/lexer.go (about)

     1  // Copyright 2017 The CC Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package cc // import "modernc.org/cc/v2"
     6  
     7  // [0]: http://www.open-std.org/jtc1/sc22/wg14/www/docs/n1256.pdf
     8  
     9  import (
    10  	"bufio"
    11  	"fmt"
    12  	"go/token"
    13  	"io"
    14  
    15  	"modernc.org/golex/lex"
    16  	"modernc.org/mathutil"
    17  	"modernc.org/xc"
    18  )
    19  
    20  var (
    21  	noTypedefNameAfter = map[rune]struct{}{
    22  		'*':          {},
    23  		'.':          {},
    24  		ARROW:        {},
    25  		BOOL:         {},
    26  		CHAR:         {},
    27  		COMPLEX:      {},
    28  		DOUBLE:       {},
    29  		ENUM:         {},
    30  		FLOAT:        {},
    31  		GOTO:         {},
    32  		IDENTIFIER:   {},
    33  		INT:          {},
    34  		LONG:         {},
    35  		SHORT:        {},
    36  		SIGNED:       {},
    37  		STRUCT:       {},
    38  		TYPEDEF_NAME: {},
    39  		UNION:        {},
    40  		UNSIGNED:     {},
    41  		VOID:         {},
    42  	}
    43  )
    44  
    45  const (
    46  	intBits  = mathutil.IntBits
    47  	bitShift = intBits>>6 + 5
    48  	bitMask  = intBits - 1
    49  
    50  	scINITIAL = 0 // Start condition (shared value).
    51  )
    52  
    53  const (
    54  	// Character class is an 8 bit encoding of an Unicode rune for the
    55  	// golex generated FSM.
    56  	//
    57  	// Every ASCII rune is its own class.  DO NOT change any of the
    58  	// existing values. Adding new classes is OK.
    59  	ccEOF         = iota + 0x80
    60  	_             // ccError
    61  	ccOther       // Any other rune.
    62  	ccUCNDigit    // [0], Annex D, Universal character names for identifiers - digits.
    63  	ccUCNNonDigit // [0], Annex D, Universal character names for identifiers - non digits.
    64  )
    65  
// trigraphs is the byte-level scanner sitting underneath lexer: lexer.ReadChar
// obtains its input by calling scan on a trigraphs value. It also acts as the
// char source of its own embedded lex.Lexer (see newTrigraphs and ReadChar).
type trigraphs struct {
	*lex.Lexer
	pos token.Pos     // Position given to the next byte handed out by ReadChar.
	r   *bufio.Reader // Buffered reader over the raw input.
	sc  int           // Start condition: scTRIGRAPHS when trigraphs are enabled, scINITIAL otherwise.
}
    72  
    73  func newTrigraphs(ctx *context, file *token.File, r io.Reader) (*trigraphs, error) {
    74  	sc := scINITIAL
    75  	if ctx.tweaks.EnableTrigraphs {
    76  		sc = scTRIGRAPHS
    77  	}
    78  	t := &trigraphs{
    79  		pos: file.Pos(0),
    80  		r:   bufio.NewReader(r),
    81  		sc:  sc,
    82  	}
    83  	lx, err := lex.New(
    84  		file,
    85  		t,
    86  		lex.ErrorFunc(func(pos token.Pos, msg string) { ctx.errPos(pos, msg) }),
    87  		lex.RuneClass(func(r rune) int { return int(r) }),
    88  	)
    89  	if err != nil {
    90  		return nil, err
    91  	}
    92  
    93  	t.Lexer = lx
    94  	return t, nil
    95  }
    96  
    97  func (t *trigraphs) ReadRune() (rune, int, error) { panic("internal error 9") }
    98  
    99  func (t *trigraphs) ReadChar() (c lex.Char, size int, err error) {
   100  	size = 1
   101  	b, err := t.r.ReadByte()
   102  	if err != nil {
   103  		return lex.NewChar(t.pos, rune(b)), 0, err
   104  	}
   105  
   106  	c = lex.NewChar(t.pos, rune(b))
   107  	t.pos++
   108  	return c, 1, nil
   109  }
   110  
// ungetBuffer is a stack of pushed-back tokens: unget and ungets append to
// the end, read removes from the end.
type ungetBuffer []cppToken
   112  
   113  func (u *ungetBuffer) unget(t cppToken) {
   114  	*u = append(*u, t)
   115  }
   116  
   117  func (u *ungetBuffer) read() (t cppToken) {
   118  	s := *u
   119  	n := len(s) - 1
   120  	t = s[n]
   121  	*u = s[:n]
   122  	return t
   123  }
   124  
   125  func (u *ungetBuffer) ungets(toks ...cppToken) {
   126  	s := *u
   127  	for i := len(toks) - 1; i >= 0; i-- {
   128  		s = append(s, toks[i])
   129  	}
   130  	*u = s
   131  }
   132  
// lexer is the token source handed to yyParse (see parse). It embeds the
// shared *context, a *lex.Lexer that scans the chars delivered by ReadChar,
// and an ungetBuffer of pushed-back tokens that lex drains before anything
// else.
type lexer struct {
	*context
	*lex.Lexer
	ast         Node         // NOTE(review): not written in this part of the file - presumably the parse result; confirm.
	attr        [][]xc.Token // Attributes gathered by parseAttrList/parseAttrParams; handed out and cleared by attrs.
	attr2       [][]xc.Token
	commentPos0 token.Pos
	currFn      *Declarator // [0]6.4.2.2
	last        lex.Char    // Most recent char produced by cppScan or scanChar.
	mode        int         // CONSTANT_EXPRESSION, TRANSLATION_UNIT
	prev        xc.Token    // Most recent result returned by Lex
	sc          int
	ssave       *Scope
	t           *trigraphs // Scanner that ReadChar pulls input chars from.
	tc          *tokenPipe // When non-nil, lex reads tokens from it instead of scanning.

	noTypedefName bool // Do not consider next token a TYPEDEF_NAME
	typedef       bool // Prev token returned was TYPEDEF_NAME

	ungetBuffer
}
   154  
   155  func newLexer(ctx *context, nm string, sz int, r io.Reader) (*lexer, error) {
   156  	file := fset.AddFile(nm, -1, sz)
   157  	t, err := newTrigraphs(ctx, file, r)
   158  	if err != nil {
   159  		return nil, err
   160  	}
   161  
   162  	l := &lexer{
   163  		context: ctx,
   164  		t:       t,
   165  	}
   166  
   167  	lx, err := lex.New(
   168  		file,
   169  		l,
   170  		lex.ErrorFunc(func(pos token.Pos, msg string) { l.errPos(pos, msg) }),
   171  		lex.RuneClass(rune2class),
   172  	)
   173  	if err != nil {
   174  		return nil, err
   175  	}
   176  
   177  	l.Lexer = lx
   178  	return l, nil
   179  }
   180  
   181  func (l *lexer) Error(msg string)             { l.err(l.First, "%v", msg) }
   182  func (l *lexer) ReadRune() (rune, int, error) { panic("internal error 10") }
   183  func (l *lexer) comment(general bool)         { /*TODO*/ }
   184  func (l *lexer) parseExpr() bool              { return l.parse(CONSTANT_EXPRESSION) }
   185  
// Lex fetches the next token into lval and returns its rune as an int.
//
// The token comes from l.lex and its rune is first mapped through l.toC. It
// is then adjusted as follows:
//
//   - '(' that starts exactly len("_Atomic") positions after a preceding
//     ATOMIC token becomes ATOMIC_LPAREN.
//   - NON_REPL is turned into IDENTIFIER and handled like one.
//   - An IDENTIFIER whose value is idAttribute is consumed, together with its
//     parenthesized argument list, by parseAttr, and scanning restarts.
//   - Any other IDENTIFIER becomes TYPEDEF_NAME when the scope knows it as a
//     typedef and none of the guards below vetoes the reclassification.
//   - PPNUMBER becomes INTCONST or FLOATCONST.
//   - ccEOF becomes lex.RuneEOF with a zero Val.
//
// The returned token is remembered in l.prev.
func (l *lexer) Lex(lval *yySymType) (r int) {
more:
	//TODO use follow set to recover from errors.
	l.lex(lval)
	lval.Token.Rune = l.toC(lval.Token.Rune, lval.Token.Val)
	// Both flags apply to exactly one token: take a snapshot for this call
	// and clear them for the next one.
	typedef := l.typedef
	l.typedef = false
	noTypedefName := l.noTypedefName
	l.noTypedefName = false
	switch lval.Token.Rune {
	case '(':
		// Only a parenthesis that starts directly after the "_Atomic"
		// keyword text, with nothing in between, is ATOMIC_LPAREN.
		if l.prev.Rune == ATOMIC && l.prev.Pos()+token.Pos(len("_Atomic")) == lval.Token.Pos() {
			lval.Token.Rune = ATOMIC_LPAREN
		}
	case NON_REPL:
		lval.Token.Rune = IDENTIFIER
		fallthrough
	case IDENTIFIER:
		if lval.Token.Val == idAttribute {
			// Attributes collected earlier must have been picked up
			// (see attrs) before another attribute list starts.
			if len(l.attr) != 0 {
				panic(fmt.Errorf("%v:", l.position(lval.Token)))
			}

			l.attr = nil
			l.parseAttr(lval)
			// The attribute is not passed to the parser; fetch the
			// token that follows it.
			goto more
		}

		// No reclassification when it was explicitly suppressed, when
		// the previous token already was a TYPEDEF_NAME, or when the
		// table entry for lval.yys is false.
		if noTypedefName || typedef || !followSetHasTypedefName[lval.yys] {
			break
		}

		if _, ok := noTypedefNameAfter[l.prev.Rune]; ok {
			break
		}

		if l.scope.isTypedef(lval.Token.Val) {
			// https://en.wikipedia.org/wiki/The_lexer_hack
			lval.Token.Rune = TYPEDEF_NAME
			l.typedef = true
		}
	case PPNUMBER:
		lval.Token.Rune = INTCONST
		val := dict.S(lval.Token.Val)
		// Numbers without a 0x/0X prefix are floating constants as soon
		// as they contain a '.', a sign or an exponent letter.
		// NOTE(review): hex-prefixed numbers are never classified as
		// FLOATCONST here, so a hex float such as 0x1p3 stays INTCONST -
		// confirm this is intended.
		if !(len(val) > 1 && val[0] == '0' && (val[1] == 'x' || val[1] == 'X')) {
			for _, v := range val {
				switch v {
				case '.', '+', '-', 'e', 'E', 'p', 'P':
					lval.Token.Rune = FLOATCONST
				}
			}
		}
	case ccEOF:
		lval.Token.Rune = lex.RuneEOF
		lval.Token.Val = 0
	}

	// The token following FOR opens a new scope. That scope remembers in
	// forStmtEndScope the scope that was current before it, or the value
	// already recorded there when one is set.
	if l.prev.Rune == FOR {
		s := l.scope.forStmtEndScope
		if s == nil {
			s = l.scope
		}
		l.newScope().forStmtEndScope = s
	}
	l.prev = lval.Token
	return int(l.prev.Rune)
}
   253  
   254  func (l *lexer) attrs() (r [][]xc.Token) {
   255  	l.attr, r = nil, l.attr
   256  	return r
   257  }
   258  
   259  func (l *lexer) parseAttr(lval *yySymType) {
   260  	l.lex(lval)
   261  	if lval.Token.Rune != '(' {
   262  		panic("TODO")
   263  	}
   264  
   265  	l.lex(lval)
   266  	if lval.Token.Rune != '(' {
   267  		panic("TODO")
   268  	}
   269  
   270  	l.parseAttrList(lval)
   271  	l.lex(lval)
   272  	if lval.Token.Rune != ')' {
   273  		panic("TODO")
   274  	}
   275  
   276  	l.lex(lval)
   277  	if lval.Token.Rune != ')' {
   278  		panic("TODO")
   279  	}
   280  }
   281  
   282  func (l *lexer) parseAttrList(lval *yySymType) {
   283  	for {
   284  		l.lex(lval)
   285  		switch t := lval.Token; t.Rune {
   286  		case IDENTIFIER:
   287  			l.attr = append(l.attr, []xc.Token{t})
   288  		case ')':
   289  			l.unget(cppToken{Token: t})
   290  			return
   291  		case '(':
   292  			l.parseAttrParams(lval)
   293  		case ',':
   294  			// ok
   295  		default:
   296  			panic(fmt.Errorf("%v: %v", l.position(lval.Token), PrettyString(lval.Token)))
   297  		}
   298  	}
   299  }
   300  
   301  func (l *lexer) parseAttrParams(lval *yySymType) {
   302  	for {
   303  		l.lex(lval)
   304  		switch t := lval.Token; t.Rune {
   305  		case IDENTIFIER, STRINGLITERAL:
   306  			n := len(l.attr)
   307  			l.attr[n-1] = append(l.attr[n-1], t)
   308  		case ')':
   309  			return
   310  		default:
   311  			panic(fmt.Errorf("%v: %v", l.position(lval.Token), PrettyString(lval.Token)))
   312  		}
   313  	}
   314  }
   315  
   316  func (l *lexer) ReadChar() (c lex.Char, size int, err error) {
   317  	if c = l.t.Lookahead(); c.Rune == lex.RuneEOF {
   318  		return c, 0, io.EOF
   319  	}
   320  
   321  	ch := l.t.scan()
   322  	return lex.NewChar(l.t.First.Pos(), rune(ch)), 1, nil
   323  }
   324  
   325  func (l *lexer) Reduced(rule, state int, lval *yySymType) (stop bool) {
   326  	if rule != l.exampleRule {
   327  		return false
   328  	}
   329  
   330  	switch x := lval.node.(type) {
   331  	case interface {
   332  		fragment() interface{}
   333  	}:
   334  		l.exampleAST = x.fragment()
   335  	default:
   336  		l.exampleAST = x
   337  	}
   338  	return true
   339  }
   340  
   341  func (l *lexer) cppScan() lex.Char {
   342  again:
   343  	r := l.scan()
   344  	if r == ' ' && l.last.Rune == ' ' {
   345  		goto again
   346  	}
   347  
   348  	l.last = lex.NewChar(l.First.Pos(), rune(r))
   349  	return l.last
   350  }
   351  
   352  func (l *lexer) lex(lval *yySymType) {
   353  	if len(l.ungetBuffer) != 0 {
   354  		lval.Token = l.ungetBuffer.read().Token
   355  		return
   356  	}
   357  
   358  	if l.tc != nil {
   359  		lval.Token = l.tc.read().Token
   360  		l.First = lval.Token.Char
   361  		return
   362  	}
   363  
   364  	ch := l.scanChar()
   365  	lval.Token = xc.Token{Char: ch}
   366  	if _, ok := tokHasVal[ch.Rune]; ok {
   367  		lval.Token = xc.Token{Char: ch, Val: dict.ID(l.TokenBytes(nil))}
   368  	}
   369  }
   370  
   371  // static const char __func__[] = "function-name"; // [0], 6.4.2.2.
   372  func (l *lexer) declareFuncName() {
   373  	pos := l.First.Pos() // '{'
   374  	l.ungets(
   375  		cppToken{Token: xc.Token{Char: lex.NewChar(pos, STATIC), Val: idStatic}},
   376  		cppToken{Token: xc.Token{Char: lex.NewChar(pos, CONST), Val: idConst}},
   377  		cppToken{Token: xc.Token{Char: lex.NewChar(pos, CHAR), Val: idChar}},
   378  		cppToken{Token: xc.Token{Char: lex.NewChar(pos, IDENTIFIER), Val: idFuncName}},
   379  		cppToken{Token: xc.Token{Char: lex.NewChar(pos, '[')}},
   380  		cppToken{Token: xc.Token{Char: lex.NewChar(pos, ']')}},
   381  		cppToken{Token: xc.Token{Char: lex.NewChar(pos, '=')}},
   382  		cppToken{Token: xc.Token{Char: lex.NewChar(pos, STRINGLITERAL), Val: dict.SID(`"` + string(dict.S(l.currFn.Name())) + `"`)}},
   383  		cppToken{Token: xc.Token{Char: lex.NewChar(pos, ';')}},
   384  	)
   385  }
   386  
   387  func (l *lexer) insertParamNames() {
   388  	if l.currFn == nil {
   389  		return
   390  	}
   391  
   392  	defer func() { l.currFn = nil }()
   393  
   394  	fp := l.currFn.fpScope(l.context)
   395  	if fp == nil {
   396  		return
   397  	}
   398  
   399  	for k, v := range fp.typedefs {
   400  		l.scope.insertTypedef(l.context, k, v)
   401  	}
   402  }
   403  
   404  func (l *lexer) parse(mode int) bool {
   405  	var tok xc.Token
   406  	tok.Rune = rune(mode)
   407  	l.ungetBuffer = append(l.ungetBuffer, cppToken{Token: tok})
   408  	l.mode = mode
   409  	l.last.Rune = '\n'
   410  	return yyParse(l) == 0
   411  }
   412  
   413  func (l *lexer) scanChar() (c lex.Char) {
   414  again:
   415  	r := l.scan()
   416  	if r == ' ' {
   417  		goto again
   418  	}
   419  
   420  	l.last = lex.NewChar(l.First.Pos(), rune(r))
   421  	switch r {
   422  	case CONSTANT_EXPRESSION, TRANSLATION_UNIT:
   423  		l.mode = r
   424  	}
   425  	return l.last
   426  }
   427  
   428  func (l *lexer) fixDeclarator(n Node) {
   429  	if dd := n.(*DirectDeclarator); dd.Case == DirectDeclaratorParen {
   430  		nm := dd.Declarator.Name()
   431  		//dbg("removing %q from %p", dict.S(nm), l.scope.Parent)
   432  		delete(l.scope.Parent.typedefs, nm)
   433  		l.scope.fixDecl = nm
   434  	}
   435  }
   436  
   437  func (l *lexer) postFixDeclarator(ctx *context) {
   438  	if nm := l.scope.fixDecl; nm != 0 {
   439  		//dbg("reinserting %q into %p", dict.S(nm), l.scope.Parent)
   440  		l.scope.Parent.insertTypedef(ctx, nm, false)
   441  	}
   442  }