modernc.org/cc@v1.0.1/v2/lexer.go

// Copyright 2017 The CC Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package cc // import "modernc.org/cc/v2"

// [0]: http://www.open-std.org/jtc1/sc22/wg14/www/docs/n1256.pdf

import (
	"bufio"
	"fmt"
	"go/token"
	"io"

	"modernc.org/golex/lex"
	"modernc.org/mathutil"
	"modernc.org/xc"
)

var (
	noTypedefNameAfter = map[rune]struct{}{
		'*':          {},
		'.':          {},
		ARROW:        {},
		BOOL:         {},
		CHAR:         {},
		COMPLEX:      {},
		DOUBLE:       {},
		ENUM:         {},
		FLOAT:        {},
		GOTO:         {},
		IDENTIFIER:   {},
		INT:          {},
		LONG:         {},
		SHORT:        {},
		SIGNED:       {},
		STRUCT:       {},
		TYPEDEF_NAME: {},
		UNION:        {},
		UNSIGNED:     {},
		VOID:         {},
	}
)

const (
	intBits  = mathutil.IntBits
	bitShift = intBits>>6 + 5
	bitMask  = intBits - 1

	scINITIAL = 0 // Start condition (shared value).
)

const (
	// Character class is an 8-bit encoding of a Unicode rune for the
	// golex generated FSM.
	//
	// Every ASCII rune is its own class. DO NOT change any of the
	// existing values. Adding new classes is OK.
	ccEOF = iota + 0x80
	_             // ccError
	ccOther       // Any other rune.
	ccUCNDigit    // [0], Annex D, Universal character names for identifiers - digits.
	ccUCNNonDigit // [0], Annex D, Universal character names for identifiers - non-digits.
)

type trigraphs struct {
	*lex.Lexer
	pos token.Pos
	r   *bufio.Reader
	sc  int
}

func newTrigraphs(ctx *context, file *token.File, r io.Reader) (*trigraphs, error) {
	sc := scINITIAL
	if ctx.tweaks.EnableTrigraphs {
		sc = scTRIGRAPHS
	}
	t := &trigraphs{
		pos: file.Pos(0),
		r:   bufio.NewReader(r),
		sc:  sc,
	}
	lx, err := lex.New(
		file,
		t,
		lex.ErrorFunc(func(pos token.Pos, msg string) { ctx.errPos(pos, msg) }),
		lex.RuneClass(func(r rune) int { return int(r) }),
	)
	if err != nil {
		return nil, err
	}

	t.Lexer = lx
	return t, nil
}

func (t *trigraphs) ReadRune() (rune, int, error) { panic("internal error 9") }

func (t *trigraphs) ReadChar() (c lex.Char, size int, err error) {
	size = 1
	b, err := t.r.ReadByte()
	if err != nil {
		return lex.NewChar(t.pos, rune(b)), 0, err
	}

	c = lex.NewChar(t.pos, rune(b))
	t.pos++
	return c, 1, nil
}

type ungetBuffer []cppToken

func (u *ungetBuffer) unget(t cppToken) {
	*u = append(*u, t)
}

func (u *ungetBuffer) read() (t cppToken) {
	s := *u
	n := len(s) - 1
	t = s[n]
	*u = s[:n]
	return t
}

func (u *ungetBuffer) ungets(toks ...cppToken) {
	s := *u
	for i := len(toks) - 1; i >= 0; i-- {
		s = append(s, toks[i])
	}
	*u = s
}

type lexer struct {
	*context
	*lex.Lexer
	ast         Node
	attr        [][]xc.Token
	attr2       [][]xc.Token
	commentPos0 token.Pos
	currFn      *Declarator // [0]6.4.2.2
	last        lex.Char
	mode        int      // CONSTANT_EXPRESSION, TRANSLATION_UNIT
	prev        xc.Token // Most recent result returned by Lex
	sc          int
	ssave       *Scope
	t           *trigraphs
	tc          *tokenPipe

	noTypedefName bool // Do not consider next token a TYPEDEF_NAME
	typedef       bool // Prev token returned was TYPEDEF_NAME

	ungetBuffer
}

func newLexer(ctx *context, nm string, sz int, r io.Reader) (*lexer, error) {
	file := fset.AddFile(nm, -1, sz)
	t, err := newTrigraphs(ctx, file, r)
	if err != nil {
		return nil, err
	}

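	// The lexer pulls its characters through the trigraph stage created
	// above; see (*lexer).ReadChar below.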
	l := &lexer{
		context: ctx,
		t:       t,
	}

	lx, err := lex.New(
		file,
		l,
		lex.ErrorFunc(func(pos token.Pos, msg string) { l.errPos(pos, msg) }),
		lex.RuneClass(rune2class),
	)
	if err != nil {
		return nil, err
	}

	l.Lexer = lx
	return l, nil
}

func (l *lexer) Error(msg string)             { l.err(l.First, "%v", msg) }
func (l *lexer) ReadRune() (rune, int, error) { panic("internal error 10") }
func (l *lexer) comment(general bool)         { /*TODO*/ }
func (l *lexer) parseExpr() bool              { return l.parse(CONSTANT_EXPRESSION) }

func (l *lexer) Lex(lval *yySymType) (r int) {
more:
	//TODO use follow set to recover from errors.
	l.lex(lval)
	lval.Token.Rune = l.toC(lval.Token.Rune, lval.Token.Val)
	typedef := l.typedef
	l.typedef = false
	noTypedefName := l.noTypedefName
	l.noTypedefName = false
	switch lval.Token.Rune {
	case '(':
		if l.prev.Rune == ATOMIC && l.prev.Pos()+token.Pos(len("_Atomic")) == lval.Token.Pos() {
			lval.Token.Rune = ATOMIC_LPAREN
		}
	case NON_REPL:
		lval.Token.Rune = IDENTIFIER
		fallthrough
	case IDENTIFIER:
		if lval.Token.Val == idAttribute {
			if len(l.attr) != 0 {
				panic(fmt.Errorf("%v:", l.position(lval.Token)))
			}

			l.attr = nil
			l.parseAttr(lval)
			goto more
		}

		if noTypedefName || typedef || !followSetHasTypedefName[lval.yys] {
			break
		}

		if _, ok := noTypedefNameAfter[l.prev.Rune]; ok {
			break
		}

		if l.scope.isTypedef(lval.Token.Val) {
			// https://en.wikipedia.org/wiki/The_lexer_hack
			lval.Token.Rune = TYPEDEF_NAME
			l.typedef = true
		}
	case PPNUMBER:
		lval.Token.Rune = INTCONST
		val := dict.S(lval.Token.Val)
		if !(len(val) > 1 && val[0] == '0' && (val[1] == 'x' || val[1] == 'X')) {
			for _, v := range val {
				switch v {
				case '.', '+', '-', 'e', 'E', 'p', 'P':
					lval.Token.Rune = FLOATCONST
				}
			}
		}
	case ccEOF:
		lval.Token.Rune = lex.RuneEOF
		lval.Token.Val = 0
	}

	if l.prev.Rune == FOR {
		s := l.scope.forStmtEndScope
		if s == nil {
			s = l.scope
		}
		l.newScope().forStmtEndScope = s
	}
	l.prev = lval.Token
	return int(l.prev.Rune)
}

func (l *lexer) attrs() (r [][]xc.Token) {
	l.attr, r = nil, l.attr
	return r
}

func (l *lexer) parseAttr(lval *yySymType) {
	l.lex(lval)
	if lval.Token.Rune != '(' {
		panic("TODO")
	}

	l.lex(lval)
	if lval.Token.Rune != '(' {
		panic("TODO")
	}

	l.parseAttrList(lval)
	l.lex(lval)
	if lval.Token.Rune != ')' {
		panic("TODO")
	}

	l.lex(lval)
	if lval.Token.Rune != ')' {
		panic("TODO")
	}
}

func (l *lexer) parseAttrList(lval *yySymType) {
	for {
		l.lex(lval)
		switch t := lval.Token; t.Rune {
		case IDENTIFIER:
			l.attr = append(l.attr, []xc.Token{t})
		case ')':
			l.unget(cppToken{Token: t})
			return
		case '(':
			l.parseAttrParams(lval)
		case ',':
			// ok
		default:
			panic(fmt.Errorf("%v: %v", l.position(lval.Token), PrettyString(lval.Token)))
		}
	}
}

func (l *lexer) parseAttrParams(lval *yySymType) {
	for {
		l.lex(lval)
		switch t := lval.Token; t.Rune {
		case IDENTIFIER, STRINGLITERAL:
			n := len(l.attr)
			l.attr[n-1] = append(l.attr[n-1], t)
		case ')':
			return
		default:
			panic(fmt.Errorf("%v: %v", l.position(lval.Token), PrettyString(lval.Token)))
		}
	}
}

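// ReadChar supplies the next input character to the golex-generated FSM,
// reading it from the trigraph stage; ReadRune above is never used and
// panics if called.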
func (l *lexer) ReadChar() (c lex.Char, size int, err error) {
	if c = l.t.Lookahead(); c.Rune == lex.RuneEOF {
		return c, 0, io.EOF
	}

	ch := l.t.scan()
	return lex.NewChar(l.t.First.Pos(), rune(ch)), 1, nil
}

func (l *lexer) Reduced(rule, state int, lval *yySymType) (stop bool) {
	if rule != l.exampleRule {
		return false
	}

	switch x := lval.node.(type) {
	case interface {
		fragment() interface{}
	}:
		l.exampleAST = x.fragment()
	default:
		l.exampleAST = x
	}
	return true
}

func (l *lexer) cppScan() lex.Char {
again:
	r := l.scan()
	if r == ' ' && l.last.Rune == ' ' {
		goto again
	}

	l.last = lex.NewChar(l.First.Pos(), rune(r))
	return l.last
}

func (l *lexer) lex(lval *yySymType) {
	if len(l.ungetBuffer) != 0 {
		lval.Token = l.ungetBuffer.read().Token
		return
	}

	if l.tc != nil {
		lval.Token = l.tc.read().Token
		l.First = lval.Token.Char
		return
	}

	ch := l.scanChar()
	lval.Token = xc.Token{Char: ch}
	if _, ok := tokHasVal[ch.Rune]; ok {
		lval.Token = xc.Token{Char: ch, Val: dict.ID(l.TokenBytes(nil))}
	}
}

// static const char __func__[] = "function-name"; // [0], 6.4.2.2.
func (l *lexer) declareFuncName() {
	pos := l.First.Pos() // '{'
	l.ungets(
		cppToken{Token: xc.Token{Char: lex.NewChar(pos, STATIC), Val: idStatic}},
		cppToken{Token: xc.Token{Char: lex.NewChar(pos, CONST), Val: idConst}},
		cppToken{Token: xc.Token{Char: lex.NewChar(pos, CHAR), Val: idChar}},
		cppToken{Token: xc.Token{Char: lex.NewChar(pos, IDENTIFIER), Val: idFuncName}},
		cppToken{Token: xc.Token{Char: lex.NewChar(pos, '[')}},
		cppToken{Token: xc.Token{Char: lex.NewChar(pos, ']')}},
		cppToken{Token: xc.Token{Char: lex.NewChar(pos, '=')}},
		cppToken{Token: xc.Token{Char: lex.NewChar(pos, STRINGLITERAL), Val: dict.SID(`"` + string(dict.S(l.currFn.Name())) + `"`)}},
		cppToken{Token: xc.Token{Char: lex.NewChar(pos, ';')}},
	)
}

func (l *lexer) insertParamNames() {
	if l.currFn == nil {
		return
	}

	defer func() { l.currFn = nil }()

	fp := l.currFn.fpScope(l.context)
	if fp == nil {
		return
	}

	for k, v := range fp.typedefs {
		l.scope.insertTypedef(l.context, k, v)
	}
}

func (l *lexer) parse(mode int) bool {
	var tok xc.Token
	tok.Rune = rune(mode)
	l.ungetBuffer = append(l.ungetBuffer, cppToken{Token: tok})
	l.mode = mode
	l.last.Rune = '\n'
	return yyParse(l) == 0
}

func (l *lexer) scanChar() (c lex.Char) {
again:
	r := l.scan()
	if r == ' ' {
		goto again
	}

	l.last = lex.NewChar(l.First.Pos(), rune(r))
	switch r {
	case CONSTANT_EXPRESSION, TRANSLATION_UNIT:
		l.mode = r
	}
	return l.last
}

func (l *lexer) fixDeclarator(n Node) {
	if dd := n.(*DirectDeclarator); dd.Case == DirectDeclaratorParen {
		nm := dd.Declarator.Name()
		//dbg("removing %q from %p", dict.S(nm), l.scope.Parent)
		delete(l.scope.Parent.typedefs, nm)
		l.scope.fixDecl = nm
	}
}

func (l *lexer) postFixDeclarator(ctx *context) {
	if nm := l.scope.fixDecl; nm != 0 {
		//dbg("reinserting %q into %p", dict.S(nm), l.scope.Parent)
		l.scope.Parent.insertTypedef(ctx, nm, false)
	}
}