github.com/openconfig/goyang@v1.4.5/pkg/yang/lex.go

// Copyright 2015 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package yang

// This file implements the lexical tokenization of YANG. The lexer returns
// a series of tokens, each with one of the following codes:
//
//	tError    // an error was encountered
//	tEOF      // end-of-file
//	tString   // a de-quoted string (e.g., "\"bob\"" becomes "bob")
//	tUnquoted // an un-quoted string
//	'{'
//	';'
//	'}'

import (
	"bytes"
	"fmt"
	"io"
	"os"
	"reflect"
	"runtime"
	"strings"
	"unicode/utf8"
)

const (
	eof       = 0x7fffffff // end of file, also an invalid rune
	maxErrors = 8
	tooMany   = "too many errors...\n"
)

// stateFn represents a state in the lexer as a function that returns the
// next state the lexer should move to.
type stateFn func(*lexer) stateFn

// A lexer holds the internal state of the lexer.
type lexer struct {
	errout io.Writer // destination for errors, defaults to os.Stderr
	errcnt int       // number of errors encountered

	file  string // name of the file we are processing
	input string // contents of the file
	start int    // start position in input of unconsumed data
	pos   int    // current position in the input
	line  int    // the current line number (1-based)
	col   int    // the current column number (0-based, add 1 before displaying)

	debug     bool        // set to true to include internal debugging
	inPattern bool        // set when parsing the argument to a pattern
	items     chan *token // channel of scanned items
	tcol      int         // column with tabs expanded (for multi-line strings)
	scol      int         // starting column of the current token
	sline     int         // starting line of the current token
	state     stateFn     // current state of the lexer
	width     int         // width of the last rune read from input
}

// A code is a token code. Single-character tokens (i.e., punctuation)
// are represented by their Unicode code point.
type code int

const (
	tEOF      = code(-1 - iota) // Reached end of file
	tError                      // An error
	tString                     // A de-quoted string
	tUnquoted                   // A non-quoted string
)

// String returns c as a string.
func (c code) String() string {
	switch c {
	case tError:
		return "Error"
	case tString:
		return "String"
	case tUnquoted:
		return "Unquoted"
	}
	if c < 0 || c > '~' {
		return fmt.Sprintf("%d", c)
	}
	return fmt.Sprintf("'%c'", c)
}

// A token represents one lexical unit read from the input.
// Line and Col are both 1-based.
type token struct {
	code code
	Text string // the actual text of the token
	File string // the source file the token is from
	Line int    // the source line number the token is from
	Col  int    // the source column number the token is from (8-space tabs)
}

// Code returns the code of t. If t is nil, tEOF is returned.
func (t *token) Code() code {
	if t == nil {
		return tEOF
	}
	return t.code
}

// String returns the location, code, and text of t as a string.
func (t *token) String() string {
	var s []string
	if t.File != "" {
		s = append(s, t.File+":")
	}
	if t.Line != 0 {
		s = append(s, fmt.Sprintf("%d:%d:", t.Line, t.Col))
	}
	if t.Text == "" {
		s = append(s, fmt.Sprintf(" %v", t.code))
	} else {
		s = append(s, " ", t.Text)
	}
	return strings.Join(s, "")
}
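// The sketch below is an editorial illustration, not part of goyang: it
// shows how token.String formats a token, prefixing the text with "file:"
// and "line:col:" when they are set. The token literal and the
// sketchTokenString name are hypothetical.
func sketchTokenString() string {
	t := &token{code: tString, Text: "bob", File: "a.yang", Line: 3, Col: 5}
	return t.String() // "a.yang:3:5: bob"
}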
// A note on writing to errout: errors should always be written to errout
// in a single Write call. The test code makes this assumption when checking
// for expected errors.

// newLexer returns a new lexer that reads from the provided input.
// The provided path should indicate where the source originated.
func newLexer(input, path string) *lexer {
	// Force input to be newline terminated.
	if len(input) > 0 && input[len(input)-1] != '\n' {
		input += "\n"
	}
	return &lexer{
		file:   path,
		input:  input,
		line:   1, // humans start with 1
		items:  make(chan *token, maxErrors),
		state:  lexGround,
		errout: os.Stderr,
	}
}

// NextToken returns the next token from the input, returning nil on EOF.
func (l *lexer) NextToken() *token {
	for {
		select {
		case item := <-l.items:
			return item
		default:
			if l.state == nil {
				return nil
			}
			if l.debug {
				name := runtime.FuncForPC(reflect.ValueOf(l.state).Pointer()).Name()
				name = name[strings.LastIndex(name, ".")+1:]
				name = strings.TrimPrefix(name, "lex")
				input := l.input[l.pos:]
				if len(input) > 8 {
					input = input[:8] + "..."
				}
				fmt.Fprintf(os.Stderr, "%d:%d: state %s %q\n", l.line, l.col+1, name, input)
			}
			l.state = l.state(l)
		}
	}
}

// emit emits the currently parsed token marked with code c using emitText.
func (l *lexer) emit(c code) {
	l.emitText(c, l.input[l.start:l.pos])
}

// emitText emits text as a token marked with c.
// All input up to the current cursor (pos) is consumed.
func (l *lexer) emitText(c code, text string) {
	if l.debug {
		fmt.Fprintf(os.Stderr, "%v: %q\n", c, text)
	}
	select {
	case l.items <- &token{
		code: c,
		Text: text,
		File: l.file,
		Line: l.sline,
		Col:  l.scol + 1,
	}:
	default:
	}
	l.consume()
}

// consume consumes all input up to the current cursor.
func (l *lexer) consume() {
	l.start = l.pos
}

// backup steps back one rune. It can be called only immediately after a call
// to next. Backing up over a tab will set tcol to the last position of the
// tab, not where the tab started. This is okay: when we call next again, it
// will move tcol back to where it was before backup was called.
func (l *lexer) backup() {
	l.pos -= l.width
	if l.width > 0 {
		l.col--
		l.tcol--
		if l.col < 0 {
			// We must have backed up over a newline.
			// Don't bother to figure out the column number,
			// as the next call to next will reset it to 0.
			l.line--
			l.col = 0
			l.tcol = 0
		}
	}
}

// peek returns, but does not move past, the next rune in the input. backup
// is not supported over peeked characters.
func (l *lexer) peek() rune {
	rune := l.next()
	l.backup()
	return rune
}

// next returns the next rune in the input. If next encounters the end of the
// input, it returns eof.
func (l *lexer) next() (rune rune) {
	if l.pos >= len(l.input) {
		l.width = 0
		return eof
	}
	// l.width is what limits us to no more than a single backup.
	rune, l.width = utf8.DecodeRuneInString(l.input[l.pos:])
	l.pos += l.width
	switch rune {
	case '\n':
		l.line++
		l.col = 0
		l.tcol = 0
	case '\t':
		l.tcol = (l.tcol + 8) & ^7
		l.col++ // should this be l.width?
	default:
		l.tcol++
		l.col++ // should this be l.width?
	}
	return rune
}
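// The sketch below is an editorial illustration, not part of goyang: it
// drives the lexer by hand with newLexer and NextToken, printing each token
// until NextToken returns nil at EOF. The sketchScan name and the input
// fragment are hypothetical.
func sketchScan() {
	l := newLexer(`container foo { presence "x"; }`, "example.yang")
	for t := l.NextToken(); t != nil; t = l.NextToken() {
		// Emits tUnquoted("container"), tUnquoted("foo"), '{',
		// tUnquoted("presence"), tString("x"), ';', and '}'.
		fmt.Printf("%v %q\n", t.Code(), t.Text)
	}
}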
// acceptRun moves the cursor forward up to, but not including, the first rune
// not found in the valid set. It returns true if any runes were accepted.
func (l *lexer) acceptRun(valid string) bool {
	ret := false
	for strings.ContainsRune(valid, l.next()) {
		ret = true
	}
	l.backup()
	return ret
}

// skipTo moves the cursor forward up to, but not including, s.
// It returns whether s was found in the remaining input.
func (l *lexer) skipTo(s string) bool {
	if x := strings.Index(l.input[l.pos:], s); x >= 0 {
		l.updateCursor(x)
		return true
	}
	return false
}

// updateCursor moves the cursor forward n bytes. updateCursor does not
// correctly handle tabs. This is okay, as it is only used by skipTo, and
// skipTo is never used to skip to an initial " (which is the only time that
// tcol is necessary, as per YANG's multi-line quoted string requirement).
func (l *lexer) updateCursor(n int) {
	s := l.input[l.pos : l.pos+n]
	l.pos += n
	// We could get away without updating width at all, because backup is
	// only promised to work after a call to next.
	l.width = n

	if c := strings.Count(s, "\n"); c > 0 {
		l.line += c
		l.col = 0
	}
	l.col += utf8.RuneCountInString(s[strings.LastIndex(s, "\n")+1:])
}

// Errorf writes an error to l.errout and increments the error count.
// If too many errors (maxErrors) are encountered, lexing stops and
// eof is returned as the next token.
func (l *lexer) Errorf(f string, v ...interface{}) {
	buf := &bytes.Buffer{}

	if l.debug {
		// For internal debugging, print the file and line number
		// of the call to Errorf.
		_, name, line, _ := runtime.Caller(1)

		fmt.Fprintf(buf, "%s:%d: ", name, line)
	}
	fmt.Fprintf(buf, "%s:%d:%d: ", l.file, l.line, l.col+1)
	fmt.Fprintf(buf, f, v...)
	b := buf.Bytes()
	if b[len(b)-1] != '\n' {
		buf.Write([]byte{'\n'})
	}
	l.emit(tError)
	l.adderror(buf.Bytes())
}

// ErrorfAt is like Errorf, but reports the error at the provided line and
// column rather than at the lexer's current position.
func (l *lexer) ErrorfAt(line, col int, f string, v ...interface{}) {
	oline, ocol := l.line, l.col
	defer func() {
		l.line, l.col = oline, ocol
	}()
	l.line, l.col = line, col
	l.Errorf(f, v...)
}

// adderror writes out the error string err and increments the error count.
// If more than maxErrors are encountered, a "too many errors" message is
// displayed and processing stops (by clearing the input).
func (l *lexer) adderror(err []byte) {
	if l.errcnt == maxErrors {
		l.pos = 0
		l.start = 0
		l.input = ""
		l.errout.Write([]byte(tooMany))
		l.errcnt++
		return
	} else if l.errcnt == maxErrors+1 {
		return
	}
	l.errout.Write(err)
	l.errcnt++
}
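// The helper below is an editorial illustration, not part of goyang: it
// isolates the tab arithmetic used by next, which advances the expanded
// column tcol to the next multiple of 8, per YANG's rule that a tab is
// treated as 8 space characters. The nextTabStop name is hypothetical.
func nextTabStop(tcol int) int {
	// ANDing with ^7 clears the low three bits, rounding tcol+8 down to
	// a multiple of 8: nextTabStop(0) == 8, nextTabStop(3) == 8,
	// nextTabStop(8) == 16.
	return (tcol + 8) & ^7
}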
// Below are all the states.

// lexGround is the state when the lexer is not in the middle of a token. The
// ground state is left once the start of a token is found. Pure comment lines
// leave the lexer in the ground state.
func lexGround(l *lexer) stateFn {
	l.acceptRun(" \t\r\n") // Skip leading whitespace.
	l.consume()
	l.sline = l.line
	l.scol = l.col

	switch c := l.peek(); c {
	case eof:
		return nil
	case ';', '{', '}':
		l.next()
		l.emit(code(c))
		return lexGround
	case '\'':
		l.next()
		l.consume() // Toss the leading '
		if !l.skipTo("'") {
			l.ErrorfAt(l.line, l.col-1, `missing closing '`)
			return nil
		}
		l.emit(tString)
		l.next() // Either EOF or the matching '
		return lexGround
	case '"':
		l.next()
		return lexQString
	case '/':
		l.next()
		switch l.peek() {
		case '/':
			// Start of a // comment.
			if !l.skipTo("\n") {
				// Here "\n" should always be found, since we force all
				// input to be "\n" terminated.
				l.ErrorfAt(l.line, l.col-1, `lexer internal error: all lines should be newline-terminated.`)
				return nil
			}
			return lexGround
		case '*':
			// Start of a /* comment.
			if !l.skipTo("*/") {
				l.ErrorfAt(l.line, l.col-1, `missing closing */`)
				return nil
			}
			// Now actually skip the */.
			l.next()
			l.next()
			return lexGround
		default:
			return lexUnquoted
		}
	case '+':
		l.next()
		switch l.peek() {
		case '"', '\'':
			l.emit(tUnquoted)
			return lexGround
		default:
			return lexUnquoted
		}
	default:
		return lexUnquoted
	}
}
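// The sketch below is an editorial illustration, not part of goyang: it
// shows that lexGround does not itself join YANG string concatenations.
// For `"foo" + "bar"` it emits tString("foo"), tUnquoted("+"), and
// tString("bar"), leaving the actual concatenation to the parser. The
// sketchConcat name is hypothetical.
func sketchConcat() []*token {
	l := newLexer(`"foo" + "bar"`, "concat.yang")
	var toks []*token
	for t := l.NextToken(); t != nil; t = l.NextToken() {
		toks = append(toks, t)
	}
	return toks // tString("foo"), tUnquoted("+"), tString("bar")
}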
// From the YANG standard (RFC 7950, section 6.1.3):
//
//	If the double-quoted string contains a line break followed by space
//	or tab characters that are used to indent the text according to the
//	layout in the YANG file, this leading whitespace is stripped from the
//	string, up to and including the column of the double quote character,
//	or to the first non-whitespace character, whichever occurs first. In
//	this process, a tab character is treated as 8 space characters.
//
//	If the double-quoted string contains space or tab characters before a
//	line break, this trailing whitespace is stripped from the string.

// lexQString handles double-quoted strings; see the text above for how they
// work. The leading " has already been parsed.
func lexQString(l *lexer) stateFn {
	indent := l.tcol // the column our text starts on
	over := true     // set to false when we are not past the indent

	// Keep track of where the starting quote was.
	line, col := l.line, l.col-1

	var text []byte
	for {
		// l.next can return non-8-bit Unicode code points,
		// so c cannot be treated as a single byte.
		switch c := l.next(); c {
		case eof:
			l.ErrorfAt(line, col, `missing closing "`)
			return nil
		case '"':
			l.emitText(tString, string(text))

			return lexGround
		case '\n':
		Loop:
			// Trim trailing whitespace from the line.
			for i := len(text); i > 0; {
				i--
				switch text[i] {
				case ' ', '\t':
					text = text[:i]
				default:
					break Loop
				}
			}
			text = append(text, []byte(string(c))...)
			over = false
		case ' ', '\t':
			// Ignore leading whitespace up to our indent.
			if !over && l.tcol <= indent {
				break
			}
			over = true
			text = append(text, []byte(string(c))...)
		case '\\':
			switch c = l.next(); c {
			case 'n':
				c = '\n'
			case 't':
				c = '\t'
			case '"':
			case '\\':
			default:
				// Strings are used both in descriptions and
				// in patterns. In strings only \n, \t, \",
				// and \\ are defined. In patterns the \ can
				// either escape the character (e.g., \{) or
				// be part of a special sequence such as \S.
				if !l.inPattern {
					l.ErrorfAt(l.line, l.col-2, `invalid escape sequence: \`+string(c))
				}
				text = append(text, '\\')
			}
			fallthrough
		default:
			over = true
			text = append(text, []byte(string(c))...)
		}
	}
}

// lexUnquoted reads one identifier/number/unquoted-string/...
//
// From https://tools.ietf.org/html/rfc7950#section-6.1.3:
//
//	An unquoted string is any sequence of characters that does not
//	contain any space, tab, carriage return, or line feed characters, a
//	single or double quote character, a semicolon (";"), braces ("{" or
//	"}"), or comment sequences ("//", "/*", or "*/").
func lexUnquoted(l *lexer) stateFn {
	for {
		switch c := l.peek(); c {
		// TODO: Support detection of a comment immediately following
		// an unquoted string, likely by supporting two peeks instead
		// of just one.
		case ' ', '\r', '\n', '\t', ';', '"', '\'', '{', '}', eof:
			l.emit(tUnquoted)
			return lexGround
		default:
			l.next()
		}
	}
}
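// The sketch below is an editorial illustration, not part of goyang: it
// demonstrates the RFC 7950 de-quoting rules implemented by lexQString.
// The opening quote sits at column 2 (0-based), so up to three columns of
// leading whitespace are stripped from the continuation line, and trailing
// whitespace before each line break would be trimmed. The sketchIndentStrip
// name is hypothetical.
func sketchIndentStrip() string {
	l := newLexer("  \"hello\n    world\"", "indent.yang")
	return l.NextToken().Text // "hello\n world"
}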