github.com/openconfig/goyang@v1.4.5/pkg/yang/lex.go

// Copyright 2015 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package yang

// This file implements the lexical tokenization of YANG.  The lexer returns
// a series of tokens, each with one of the following codes:
//
//    tError       // an error was encountered
//    tEOF         // end-of-file
//    tString      // A de-quoted string (e.g., "\"bob\"" becomes "bob")
//    tUnquoted    // An un-quoted string
//    '{'
//    ';'
//    '}'
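//
// As an illustrative sketch (the fragment below is not from this file), the
// input
//
//    leaf name { type string; }
//
// is expected to produce the token stream
//
//    tUnquoted("leaf"), tUnquoted("name"), '{',
//    tUnquoted("type"), tUnquoted("string"), ';', '}'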

import (
	"bytes"
	"fmt"
	"io"
	"os"
	"reflect"
	"runtime"
	"strings"
	"unicode/utf8"
)

const (
	eof       = 0x7fffffff // end of file, also an invalid rune
	maxErrors = 8
	tooMany   = "too many errors...\n"
)

// stateFn represents a state in the lexer as a function, returning the next
// state the lexer should move to.
type stateFn func(*lexer) stateFn

// A lexer holds the internal state of the lexer.
type lexer struct {
	errout io.Writer // destination for errors, defaults to os.Stderr
	errcnt int       // number of errors encountered

	file  string // name of the file we are processing
	input string // contents of the file
	start int    // start position in input of unconsumed data
	pos   int    // current position in the input
	line  int    // the current line number (1-based)
	col   int    // the current column number (0-based, add 1 before displaying)

	debug     bool        // set to true to include internal debugging
	inPattern bool        // set when parsing the argument to a pattern
	items     chan *token // channel of scanned items
	tcol      int         // column with tabs expanded (for multi-line strings)
	scol      int         // starting column of the current token
	sline     int         // starting line of the current token
	state     stateFn     // current state of the lexer
	width     int         // width of the last rune read from input
}

// A code is a token code.  Single-character tokens (i.e., punctuation)
// are represented by their Unicode code point.
type code int

const (
	tEOF      = code(-1 - iota) // Reached end of file
	tError                      // An error
	tString                     // A dequoted string
	tUnquoted                   // A non-quoted string
)

// String returns c as a string.
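// For example, tString prints as "String" and the punctuation code '{'
// prints as "'{'".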
func (c code) String() string {
	switch c {
	case tError:
		return "Error"
	case tString:
		return "String"
	case tUnquoted:
		return "Unquoted"
	}
	if c < 0 || c > '~' {
		return fmt.Sprintf("%d", c)
	}
	return fmt.Sprintf("'%c'", c)
}

// A token represents one lexical unit read from the input.
// Line and Col are both 1-based.
type token struct {
	code code
	Text string // the actual text of the token
	File string // the source file the token is from
	Line int    // the source line number the token is from
	Col  int    // the source column number the token is from (8-space tabs)
}

// Code returns the code of t.  If t is nil, tEOF is returned.
func (t *token) Code() code {
	if t == nil {
		return tEOF
	}
	return t.code
}

// String returns the location, code, and text of t as a string.
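// For example, a token with File "foo.yang", Line 3, Col 7, and Text "leaf"
// prints as "foo.yang:3:7: leaf".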
func (t *token) String() string {
	var s []string
	if t.File != "" {
		s = append(s, t.File+":")
	}
	if t.Line != 0 {
		s = append(s, fmt.Sprintf("%d:%d:", t.Line, t.Col))
	}
	if t.Text == "" {
		s = append(s, fmt.Sprintf(" %v", t.code))
	} else {
		s = append(s, " ", t.Text)
	}
	return strings.Join(s, "")
}

// A note on writing to errout.  Errors should always be written to errout
// in a single Write call.  The test code makes this assumption for testing
// expected errors.

// newLexer returns a new lexer that tokenizes the provided input.  The
// provided path should indicate where the source originated and is used
// when reporting errors.
func newLexer(input, path string) *lexer {
	// Force input to be newline terminated.
	if len(input) > 0 && input[len(input)-1] != '\n' {
		input += "\n"
	}
	return &lexer{
		file:   path,
		input:  input,
		line:   1, // humans start with 1
		items:  make(chan *token, maxErrors),
		state:  lexGround,
		errout: os.Stderr,
	}
}

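// A minimal usage sketch (the input and file name here are illustrative):
//
//	lex := newLexer("leaf name { type string; }", "example.yang")
//	for tok := lex.NextToken(); tok != nil; tok = lex.NextToken() {
//		fmt.Println(tok)
//	}
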
// NextToken returns the next token from the input, returning nil on EOF.
func (l *lexer) NextToken() *token {
	for {
		select {
		case item := <-l.items:
			return item
		default:
			if l.state == nil {
				return nil
			}
			if l.debug {
				name := runtime.FuncForPC(reflect.ValueOf(l.state).Pointer()).Name()
				name = name[strings.LastIndex(name, ".")+1:]
				name = strings.TrimPrefix(name, "lex")
				input := l.input[l.pos:]
				if len(input) > 8 {
					input = input[:8] + "..."
				}
				fmt.Fprintf(os.Stderr, "%d:%d: state %s %q\n", l.line, l.col+1, name, input)
			}
			l.state = l.state(l)
		}
	}
}

// emit emits the currently parsed token marked with code c using emitText.
func (l *lexer) emit(c code) {
	l.emitText(c, l.input[l.start:l.pos])
}

// emitText emits text as a token marked with c.
// All input up to the current cursor (pos) is consumed.
func (l *lexer) emitText(c code, text string) {
	if l.debug {
		fmt.Fprintf(os.Stderr, "%v: %q\n", c, text)
	}
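	// The send below is non-blocking: if the buffered channel is full,
	// the token is silently dropped.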
	select {
	case l.items <- &token{
		code: c,
		Text: text,
		File: l.file,
		Line: l.sline,
		Col:  l.scol + 1,
	}:
	default:
	}
	l.consume()
}

// consume consumes all input up to the current cursor.
func (l *lexer) consume() {
	l.start = l.pos
}

// backup steps back one rune.  It can be called only immediately after a call
// to next.  Backing up over a tab will set tcol to the last position of the
// tab, not where the tab started.  This is okay: when we call next again it
// will move tcol back to where it was before backup was called.
func (l *lexer) backup() {
	l.pos -= l.width
	if l.width > 0 {
		l.col--
		l.tcol--
		if l.col < 0 {
			// We must have backed up over a newline.
			// Don't bother to figure out the column number,
			// as the next call to next will reset it to 0.
			l.line--
			l.col = 0
			l.tcol = 0
		}
	}
}

// peek returns but does not move past the next rune in the input.  backup
// is not supported over peeked characters.
func (l *lexer) peek() rune {
	rune := l.next()
	l.backup()
	return rune
}

// next returns the next rune in the input.  If next encounters the end of
// input, it returns eof.
func (l *lexer) next() (rune rune) {
	if l.pos >= len(l.input) {
		l.width = 0
		return eof
	}
	// Remembering only the last rune's width is what limits backup to a
	// single rune.
	rune, l.width = utf8.DecodeRuneInString(l.input[l.pos:])
	l.pos += l.width
	switch rune {
	case '\n':
		l.line++
		l.col = 0
		l.tcol = 0
	case '\t':
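		// Advance tcol to the next multiple of 8, treating tabs as
		// 8-column tab stops (e.g., 3 becomes 8, and 8 becomes 16).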
		l.tcol = (l.tcol + 8) & ^7
		l.col++ // should this be l.width?
	default:
		l.tcol++
		l.col++ // should this be l.width?
	}
	return rune
}

// acceptRun moves the cursor forward up to, but not including, the first rune
// not found in the valid set.  It returns true if any runes were accepted.
func (l *lexer) acceptRun(valid string) bool {
	ret := false
	for strings.ContainsRune(valid, l.next()) {
		ret = true
	}
	l.backup()
	return ret
}

// skipTo moves the cursor up to, but not including, s.
// It returns whether s was found in the remaining input.
func (l *lexer) skipTo(s string) bool {
	if x := strings.Index(l.input[l.pos:], s); x >= 0 {
		l.updateCursor(x)
		return true
	}
	return false
}

// updateCursor moves the cursor forward n bytes.  updateCursor does not
// correctly handle tabs.  This is okay as it is only used by skipTo, and
// skipTo is never used to skip to an initial " (which is the only time that
// tcol is necessary, as per YANG's multi-line quoted string requirement).
func (l *lexer) updateCursor(n int) {
	s := l.input[l.pos : l.pos+n]
	l.pos += n
	// We could get away without updating width at all, because backup is
	// only promised to work after a call to next.
	l.width = n

	if c := strings.Count(s, "\n"); c > 0 {
		l.line += c
		l.col = 0
	}
	l.col += utf8.RuneCountInString(s[strings.LastIndex(s, "\n")+1:])
}

// Errorf writes an error to l.errout and increments the error count.
// If too many errors (maxErrors) are encountered, lexing stops and
// eof is returned as the next token.
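// Errors are written in the form "file:line:col: message" followed by a
// newline, with the column reported 1-based.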
func (l *lexer) Errorf(f string, v ...interface{}) {
	buf := &bytes.Buffer{}

	if l.debug {
		// For internal debugging, print the file and line number
		// of the call to Errorf.
		_, name, line, _ := runtime.Caller(1)

		fmt.Fprintf(buf, "%s:%d: ", name, line)
	}
	fmt.Fprintf(buf, "%s:%d:%d: ", l.file, l.line, l.col+1)
	fmt.Fprintf(buf, f, v...)
	b := buf.Bytes()
	if b[len(b)-1] != '\n' {
		buf.Write([]byte{'\n'})
	}
	l.emit(tError)
	l.adderror(buf.Bytes())
}

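// ErrorfAt is like Errorf, but reports the error at the provided line and
// column rather than at the lexer's current position.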
func (l *lexer) ErrorfAt(line, col int, f string, v ...interface{}) {
	oline, ocol := l.line, l.col
	defer func() {
		l.line, l.col = oline, ocol
	}()
	l.line, l.col = line, col
	l.Errorf(f, v...)
}

// adderror writes out the error string err and increases the error count.
// If more than maxErrors are encountered, a "too many errors" message is
// displayed and processing stops (by clearing the input).
func (l *lexer) adderror(err []byte) {
	if l.errcnt == maxErrors {
		l.pos = 0
		l.start = 0
		l.input = ""
		l.errout.Write([]byte(tooMany))
		l.errcnt++
		return
	} else if l.errcnt == maxErrors+1 {
		return
	}
	l.errout.Write(err)
	l.errcnt++
}

// Below are all the states.

// lexGround is the state when the lexer is not in the middle of a token.  The
// ground state is left once the start of a token is found.  Pure comment lines
// leave the lexer in the ground state.
func lexGround(l *lexer) stateFn {
	l.acceptRun(" \t\r\n") // Skip leading whitespace.
	l.consume()
	l.sline = l.line
	l.scol = l.col

	switch c := l.peek(); c {
	case eof:
		return nil
	case ';', '{', '}':
		l.next()
		l.emit(code(c))
		return lexGround
	case '\'':
		l.next()
		l.consume() // Toss the leading '
		if !l.skipTo("'") {
			l.ErrorfAt(l.line, l.col-1, `missing closing '`)
			return nil
		}
		l.emit(tString)
		l.next() // Either EOF or the matching '
		return lexGround
	case '"':
		l.next()
		return lexQString
	case '/':
		l.next()
		switch l.peek() {
		case '/':
			// Start of a // comment.
			if !l.skipTo("\n") {
				// Here "\n" should always be found, since we force all
				// input to be "\n" terminated.
				l.ErrorfAt(l.line, l.col-1, `lexer internal error: all lines should be newline-terminated.`)
				return nil
			}
			return lexGround
		case '*':
			// Start of a /* comment.
			if !l.skipTo("*/") {
				l.ErrorfAt(l.line, l.col-1, `missing closing */`)
				return nil
			}
			// Now actually skip the */.
			l.next()
			l.next()
			return lexGround
		default:
			return lexUnquoted
		}
	case '+':
		l.next()
		switch l.peek() {
		case '"', '\'':
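			// A '+' immediately before a quoted string is emitted
			// as its own token; joining the quoted strings (YANG's
			// string concatenation) is left to later processing.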
			l.emit(tUnquoted)
			return lexGround
		default:
			return lexUnquoted
		}
	default:
		return lexUnquoted
	}
}

// From the YANG standard:
//
//   If the double-quoted string contains a line break followed by space
//   or tab characters that are used to indent the text according to the
//   layout in the YANG file, this leading whitespace is stripped from the
//   string, up to and including the column of the double quote character,
//   or to the first non-whitespace character, whichever occurs first.  In
//   this process, a tab character is treated as 8 space characters.
//
//   If the double-quoted string contains space or tab characters before a
//   line break, this trailing whitespace is stripped from the string.

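// As an illustrative sketch, with the opening quote in column 13:
//
//	description "line one
//	             line two";
//
// the leading whitespace on the second line is stripped up through the
// quote's column, yielding the two-line string "line one\nline two".
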
// lexQString handles double-quoted strings; see the above text on how they
// work.  The leading " has already been parsed.
func lexQString(l *lexer) stateFn {
	indent := l.tcol // the column our text starts on
	over := true     // set to false when we are not past the indent

	// Keep track of where the starting quote was.
	line, col := l.line, l.col-1

	var text []byte
	for {
		// l.next can return non-8-bit Unicode code points, so
		// c cannot be treated as only a single byte.
		switch c := l.next(); c {
		case eof:
			l.ErrorfAt(line, col, `missing closing "`)
			return nil
		case '"':
			l.emitText(tString, string(text))

			return lexGround
		case '\n':
		Loop:
			// Trim trailing whitespace from the line.
			for i := len(text); i > 0; {
				i--
				switch text[i] {
				case ' ', '\t':
					text = text[:i]
				default:
					break Loop
				}
			}
			text = append(text, []byte(string(c))...)
			over = false
		case ' ', '\t':
			// Ignore leading whitespace up to our indent.
			if !over && l.tcol <= indent {
				break
			}
			over = true
			text = append(text, []byte(string(c))...)
		case '\\':
			switch c = l.next(); c {
			case 'n':
				c = '\n'
			case 't':
				c = '\t'
			case '"':
			case '\\':
			default:
				// Strings are used both in descriptions and
				// in patterns.  In strings only \n, \t, \",
				// and \\ are defined.  In patterns the \
				// can either mean to escape the character
				// (e.g., \{) or be part of a special
				// sequence such as \S.
				if !l.inPattern {
					l.ErrorfAt(l.line, l.col-2, `invalid escape sequence: \`+string(c))
				}
				text = append(text, '\\')
			}
			fallthrough
		default:
			over = true
			text = append(text, []byte(string(c))...)
		}
	}
}

// lexUnquoted reads one identifier/number/un-quoted-string/...
//
// From https://tools.ietf.org/html/rfc7950#section-6.1.3:
//
//   An unquoted string is any sequence of characters that does not
//   contain any space, tab, carriage return, or line feed characters, a
//   single or double quote character, a semicolon (";"), braces ("{" or
//   "}"), or comment sequences ("//", "/*", or "*/").
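//
// For example, in the statement "pattern [a-z]+;" the argument "[a-z]+" is
// lexed as a single unquoted token, terminated by the ';'.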
func lexUnquoted(l *lexer) stateFn {
	for {
		switch c := l.peek(); c {
		// TODO: Support detection of a comment immediately following
		// an unquoted string, likely by supporting two peeks instead
		// of just one.
		case ' ', '\r', '\n', '\t', ';', '"', '\'', '{', '}', eof:
			l.emit(tUnquoted)
			return lexGround
		default:
			l.next()
		}
	}
}