github.com/tiagovtristao/plz@v13.4.0+incompatible/src/parse/asp/lexer.go (about)

     1  package asp
     2  
     3  import (
     4  	"io"
     5  	"io/ioutil"
     6  	"unicode"
     7  	"unicode/utf8"
     8  )
     9  
// Token types. These are all negative so they can never collide with a
// literal character value (always positive) stored in Token.Type.
const (
	EOF         = -(iota + 1) // end of input
	Ident                     // identifier (letters, digits, underscores, or Unicode letters/digits)
	Int                       // integer literal (possibly with a leading '-')
	String                    // string literal; value is normalised to be surrounded by double quotes
	LexOperator               // two-character operator such as ==, !=, +=, <=, >=
	EOL                       // logical end of line (suppressed inside brackets and after another EOL)
	Unindent                  // a decrease of one indentation level
)
    20  
// A Token describes each individual lexical element emitted by the lexer.
type Token struct {
	// Type of token. If > 0 this is the literal character value; if < 0 it is one of the types above.
	Type rune
	// The literal text of the token. Strings are lightly normalised to always be surrounded by quotes (but only one).
	// Synthetic tokens (EOF, EOL, Unindent) have an empty Value.
	Value string
	// The position in the input that the token occurred at.
	Pos Position
}
    30  
    31  // String implements the fmt.Stringer interface
    32  func (tok Token) String() string {
    33  	if tok.Value != "" {
    34  		return tok.Value
    35  	}
    36  	return reverseSymbol(tok.Type)
    37  }
    38  
    39  // EndPos returns the end position of a token
    40  func (tok Token) EndPos() Position {
    41  	end := tok.Pos
    42  	end.Offset += len(tok.Value)
    43  	end.Column += len(tok.Value)
    44  
    45  	return end
    46  }
    47  
// A Position describes a position in a source file.
// All properties in Position are one(1) indexed
type Position struct {
	// Name of the file, where known.
	Filename string
	// Byte offset into the file, 1-indexed.
	Offset int
	// Line number, 1-indexed.
	Line int
	// Column on the line, counted in bytes, 1-indexed.
	Column int
}
    56  
// A namer is implemented by readers that know the name of their underlying
// source, e.g. *os.File.
type namer interface {
	Name() string
}
    60  
    61  // NameOfReader returns a name for the given reader, if one can be determined.
    62  func NameOfReader(r io.Reader) string {
    63  	if n, ok := r.(namer); ok {
    64  		return n.Name()
    65  	}
    66  	return ""
    67  }
    68  
    69  // newLexer creates a new lex instance.
    70  func newLexer(r io.Reader) *lex {
    71  	// Read the entire file upfront to avoid bufio etc.
    72  	// This should work OK as long as BUILD files are relatively small.
    73  	b, err := ioutil.ReadAll(r)
    74  	if err != nil {
    75  		fail(Position{Filename: NameOfReader(r)}, err.Error())
    76  	}
    77  	// If the file doesn't end in a newline, we will reject it with an "unexpected end of file"
    78  	// error. That's a bit crap so quietly fix it up here.
    79  	if len(b) > 0 && b[len(b)-1] != '\n' {
    80  		b = append(b, '\n')
    81  	}
    82  	l := &lex{
    83  		b:        append(b, 0, 0), // Null-terminating the buffer makes things easier later.
    84  		filename: NameOfReader(r),
    85  		indents:  []int{0},
    86  	}
    87  	l.Next() // Initial value is zero, this forces it to populate itself.
    88  	// Discard any leading newlines, they are just an annoyance.
    89  	for l.Peek().Type == EOL {
    90  		l.Next()
    91  	}
    92  	return l
    93  }
    94  
// A lex is a lexer for a single BUILD file.
type lex struct {
	// The raw input, with two NUL bytes appended so one-byte lookahead never runs off the end.
	b []byte
	// Current byte offset into b.
	i int
	// Current line and column, both 0-indexed (tokens report them 1-indexed).
	line int
	col  int
	// Indentation (number of leading spaces) of the current line.
	indent int
	// The next token. We always look one token ahead in order to facilitate both Peek() and Next().
	next     Token
	filename string
	// Used to track how many braces we're within.
	braces int
	// Pending unindent tokens. This is a bit yuck but means the parser doesn't need to
	// concern itself about indentation.
	unindents int
	// Current levels of indentation
	indents []int
	// Remember whether the last token we output was an end-of-line so we don't emit multiple in sequence.
	lastEOL bool
}
   115  
   116  // reverseSymbol looks up a symbol's name from the lexer.
   117  func reverseSymbol(sym rune) string {
   118  	switch sym {
   119  	case EOF:
   120  		return "end of file"
   121  	case Ident:
   122  		return "identifier"
   123  	case Int:
   124  		return "integer"
   125  	case String:
   126  		return "string"
   127  	case LexOperator:
   128  		return "operator"
   129  	case EOL:
   130  		return "end of line"
   131  	case Unindent:
   132  		return "unindent"
   133  	}
   134  	return string(sym) // literal character
   135  }
   136  
   137  // reverseSymbols looks up a series of symbol's names from the lexer.
   138  func reverseSymbols(syms []rune) []string {
   139  	ret := make([]string, len(syms))
   140  	for i, sym := range syms {
   141  		ret[i] = reverseSymbol(sym)
   142  	}
   143  	return ret
   144  }
   145  
// Peek returns the next token without consuming it.
func (l *lex) Peek() Token {
	return l.next
}
   150  
   151  // Next consumes and returns the next token.
   152  func (l *lex) Next() Token {
   153  	ret := l.next
   154  	l.next = l.nextToken()
   155  	l.lastEOL = l.next.Type == EOL || l.next.Type == Unindent
   156  	return ret
   157  }
   158  
   159  // AssignFollows is a hack to do extra lookahead which makes it easier to parse
   160  // named call arguments. It returns true if the token after next is an assign operator.
   161  func (l *lex) AssignFollows() bool {
   162  	l.stripSpaces()
   163  	return l.b[l.i] == '=' && l.b[l.i+1] != '='
   164  }
   165  
   166  func (l *lex) stripSpaces() {
   167  	for l.b[l.i] == ' ' {
   168  		l.i++
   169  		l.col++
   170  	}
   171  }
   172  
// nextToken consumes and returns the next token from the input.
// It is the core of the lexer: it handles indentation tracking, comment
// skipping and brace counting, and synthesises EOL / Unindent tokens
// that the input doesn't literally contain.
func (l *lex) nextToken() Token {
	l.stripSpaces()
	pos := Position{
		Filename: l.filename,
		// These are all 1-indexed for niceness.
		Offset: l.i + 1,
		Line:   l.line + 1,
		Column: l.col + 1,
	}
	// Drain any unindents queued up by a previous newline before reading further.
	if l.unindents > 0 {
		l.unindents--
		return Token{Type: Unindent, Pos: pos}
	}
	b := l.b[l.i]
	// r"..." / f"..." prefixes: consume the prefix letter and lex the rest as a string.
	rawString := b == 'r' && (l.b[l.i+1] == '"' || l.b[l.i+1] == '\'')
	fString := b == 'f' && (l.b[l.i+1] == '"' || l.b[l.i+1] == '\'')
	if rawString || fString {
		l.i++
		l.col++
		b = l.b[l.i]
	} else if (b >= 'a' && b <= 'z') || (b >= 'A' && b <= 'Z') || b == '_' || b >= utf8.RuneSelf {
		// Identifier start: ASCII letter, underscore, or any multi-byte UTF-8 sequence.
		return l.consumeIdent(pos)
	}
	l.i++
	l.col++
	switch b {
	case 0:
		// End of file (we null terminate it above so this is easy to spot)
		return Token{Type: EOF, Pos: pos}
	case '\n':
		// End of line, read indent to next non-space character
		lastIndent := l.indent
		l.line++
		l.col = 0
		indent := 0
		for l.b[l.i] == ' ' {
			l.i++
			l.col++
			indent++
		}
		if l.b[l.i] == '\n' {
			// Blank line; its indentation is not significant, keep going.
			return l.nextToken()
		}
		if l.braces == 0 {
			// Indentation only matters outside brackets.
			l.indent = indent
		}
		if lastIndent > l.indent && l.braces == 0 {
			pos.Line++ // Works better if it's at the new position
			pos.Column = l.col + 1
			// Pop indentation levels until we're back at (or below) the new one,
			// queueing one Unindent token per level popped.
			for l.indents[len(l.indents)-1] > l.indent {
				l.unindents++
				l.indents = l.indents[:len(l.indents)-1]
			}
			// The new indentation must match some enclosing level exactly.
			if l.indent != l.indents[len(l.indents)-1] {
				fail(pos, "Unexpected indent")
			}
		} else if lastIndent != l.indent {
			l.indents = append(l.indents, l.indent)
		}
		// Newlines inside brackets, or directly after another EOL/Unindent, are not tokens.
		if l.braces == 0 && !l.lastEOL {
			return Token{Type: EOL, Pos: pos}
		}
		return l.nextToken()
	case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9':
		return l.consumeInteger(b, pos)
	case '"', '\'':
		// String literal, consume to end.
		return l.consumePossiblyTripleQuotedString(b, pos, rawString, fString)
	case '(', '[', '{':
		l.braces++
		return Token{Type: rune(b), Value: string(b), Pos: pos}
	case ')', ']', '}':
		if l.braces > 0 { // Don't let it go negative, it fouls things up
			l.braces--
		}
		return Token{Type: rune(b), Value: string(b), Pos: pos}
	case '=', '!', '+', '<', '>':
		// Look ahead one byte to see if this is an augmented assignment or comparison.
		if l.b[l.i] == '=' {
			l.i++
			l.col++
			return Token{Type: LexOperator, Value: string([]byte{b, l.b[l.i-1]}), Pos: pos}
		}
		fallthrough
	case ',', '.', '%', '*', '|', '&', ':':
		// Single-character operators are emitted with their literal value as the type.
		return Token{Type: rune(b), Value: string(b), Pos: pos}
	case '#':
		// Comment character, consume to end of line.
		for l.b[l.i] != '\n' && l.b[l.i] != 0 {
			l.i++
			l.col++
		}
		return l.nextToken() // Comments aren't tokens themselves.
	case '-':
		// We lex unary - with the integer if possible.
		if l.b[l.i] >= '0' && l.b[l.i] <= '9' {
			return l.consumeInteger(b, pos)
		}
		return Token{Type: rune(b), Value: string(b), Pos: pos}
	case '\t':
		fail(pos, "Tabs are not permitted in BUILD files, use space-based indentation instead")
	default:
		fail(pos, "Unknown symbol %c", b)
	}
	panic("unreachable")
}
   280  
   281  // consumeInteger consumes all characters until the end of an integer literal is reached.
   282  func (l *lex) consumeInteger(initial byte, pos Position) Token {
   283  	s := make([]byte, 1, 10)
   284  	s[0] = initial
   285  	for c := l.b[l.i]; c >= '0' && c <= '9'; c = l.b[l.i] {
   286  		l.i++
   287  		l.col++
   288  		s = append(s, c)
   289  	}
   290  	return Token{Type: Int, Value: string(s), Pos: pos}
   291  }
   292  
   293  // consumePossiblyTripleQuotedString consumes all characters until the end of a string token.
   294  func (l *lex) consumePossiblyTripleQuotedString(quote byte, pos Position, raw, fString bool) Token {
   295  	if l.b[l.i] == quote && l.b[l.i+1] == quote {
   296  		l.i += 2 // Jump over initial quote
   297  		l.col += 2
   298  		return l.consumeString(quote, pos, true, raw, fString)
   299  	}
   300  	return l.consumeString(quote, pos, false, raw, fString)
   301  }
   302  
// consumeString consumes all characters until the end of a string literal is reached.
// The returned token's value is normalised to start and end with a double quote
// regardless of the quote style used in the input (with an 'f' prefix re-added
// for format strings). Unless raw is set, backslash escapes are processed:
// \n becomes a newline, \\ \' \" become the bare character, an escaped newline
// in a multiline string is a line continuation, and anything else is kept
// verbatim including the backslash.
func (l *lex) consumeString(quote byte, pos Position, multiline, raw, fString bool) Token {
	s := make([]byte, 1, 100) // 100 chars is typically enough for a single string literal.
	s[0] = '"'
	escaped := false
	for {
		c := l.b[l.i]
		l.i++
		l.col++
		if escaped {
			if c == 'n' {
				s = append(s, '\n')
			} else if c == '\n' && multiline {
				// Escaped newline: line continuation, emits nothing.
				l.line++
				l.col = 0
			} else if c == '\\' || c == '\'' || c == '"' {
				s = append(s, c)
			} else {
				// Unknown escape; preserve it untouched.
				s = append(s, '\\', c)
			}
			escaped = false
			continue
		}
		switch c {
		case quote:
			s = append(s, '"')
			// A single quote ends a one-line string; a multiline one needs all three.
			if !multiline || (l.b[l.i] == quote && l.b[l.i+1] == quote) {
				if multiline {
					l.i += 2
					l.col += 2
				}
				token := Token{Type: String, Value: string(s), Pos: pos}
				if fString {
					token.Value = "f" + token.Value
				}
				// Within brackets, adjacent string literals concatenate implicitly.
				if l.braces > 0 {
					return l.handleImplicitStringConcatenation(token)
				}
				return token
			}
		case '\n':
			if multiline {
				l.line++
				l.col = 0
				s = append(s, c)
				continue
			}
			fallthrough
		case 0:
			// Hit end of line (single-line string) or end of file without a closing quote.
			fail(pos, "Unterminated string literal")
		case '\\':
			if !raw {
				escaped = true
				continue
			}
			fallthrough
		default:
			s = append(s, c)
		}
	}
}
   364  
// handleImplicitStringConcatenation looks ahead after a string token and checks if the next token will be a string; if so
// we collapse them both into one string now. Only called when inside brackets
// (l.braces > 0), where newlines between the two literals are insignificant.
func (l *lex) handleImplicitStringConcatenation(token Token) Token {
	// Track position locally while scanning ahead; only committed to the lexer
	// if another string literal actually follows.
	col := l.col
	line := l.line
	for i, b := range l.b[l.i:] {
		switch b {
		case '\n':
			col = 0
			line++
			continue
		case ' ':
			col++
			continue
		case '"', '\'':
			// Found a following string; consume it and merge.
			l.i += i + 1
			l.col = col + 1
			l.line = line
			// Note that we don't handle raw or format strings here. Anecdotally, that seems relatively rare...
			tok := l.consumePossiblyTripleQuotedString(b, token.Pos, false, false)
			// Join the values, dropping the closing quote of the first and the opening quote of the second.
			token.Value = token.Value[:len(token.Value)-1] + tok.Value[1:]
			return token
		default:
			// Next token is not a string; leave the input untouched.
			return token
		}
	}
	return token
}
   393  
   394  // consumeIdent consumes all characters of an identifier.
   395  func (l *lex) consumeIdent(pos Position) Token {
   396  	s := make([]rune, 0, 100)
   397  	for {
   398  		c := rune(l.b[l.i])
   399  		if c >= utf8.RuneSelf {
   400  			// Multi-byte encoded in utf-8.
   401  			r, n := utf8.DecodeRune(l.b[l.i:])
   402  			c = r
   403  			l.i += n
   404  			l.col += n
   405  			if !unicode.IsLetter(c) && !unicode.IsDigit(c) {
   406  				fail(pos, "Illegal Unicode identifier %c", c)
   407  			}
   408  			s = append(s, c)
   409  			continue
   410  		}
   411  		l.i++
   412  		l.col++
   413  		switch c {
   414  		case ' ':
   415  			// End of identifier, but no unconsuming needed.
   416  			return Token{Type: Ident, Value: string(s), Pos: pos}
   417  		case '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9':
   418  			s = append(s, c)
   419  		default:
   420  			// End of identifier. Unconsume the last character so it gets handled next time.
   421  			l.i--
   422  			l.col--
   423  			return Token{Type: Ident, Value: string(s), Pos: pos}
   424  		}
   425  	}
   426  }