github.phpd.cn/thought-machine/please@v12.2.0+incompatible/src/parse/asp/lexer.go

package asp

import (
	"io"
	"io/ioutil"
	"unicode"
	"unicode/utf8"
)

// Token types.
const (
	EOF = -(iota + 1)
	Ident
	Int
	String
	LexOperator
	EOL
	Unindent
)
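// Note that the -(iota + 1) expression is implicitly repeated for each constant above,
// so they evaluate to EOF = -1, Ident = -2, and so on down to Unindent = -7. Keeping
// them all negative means they can never collide with a literal character value,
// which Token.Type stores as a positive rune.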

// A Token describes each individual lexical element emitted by the lexer.
type Token struct {
	// Type of token. If > 0 this is the literal character value; if < 0 it is one of the types above.
	Type rune
	// The literal text of the token. Strings are lightly normalised to always be surrounded by quotes (but only one).
	Value string
	// The position in the input that the token occurred at.
	Pos Position
}

// String implements the fmt.Stringer interface.
func (tok Token) String() string {
	if tok.Value != "" {
		return tok.Value
	}
	return reverseSymbol(tok.Type)
}

// A Position describes a position in a source file.
type Position struct {
	Filename string
	Offset   int
	Line     int
	Column   int
}

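// namer is implemented by readers that know their own name, notably *os.File.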
type namer interface {
	Name() string
}

// NameOfReader returns a name for the given reader, if one can be determined.
func NameOfReader(r io.Reader) string {
	if n, ok := r.(namer); ok {
		return n.Name()
	}
	return ""
}

// newLexer creates a new lex instance.
func newLexer(r io.Reader) *lex {
	// Read the entire file upfront to avoid bufio etc.
	// This should work OK as long as BUILD files are relatively small.
	b, err := ioutil.ReadAll(r)
	if err != nil {
		fail(Position{Filename: NameOfReader(r)}, err.Error())
	}
	// If the file doesn't end in a newline, we will reject it with an "unexpected end of file"
	// error. That's a bit crap so quietly fix it up here.
	if len(b) > 0 && b[len(b)-1] != '\n' {
		b = append(b, '\n')
	}
	l := &lex{
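		// Two trailing NULs so that the one-byte lookaheads (l.b[l.i+1]) are always in bounds.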
		b:        append(b, 0, 0), // Null-terminating the buffer makes things easier later.
		filename: NameOfReader(r),
		indents:  []int{0},
	}
	l.Next() // Initial value is zero; this forces it to populate itself.
	// Discard any leading newlines, they are just an annoyance.
	for l.Peek().Type == EOL {
		l.Next()
	}
	return l
}

// A lex is a lexer for a single BUILD file.
type lex struct {
	b      []byte
	i      int
	line   int
	col    int
	indent int
	// The next token. We always look one token ahead in order to facilitate both Peek() and Next().
	next     Token
	filename string
	// Used to track how many braces we're within.
	braces int
	// Pending unindent tokens. This is a bit yuck but means the parser doesn't need to
	// concern itself with indentation.
	unindents int
	// Current levels of indentation.
	indents []int
	// Remember whether the last token we output was an end-of-line so we don't emit multiple in sequence.
	lastEOL bool
}

// reverseSymbol looks up a symbol's name from the lexer.
func reverseSymbol(sym rune) string {
	switch sym {
	case EOF:
		return "end of file"
	case Ident:
		return "identifier"
	case Int:
		return "integer"
	case String:
		return "string"
	case LexOperator:
		return "operator"
	case EOL:
		return "end of line"
	case Unindent:
		return "unindent"
	}
	return string(sym) // literal character
}

// reverseSymbols looks up the names of a series of symbols from the lexer.
func reverseSymbols(syms []rune) []string {
	ret := make([]string, len(syms))
	for i, sym := range syms {
		ret[i] = reverseSymbol(sym)
	}
	return ret
}

// Peek returns the next token without consuming it.
func (l *lex) Peek() Token {
	return l.next
}

// Next consumes and returns the next token.
func (l *lex) Next() Token {
	ret := l.next
	l.next = l.nextToken()
	l.lastEOL = l.next.Type == EOL || l.next.Type == Unindent
	return ret
}

// AssignFollows is a hack to do extra lookahead which makes it easier to parse
// named call arguments. It returns true if the token after next is an assign operator.
func (l *lex) AssignFollows() bool {
	l.stripSpaces()
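	// A single '=' marks a keyword argument; '==' is the equality operator and must not match here.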
	return l.b[l.i] == '=' && l.b[l.i+1] != '='
}

func (l *lex) stripSpaces() {
	for l.b[l.i] == ' ' {
		l.i++
		l.col++
	}
}

// nextToken consumes and returns the next token.
func (l *lex) nextToken() Token {
	l.stripSpaces()
	pos := Position{
		Filename: l.filename,
		// These are all 1-indexed for niceness.
		Offset: l.i + 1,
		Line:   l.line + 1,
		Column: l.col + 1,
	}
	if l.unindents > 0 {
		l.unindents--
		return Token{Type: Unindent, Pos: pos}
	}
	b := l.b[l.i]
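	// An 'r' immediately before a quote introduces a raw string literal; skip the prefix.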
	rawString := b == 'r' && (l.b[l.i+1] == '"' || l.b[l.i+1] == '\'')
	if rawString {
		l.i++
		l.col++
		b = l.b[l.i]
	} else if (b >= 'a' && b <= 'z') || (b >= 'A' && b <= 'Z') || b == '_' || b >= utf8.RuneSelf {
		return l.consumeIdent(pos)
	}
	l.i++
	l.col++
	switch b {
	case 0:
		// End of file (we null-terminate it above so this is easy to spot).
		return Token{Type: EOF, Pos: pos}
	case '\n':
		// End of line, read indent to next non-space character.
		lastIndent := l.indent
		l.line++
		l.col = 0
		indent := 0
		for l.b[l.i] == ' ' {
			l.i++
			l.col++
			indent++
		}
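		// A blank line: ignore it entirely and lex again from the next line.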
		if l.b[l.i] == '\n' {
			return l.nextToken()
		}
		if l.braces == 0 {
			l.indent = indent
		}
		if lastIndent > l.indent && l.braces == 0 {
			pos.Line++ // Works better if it's at the new position
			pos.Column = l.col + 1
			for l.indents[len(l.indents)-1] > l.indent {
				l.unindents++
				l.indents = l.indents[:len(l.indents)-1]
			}
			if l.indent != l.indents[len(l.indents)-1] {
				fail(pos, "Unexpected indent")
			}
		} else if lastIndent != l.indent {
			l.indents = append(l.indents, l.indent)
		}
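		// Newlines inside brackets are not significant, and consecutive EOLs collapse into one.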
		if l.braces == 0 && !l.lastEOL {
			return Token{Type: EOL, Pos: pos}
		}
		return l.nextToken()
	case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9':
		return l.consumeInteger(b, pos)
	case '"', '\'':
		// String literal, consume to end.
		return l.consumePossiblyTripleQuotedString(b, pos, rawString)
	case '(', '[', '{':
		l.braces++
		return Token{Type: rune(b), Value: string(b), Pos: pos}
	case ')', ']', '}':
		if l.braces > 0 { // Don't let it go negative, it fouls things up.
			l.braces--
		}
		return Token{Type: rune(b), Value: string(b), Pos: pos}
	case '=', '!', '+', '<', '>':
		// Look ahead one byte to see if this is an augmented assignment or comparison.
		if l.b[l.i] == '=' {
			l.i++
			l.col++
			return Token{Type: LexOperator, Value: string([]byte{b, l.b[l.i-1]}), Pos: pos}
		}
		fallthrough
	case ',', '.', '%', '*', '|', '&', ':':
		return Token{Type: rune(b), Value: string(b), Pos: pos}
	case '#':
		// Comment character, consume to end of line.
		for l.b[l.i] != '\n' && l.b[l.i] != 0 {
			l.i++
			l.col++
		}
		return l.nextToken() // Comments aren't tokens themselves.
	case '-':
		// We lex unary - with the integer if possible.
		if l.b[l.i] >= '0' && l.b[l.i] <= '9' {
			return l.consumeInteger(b, pos)
		}
		return Token{Type: rune(b), Value: string(b), Pos: pos}
	case '\t':
		fail(pos, "Tabs are not permitted in BUILD files, use space-based indentation instead")
	default:
		fail(pos, "Unknown symbol %c", b)
	}
	panic("unreachable")
}

// consumeInteger consumes all characters until the end of an integer literal is reached.
func (l *lex) consumeInteger(initial byte, pos Position) Token {
	s := make([]byte, 1, 10)
	s[0] = initial
	for c := l.b[l.i]; c >= '0' && c <= '9'; c = l.b[l.i] {
		l.i++
		l.col++
		s = append(s, c)
	}
	return Token{Type: Int, Value: string(s), Pos: pos}
}

// consumePossiblyTripleQuotedString consumes all characters until the end of a string token.
func (l *lex) consumePossiblyTripleQuotedString(quote byte, pos Position, raw bool) Token {
	if l.b[l.i] == quote && l.b[l.i+1] == quote {
		l.i += 2 // Skip the second and third quotes of the opening triple-quote delimiter.
		l.col += 2
		return l.consumeString(quote, pos, true, raw)
	}
	return l.consumeString(quote, pos, false, raw)
}

// consumeString consumes all characters until the end of a string literal is reached.
func (l *lex) consumeString(quote byte, pos Position, multiline, raw bool) Token {
	s := make([]byte, 1, 100) // 100 chars is typically enough for a single string literal.
	s[0] = '"'
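	// The value is normalised to start with a double quote regardless of the original delimiter.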
	escaped := false
	for {
		c := l.b[l.i]
		l.i++
		l.col++
		if escaped {
			if c == 'n' {
				s = append(s, '\n')
			} else if c == '\n' && multiline {
				l.line++
				l.col = 0
			} else if c == '\\' || c == '\'' || c == '"' {
				s = append(s, c)
			} else {
				s = append(s, '\\', c)
			}
			escaped = false
			continue
		}
		switch c {
		case quote:
			s = append(s, '"')
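			// A triple-quoted string only terminates when two more quotes follow this one.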
			if !multiline || (l.b[l.i] == quote && l.b[l.i+1] == quote) {
				if multiline {
					l.i += 2
					l.col += 2
				}
				token := Token{Type: String, Value: string(s), Pos: pos}
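				// Implicit concatenation of adjacent string literals (as in Python) is only handled inside brackets.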
				if l.braces > 0 {
					return l.handleImplicitStringConcatenation(token)
				}
				return token
			}
		case '\n':
			if multiline {
				l.line++
				l.col = 0
				s = append(s, c)
				continue
			}
			fallthrough
		case 0:
			fail(pos, "Unterminated string literal")
		case '\\':
			if !raw {
				escaped = true
				continue
			}
			fallthrough
		default:
			s = append(s, c)
		}
	}
}

// handleImplicitStringConcatenation looks ahead after a string token and checks whether the next
// token will also be a string; if so, the two are collapsed into a single string token now.
func (l *lex) handleImplicitStringConcatenation(token Token) Token {
	col := l.col
	line := l.line
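	// Scan ahead over spaces and newlines; anything else ends the lookahead.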
	for i, b := range l.b[l.i:] {
		switch b {
		case '\n':
			col = 0
			line++
			continue
		case ' ':
			col++
			continue
		case '"', '\'':
			l.i += i + 1
			l.col = col + 1
			l.line = line
			// Note that we don't handle raw strings here. Anecdotally, that seems relatively rare...
			tok := l.consumePossiblyTripleQuotedString(b, token.Pos, false)
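			// Join the two normalised values, dropping the closing quote of the first and the opening quote of the second.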
			token.Value = token.Value[:len(token.Value)-1] + tok.Value[1:]
			return token
		default:
			return token
		}
	}
	return token
}

// consumeIdent consumes all characters of an identifier.
func (l *lex) consumeIdent(pos Position) Token {
	s := make([]rune, 0, 100)
	for {
		c := rune(l.b[l.i])
		if c >= utf8.RuneSelf {
			// Multi-byte character encoded in UTF-8.
			r, n := utf8.DecodeRune(l.b[l.i:])
			c = r
			l.i += n
			l.col += n
			if !unicode.IsLetter(c) && !unicode.IsDigit(c) {
				fail(pos, "Illegal Unicode character %c in identifier", c)
			}
			s = append(s, c)
			continue
		}
		l.i++
		l.col++
		switch c {
		case ' ':
			// End of identifier, but no unconsuming needed.
			return Token{Type: Ident, Value: string(s), Pos: pos}
		case '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9':
			s = append(s, c)
		default:
			// End of identifier. Unconsume the last character so it gets handled next time.
			l.i--
			l.col--
			return Token{Type: Ident, Value: string(s), Pos: pos}
		}
	}
}