github.com/evanw/esbuild@v0.21.4/internal/css_lexer/css_lexer.go

package css_lexer

import (
	"strings"
	"unicode/utf8"

	"github.com/evanw/esbuild/internal/logger"
)

// The lexer converts a source file to a stream of tokens. Unlike esbuild's
// JavaScript lexer, this CSS lexer runs to completion before the CSS parser
// begins, resulting in a single array of all tokens in the file.
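//
// As an illustrative sketch (decoded text shown in parentheses; token values
// are recovered lazily, not stored), an input like ".box { width: 50% }"
// lexes to roughly this sequence:
//
//	TDelimDot, TIdent("box"), TWhitespace, TOpenBrace, TWhitespace,
//	TIdent("width"), TColon, TWhitespace, TPercentage("50%"),
//	TWhitespace, TCloseBrace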

type T uint8

const eof = -1

const (
	TEndOfFile T = iota

	TAtKeyword
	TUnterminatedString
	TBadURL
	TCDC // "-->"
	TCDO // "<!--"
	TCloseBrace
	TCloseBracket
	TCloseParen
	TColon
	TComma
	TDelim
	TDelimAmpersand
	TDelimAsterisk
	TDelimBar
	TDelimCaret
	TDelimDollar
	TDelimDot
	TDelimEquals
	TDelimExclamation
	TDelimGreaterThan
	TDelimMinus
	TDelimPlus
	TDelimSlash
	TDelimTilde
	TDimension
	TFunction
	THash
	TIdent
	TNumber
	TOpenBrace
	TOpenBracket
	TOpenParen
	TPercentage
	TSemicolon
	TString
	TURL
	TWhitespace

	// The lexer never generates this token directly. Instead, this is an
	// esbuild-specific token for global/local names that "TIdent" tokens
	// may be changed into.
	TSymbol
)

var tokenToString = []string{
	"end of file",
	"@-keyword",
	"bad string token",
	"bad URL token",
	"\"-->\"",
	"\"<!--\"",
	"\"}\"",
	"\"]\"",
	"\")\"",
	"\":\"",
	"\",\"",
	"delimiter",
	"\"&\"",
	"\"*\"",
	"\"|\"",
	"\"^\"",
	"\"$\"",
	"\".\"",
	"\"=\"",
	"\"!\"",
	"\">\"",
	"\"-\"",
	"\"+\"",
	"\"/\"",
	"\"~\"",
	"dimension",
	"function token",
	"hash token",
	"identifier",
	"number",
	"\"{\"",
	"\"[\"",
	"\"(\"",
	"percentage",
	"\";\"",
	"string token",
	"URL token",
	"whitespace",

	// "TSymbol" tokens display as plain identifiers
	"identifier",
}

func (t T) String() string {
	return tokenToString[t]
}

func (t T) IsNumeric() bool {
	return t == TNumber || t == TPercentage || t == TDimension
}

type TokenFlags uint8

const (
	IsID TokenFlags = 1 << iota
	DidWarnAboutSingleLineComment
)

// This token struct is designed to be memory-efficient. It just references a
// range in the input file instead of directly containing the substring of
// text, since a range takes up less memory than a string.
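//
// For example, in source contents "a { color: red }" the "color" identifier
// would be represented roughly as a Token with Kind TIdent and
// Range{Loc: logger.Loc{Start: 4}, Len: 5}; the text itself is recovered on
// demand via DecodedText below (a sketch, not stored data).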
type Token struct {
	Range      logger.Range // 8 bytes
	UnitOffset uint16       // 2 bytes
	Kind       T            // 1 byte
	Flags      TokenFlags   // 1 byte
}

func (token Token) DecodedText(contents string) string {
	raw := contents[token.Range.Loc.Start:token.Range.End()]

	switch token.Kind {
	case TIdent, TDimension:
		return decodeEscapesInToken(raw)

	case TAtKeyword, THash:
		return decodeEscapesInToken(raw[1:])

	case TFunction:
		return decodeEscapesInToken(raw[:len(raw)-1])

	case TString:
		return decodeEscapesInToken(raw[1 : len(raw)-1])

	case TURL:
		start := 4
		end := len(raw)

		// Note: URL tokens with syntax errors may not have a trailing ")"
		if raw[end-1] == ')' {
			end--
		}

		// Trim leading and trailing whitespace
		for start < end && isWhitespace(rune(raw[start])) {
			start++
		}
		for start < end && isWhitespace(rune(raw[end-1])) {
			end--
		}

		return decodeEscapesInToken(raw[start:end])
	}

	return raw
}
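
// To illustrate the rules above with a few raw token texts (a sketch only):
//
//	TURL        url( image.png )  =>  image.png
//	TString     "it\'s"           =>  it's
//	THash       #ff0              =>  ff0
//	TAtKeyword  @media            =>  media
//	TFunction   calc(             =>  calc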

type lexer struct {
	Options
	log                     logger.Log
	source                  logger.Source
	allComments             []logger.Range
	legalCommentsBefore     []Comment
	sourceMappingURL        logger.Span
	tracker                 logger.LineColumnTracker
	approximateNewlineCount int
	current                 int
	oldSingleLineCommentEnd logger.Loc
	codePoint               rune
	Token                   Token
}

type Comment struct {
	Text            string
	Loc             logger.Loc
	TokenIndexAfter uint32
}

type TokenizeResult struct {
	Tokens               []Token
	AllComments          []logger.Range
	LegalComments        []Comment
	SourceMapComment     logger.Span
	ApproximateLineCount int32
}

type Options struct {
	RecordAllComments bool
}

func Tokenize(log logger.Log, source logger.Source, options Options) TokenizeResult {
	lexer := lexer{
		Options: options,
		log:     log,
		source:  source,
		tracker: logger.MakeLineColumnTracker(&source),
	}
	lexer.step()

	// The U+FEFF character is usually a zero-width non-breaking space. However,
	// when it's used at the start of a text stream it is called a BOM (byte order
	// mark) instead and indicates that the text stream is UTF-8 encoded. This is
	// problematic for us because CSS does not treat U+FEFF as whitespace. Only
	// " \t\r\n\f" characters are treated as whitespace. Skip over the BOM if it
	// is present so it doesn't cause us trouble when we try to parse it.
	if lexer.codePoint == '\uFEFF' {
		lexer.step()
	}

	lexer.next()
	var tokens []Token
	var legalComments []Comment
	for lexer.Token.Kind != TEndOfFile {
		if lexer.legalCommentsBefore != nil {
			for _, comment := range lexer.legalCommentsBefore {
				comment.TokenIndexAfter = uint32(len(tokens))
				legalComments = append(legalComments, comment)
			}
			lexer.legalCommentsBefore = nil
		}
		tokens = append(tokens, lexer.Token)
		lexer.next()
	}
	if lexer.legalCommentsBefore != nil {
		for _, comment := range lexer.legalCommentsBefore {
			comment.TokenIndexAfter = uint32(len(tokens))
			legalComments = append(legalComments, comment)
		}
		lexer.legalCommentsBefore = nil
	}
	return TokenizeResult{
		Tokens:               tokens,
		AllComments:          lexer.allComments,
		LegalComments:        legalComments,
		ApproximateLineCount: int32(lexer.approximateNewlineCount) + 1,
		SourceMapComment:     lexer.sourceMappingURL,
	}
}
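
// A minimal usage sketch (assuming a logger.Log and logger.Source were
// constructed elsewhere; only names defined in this file are otherwise used):
//
//	result := Tokenize(log, source, Options{RecordAllComments: false})
//	for _, t := range result.Tokens {
//		fmt.Println(t.Kind, t.DecodedText(source.Contents))
//	}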

func (lexer *lexer) step() {
	codePoint, width := utf8.DecodeRuneInString(lexer.source.Contents[lexer.current:])

	// Use -1 to indicate the end of the file
	if width == 0 {
		codePoint = eof
	}

	// Track the approximate number of newlines in the file so we can preallocate
	// the line offset table in the printer for source maps. The line offset table
	// is the #1 highest allocation in the heap profile, so this is worth doing.
	// This count is approximate because it handles "\n" and "\r\n" (the common
	// cases) but not "\r" or "\u2028" or "\u2029". Getting this wrong is harmless
	// because it's only a preallocation. The array will just grow if it's too small.
	if codePoint == '\n' {
		lexer.approximateNewlineCount++
	}

	lexer.codePoint = codePoint
	lexer.Token.Range.Len = int32(lexer.current) - lexer.Token.Range.Loc.Start
	lexer.current += width
}
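
// A note on the bookkeeping above: "codePoint" always holds the most recently
// decoded rune while "current" points past it. For example, stepping twice
// through "ab" from a fresh token yields codePoint 'a' with Range.Len 0, then
// codePoint 'b' with Range.Len 1, so Len always measures up to (but not
// including) the current code point.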

func (lexer *lexer) next() {
	// Reference: https://www.w3.org/TR/css-syntax-3/

	for {
		lexer.Token = Token{Range: logger.Range{Loc: logger.Loc{Start: lexer.Token.Range.End()}}}

		switch lexer.codePoint {
		case eof:
			lexer.Token.Kind = TEndOfFile

		case '/':
			lexer.step()
			switch lexer.codePoint {
			case '*':
				lexer.step()
				lexer.consumeToEndOfMultiLineComment(lexer.Token.Range)
				continue
			case '/':
				// Warn when people use "//" comments, which are invalid in CSS
				loc := lexer.Token.Range.Loc
				if loc.Start >= lexer.oldSingleLineCommentEnd.Start {
					contents := lexer.source.Contents
					end := lexer.current
					for end < len(contents) && !isNewline(rune(contents[end])) {
						end++
					}
					lexer.log.AddID(logger.MsgID_CSS_JSCommentInCSS, logger.Warning, &lexer.tracker, logger.Range{Loc: loc, Len: 2},
						"Comments in CSS use \"/* ... */\" instead of \"//\"")
					lexer.oldSingleLineCommentEnd.Start = int32(end)
					lexer.Token.Flags |= DidWarnAboutSingleLineComment
				}
			}
			lexer.Token.Kind = TDelimSlash

		case ' ', '\t', '\n', '\r', '\f':
			lexer.step()
			for {
				if isWhitespace(lexer.codePoint) {
					lexer.step()
				} else if lexer.codePoint == '/' && lexer.current < len(lexer.source.Contents) && lexer.source.Contents[lexer.current] == '*' {
					startRange := logger.Range{Loc: logger.Loc{Start: lexer.Token.Range.End()}, Len: 2}
					lexer.step()
					lexer.step()
					lexer.consumeToEndOfMultiLineComment(startRange)
				} else {
					break
				}
			}
			lexer.Token.Kind = TWhitespace

		case '"', '\'':
			lexer.Token.Kind = lexer.consumeString()

		case '#':
			lexer.step()
			if IsNameContinue(lexer.codePoint) || lexer.isValidEscape() {
				lexer.Token.Kind = THash
				if lexer.wouldStartIdentifier() {
					lexer.Token.Flags |= IsID
				}
				lexer.consumeName()
			} else {
				lexer.Token.Kind = TDelim
			}

		case '(':
			lexer.step()
			lexer.Token.Kind = TOpenParen

		case ')':
			lexer.step()
			lexer.Token.Kind = TCloseParen

		case '[':
			lexer.step()
			lexer.Token.Kind = TOpenBracket

		case ']':
			lexer.step()
			lexer.Token.Kind = TCloseBracket

		case '{':
			lexer.step()
			lexer.Token.Kind = TOpenBrace

		case '}':
			lexer.step()
			lexer.Token.Kind = TCloseBrace

		case ',':
			lexer.step()
			lexer.Token.Kind = TComma

		case ':':
			lexer.step()
			lexer.Token.Kind = TColon

		case ';':
			lexer.step()
			lexer.Token.Kind = TSemicolon

		case '+':
			if lexer.wouldStartNumber() {
				lexer.Token.Kind = lexer.consumeNumeric()
			} else {
				lexer.step()
				lexer.Token.Kind = TDelimPlus
			}

		case '.':
			if lexer.wouldStartNumber() {
				lexer.Token.Kind = lexer.consumeNumeric()
			} else {
				lexer.step()
				lexer.Token.Kind = TDelimDot
			}

		case '-':
			if lexer.wouldStartNumber() {
				lexer.Token.Kind = lexer.consumeNumeric()
			} else if lexer.current+2 <= len(lexer.source.Contents) && lexer.source.Contents[lexer.current:lexer.current+2] == "->" {
				lexer.step()
				lexer.step()
				lexer.step()
				lexer.Token.Kind = TCDC
			} else if lexer.wouldStartIdentifier() {
				lexer.Token.Kind = lexer.consumeIdentLike()
			} else {
				lexer.step()
				lexer.Token.Kind = TDelimMinus
			}

		case '<':
			if lexer.current+3 <= len(lexer.source.Contents) && lexer.source.Contents[lexer.current:lexer.current+3] == "!--" {
				lexer.step()
				lexer.step()
				lexer.step()
				lexer.step()
				lexer.Token.Kind = TCDO
			} else {
				lexer.step()
				lexer.Token.Kind = TDelim
			}

		case '@':
			lexer.step()
			if lexer.wouldStartIdentifier() {
				lexer.consumeName()
				lexer.Token.Kind = TAtKeyword
			} else {
				lexer.Token.Kind = TDelim
			}

		case '\\':
			if lexer.isValidEscape() {
				lexer.Token.Kind = lexer.consumeIdentLike()
			} else {
				lexer.step()
				lexer.log.AddError(&lexer.tracker, lexer.Token.Range, "Invalid escape")
				lexer.Token.Kind = TDelim
			}

		case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9':
			lexer.Token.Kind = lexer.consumeNumeric()

		case '>':
			lexer.step()
			lexer.Token.Kind = TDelimGreaterThan

		case '~':
			lexer.step()
			lexer.Token.Kind = TDelimTilde

		case '&':
			lexer.step()
			lexer.Token.Kind = TDelimAmpersand

		case '*':
			lexer.step()
			lexer.Token.Kind = TDelimAsterisk

		case '|':
			lexer.step()
			lexer.Token.Kind = TDelimBar

		case '!':
			lexer.step()
			lexer.Token.Kind = TDelimExclamation

		case '=':
			lexer.step()
			lexer.Token.Kind = TDelimEquals

		case '^':
			lexer.step()
			lexer.Token.Kind = TDelimCaret

		case '$':
			lexer.step()
			lexer.Token.Kind = TDelimDollar

		default:
			if IsNameStart(lexer.codePoint) {
				lexer.Token.Kind = lexer.consumeIdentLike()
			} else {
				lexer.step()
				lexer.Token.Kind = TDelim
			}
		}

		return
	}
}

func (lexer *lexer) consumeToEndOfMultiLineComment(startRange logger.Range) {
	startOfSourceMappingURL := 0
	isLegalComment := false

	switch lexer.codePoint {
	case '#', '@':
		// Keep track of the contents of the "sourceMappingURL=" comment
		if strings.HasPrefix(lexer.source.Contents[lexer.current:], " sourceMappingURL=") {
			startOfSourceMappingURL = lexer.current + len(" sourceMappingURL=")
		}

	case '!':
		// Remember if this is a legal comment
		isLegalComment = true
	}

	for {
		switch lexer.codePoint {
		case '*':
			endOfSourceMappingURL := lexer.current - 1
			lexer.step()
			if lexer.codePoint == '/' {
				commentEnd := lexer.current
				lexer.step()

				// Record the source mapping URL
				if startOfSourceMappingURL != 0 {
					r := logger.Range{Loc: logger.Loc{Start: int32(startOfSourceMappingURL)}}
					text := lexer.source.Contents[startOfSourceMappingURL:endOfSourceMappingURL]
					for int(r.Len) < len(text) && !isWhitespace(rune(text[r.Len])) {
						r.Len++
					}
					lexer.sourceMappingURL = logger.Span{Text: text[:r.Len], Range: r}
				}

				// Record all comments
				commentRange := logger.Range{Loc: startRange.Loc, Len: int32(commentEnd) - startRange.Loc.Start}
				if lexer.RecordAllComments {
					lexer.allComments = append(lexer.allComments, commentRange)
				}

				// Record legal comments
				if text := lexer.source.Contents[startRange.Loc.Start:commentEnd]; isLegalComment || containsAtPreserveOrAtLicense(text) {
					text = lexer.source.CommentTextWithoutIndent(commentRange)
					lexer.legalCommentsBefore = append(lexer.legalCommentsBefore, Comment{Loc: startRange.Loc, Text: text})
				}
				return
			}

		case eof: // This indicates the end of the file
			lexer.log.AddErrorWithNotes(&lexer.tracker, logger.Range{Loc: logger.Loc{Start: lexer.Token.Range.End()}},
				"Expected \"*/\" to terminate multi-line comment",
				[]logger.MsgData{lexer.tracker.MsgData(startRange, "The multi-line comment starts here:")})
			return

		default:
			lexer.step()
		}
	}
}

func containsAtPreserveOrAtLicense(text string) bool {
	for i, c := range text {
		if c == '@' && (strings.HasPrefix(text[i+1:], "preserve") || strings.HasPrefix(text[i+1:], "license")) {
			return true
		}
	}
	return false
}
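
// For example, both of the following comments are preserved as legal
// comments, the first because it starts with "!" and the second because it
// contains "@license":
//
//	/*! some notice */
//	/* @license MIT */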

func (lexer *lexer) isValidEscape() bool {
	if lexer.codePoint != '\\' {
		return false
	}
	c, _ := utf8.DecodeRuneInString(lexer.source.Contents[lexer.current:])
	return !isNewline(c)
}

func (lexer *lexer) wouldStartIdentifier() bool {
	if IsNameStart(lexer.codePoint) {
		return true
	}

	if lexer.codePoint == '-' {
		c, width := utf8.DecodeRuneInString(lexer.source.Contents[lexer.current:])
		if c == utf8.RuneError && width <= 1 {
			return false // Decoding error
		}
		if IsNameStart(c) || c == '-' {
			return true
		}
		if c == '\\' {
			c2, _ := utf8.DecodeRuneInString(lexer.source.Contents[lexer.current+width:])
			return !isNewline(c2)
		}
		return false
	}

	return lexer.isValidEscape()
}

func WouldStartIdentifierWithoutEscapes(text string) bool {
	c, width := utf8.DecodeRuneInString(text)
	if c == utf8.RuneError && width <= 1 {
		return false // Decoding error
	}
	if IsNameStart(c) {
		return true
	}

	if c == '-' {
		c2, width2 := utf8.DecodeRuneInString(text[width:])
		if c2 == utf8.RuneError && width2 <= 1 {
			return false // Decoding error
		}
		if IsNameStart(c2) || c2 == '-' {
			return true
		}
	}
	return false
}

func RangeOfIdentifier(source logger.Source, loc logger.Loc) logger.Range {
	text := source.Contents[loc.Start:]
	if len(text) == 0 {
		return logger.Range{Loc: loc, Len: 0}
	}

	i := 0
	n := len(text)

	for {
		c, width := utf8.DecodeRuneInString(text[i:])
		if IsNameContinue(c) {
			i += width
			continue
		}

		// Handle an escape
		if c == '\\' && i+1 < n && !isNewline(rune(text[i+1])) {
			i += width // Skip the backslash
			c, width = utf8.DecodeRuneInString(text[i:])
			if _, ok := isHex(c); ok {
				i += width
				c, width = utf8.DecodeRuneInString(text[i:])
				for j := 0; j < 5; j++ {
					if _, ok := isHex(c); !ok {
						break
					}
					i += width
					c, width = utf8.DecodeRuneInString(text[i:])
				}
				if isWhitespace(c) {
					i += width
				}
			}
			continue
		}

		break
	}

	// Don't end with whitespace
	if i > 0 && isWhitespace(rune(text[i-1])) {
		i--
	}

	return logger.Range{Loc: loc, Len: int32(i)}
}
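
// As an illustration, for source contents "\66 oo: bar" and loc at offset 0,
// the returned range covers the six bytes "\66 oo" (a backslash, two hex
// digits, the whitespace terminator, then "oo"), which is the escaped
// spelling of the identifier "foo".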

func (lexer *lexer) wouldStartNumber() bool {
	if lexer.codePoint >= '0' && lexer.codePoint <= '9' {
		return true
	} else if lexer.codePoint == '.' {
		contents := lexer.source.Contents
		if lexer.current < len(contents) {
			c := contents[lexer.current]
			return c >= '0' && c <= '9'
		}
	} else if lexer.codePoint == '+' || lexer.codePoint == '-' {
		contents := lexer.source.Contents
		n := len(contents)
		if lexer.current < n {
			c := contents[lexer.current]
			if c >= '0' && c <= '9' {
				return true
			}
			if c == '.' && lexer.current+1 < n {
				c = contents[lexer.current+1]
				return c >= '0' && c <= '9'
			}
		}
	}
	return false
}

// Note: This function is hot in profiles
func (lexer *lexer) consumeName() string {
	// Common case: no escapes, identifier is a substring of the input. Doing this
	// in a tight loop that avoids UTF-8 decoding and that increments a single
	// number instead of doing "step()" is noticeably faster. For example, doing
	// this sped up end-to-end parsing and printing of a large CSS file from 97ms
	// to 84ms (around 15% faster).
	contents := lexer.source.Contents
	if IsNameContinue(lexer.codePoint) {
		n := len(contents)
		i := lexer.current
		for i < n && IsNameContinue(rune(contents[i])) {
			i++
		}
		lexer.current = i
		lexer.step()
	}
	raw := contents[lexer.Token.Range.Loc.Start:lexer.Token.Range.End()]
	if !lexer.isValidEscape() {
		return raw
	}

	// Uncommon case: escapes, identifier is allocated
	sb := strings.Builder{}
	sb.WriteString(raw)
	sb.WriteRune(lexer.consumeEscape())
	for {
		if IsNameContinue(lexer.codePoint) {
			sb.WriteRune(lexer.codePoint)
			lexer.step()
		} else if lexer.isValidEscape() {
			sb.WriteRune(lexer.consumeEscape())
		} else {
			break
		}
	}
	return sb.String()
}

func (lexer *lexer) consumeEscape() rune {
	lexer.step() // Skip the backslash
	c := lexer.codePoint

	if hex, ok := isHex(c); ok {
		lexer.step()
		for i := 0; i < 5; i++ {
			if next, ok := isHex(lexer.codePoint); ok {
				lexer.step()
				hex = hex*16 + next
			} else {
				break
			}
		}
		if isWhitespace(lexer.codePoint) {
			lexer.step()
		}
		if hex == 0 || (hex >= 0xD800 && hex <= 0xDFFF) || hex > 0x10FFFF {
			return utf8.RuneError
		}
		return rune(hex)
	}

	if c == eof {
		return utf8.RuneError
	}

	lexer.step()
	return c
}
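
// A few illustrative escape sequences for the function above: "\41 " decodes
// to 'A' (up to six hex digits plus one optional whitespace terminator),
// "\D800" decodes to utf8.RuneError because it's a surrogate code point, and
// "\+" decodes to '+' because a non-hex character simply escapes to itself.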

func (lexer *lexer) consumeIdentLike() T {
	name := lexer.consumeName()

	if lexer.codePoint == '(' {
		matchingLoc := logger.Loc{Start: lexer.Token.Range.End()}
		lexer.step()
		if len(name) == 3 {
			u, r, l := name[0], name[1], name[2]
			if (u == 'u' || u == 'U') && (r == 'r' || r == 'R') && (l == 'l' || l == 'L') {
				// Save state
				approximateNewlineCount := lexer.approximateNewlineCount
				codePoint := lexer.codePoint
				tokenRangeLen := lexer.Token.Range.Len
				current := lexer.current

				// Check to see if this is a URL token instead of a function
				for isWhitespace(lexer.codePoint) {
					lexer.step()
				}
				if lexer.codePoint != '"' && lexer.codePoint != '\'' {
					return lexer.consumeURL(matchingLoc)
				}

				// Restore state (i.e. backtrack)
				lexer.approximateNewlineCount = approximateNewlineCount
				lexer.codePoint = codePoint
				lexer.Token.Range.Len = tokenRangeLen
				lexer.current = current
			}
		}
		return TFunction
	}

	return TIdent
}
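
// Disambiguation examples for the backtracking above (a sketch):
//
//	url(image.png)    =>  a single TURL token
//	url("image.png")  =>  TFunction, then a TString (the quoted form is
//	                      lexed as an ordinary function call)
//	calc(             =>  TFunction
//	red               =>  TIdent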

func (lexer *lexer) consumeURL(matchingLoc logger.Loc) T {
validURL:
	for {
		switch lexer.codePoint {
		case ')':
			lexer.step()
			return TURL

		case eof:
			loc := logger.Loc{Start: lexer.Token.Range.End()}
			lexer.log.AddIDWithNotes(logger.MsgID_CSS_CSSSyntaxError, logger.Warning, &lexer.tracker, logger.Range{Loc: loc}, "Expected \")\" to end URL token",
				[]logger.MsgData{lexer.tracker.MsgData(logger.Range{Loc: matchingLoc, Len: 1}, "The unbalanced \"(\" is here:")})
			return TURL

		case ' ', '\t', '\n', '\r', '\f':
			lexer.step()
			for isWhitespace(lexer.codePoint) {
				lexer.step()
			}
			if lexer.codePoint != ')' {
				loc := logger.Loc{Start: lexer.Token.Range.End()}
				lexer.log.AddIDWithNotes(logger.MsgID_CSS_CSSSyntaxError, logger.Warning, &lexer.tracker, logger.Range{Loc: loc}, "Expected \")\" to end URL token",
					[]logger.MsgData{lexer.tracker.MsgData(logger.Range{Loc: matchingLoc, Len: 1}, "The unbalanced \"(\" is here:")})
				if lexer.codePoint == eof {
					return TURL
				}
				break validURL
			}
			lexer.step()
			return TURL

		case '"', '\'', '(':
			r := logger.Range{Loc: logger.Loc{Start: lexer.Token.Range.End()}, Len: 1}
			lexer.log.AddIDWithNotes(logger.MsgID_CSS_CSSSyntaxError, logger.Warning, &lexer.tracker, r, "Expected \")\" to end URL token",
				[]logger.MsgData{lexer.tracker.MsgData(logger.Range{Loc: matchingLoc, Len: 1}, "The unbalanced \"(\" is here:")})
			break validURL

		case '\\':
			if !lexer.isValidEscape() {
				r := logger.Range{Loc: logger.Loc{Start: lexer.Token.Range.End()}, Len: 1}
				lexer.log.AddID(logger.MsgID_CSS_CSSSyntaxError, logger.Warning, &lexer.tracker, r, "Invalid escape")
				break validURL
			}
			lexer.consumeEscape()

		default:
			if isNonPrintable(lexer.codePoint) {
				r := logger.Range{Loc: logger.Loc{Start: lexer.Token.Range.End()}, Len: 1}
				lexer.log.AddID(logger.MsgID_CSS_CSSSyntaxError, logger.Warning, &lexer.tracker, r, "Unexpected non-printable character in URL token")
				break validURL
			}
			lexer.step()
		}
	}

	// Consume the remnants of a bad URL
	for {
		switch lexer.codePoint {
		case ')', eof:
			lexer.step()
			return TBadURL

		case '\\':
			if lexer.isValidEscape() {
				lexer.consumeEscape()
			}
		}
		lexer.step()
	}
}

func (lexer *lexer) consumeString() T {
	quote := lexer.codePoint
	lexer.step()

	for {
		switch lexer.codePoint {
		case '\\':
			lexer.step()

			// Handle Windows CRLF
			if lexer.codePoint == '\r' {
				lexer.step()
				if lexer.codePoint == '\n' {
					lexer.step()
				}
				continue
			}

			// Otherwise, fall out of the switch so the character after the
			// backslash is consumed by the "step()" below (even if it's a quote)

		case eof, '\n', '\r', '\f':
			lexer.log.AddID(logger.MsgID_CSS_CSSSyntaxError, logger.Warning, &lexer.tracker,
				logger.Range{Loc: logger.Loc{Start: lexer.Token.Range.End()}},
				"Unterminated string token")
			return TUnterminatedString

		case quote:
			lexer.step()
			return TString
	}
		lexer.step()
	}
}
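
// Note the line-continuation behavior above: a backslash before a newline
// keeps the string going, so the two-line input
//
//	"foo\
//	bar"
//
// lexes as one TString whose decoded text is "foobar", while a bare newline
// inside a string produces TUnterminatedString instead.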

func (lexer *lexer) consumeNumeric() T {
	// Skip over leading sign
	if lexer.codePoint == '+' || lexer.codePoint == '-' {
		lexer.step()
	}

	// Skip over leading digits
	for lexer.codePoint >= '0' && lexer.codePoint <= '9' {
		lexer.step()
	}

	// Skip over digits after dot
	if lexer.codePoint == '.' {
		lexer.step()
		for lexer.codePoint >= '0' && lexer.codePoint <= '9' {
			lexer.step()
		}
	}

	// Skip over exponent
	if lexer.codePoint == 'e' || lexer.codePoint == 'E' {
		contents := lexer.source.Contents

		// Look ahead before advancing to make sure this is an exponent, not a unit
		if lexer.current < len(contents) {
			c := contents[lexer.current]
			if (c == '+' || c == '-') && lexer.current+1 < len(contents) {
				c = contents[lexer.current+1]
			}

			// Only consume this if it's an exponent
			if c >= '0' && c <= '9' {
				lexer.step()
				if lexer.codePoint == '+' || lexer.codePoint == '-' {
					lexer.step()
				}
				for lexer.codePoint >= '0' && lexer.codePoint <= '9' {
					lexer.step()
				}
			}
		}
	}

	// Determine the numeric type
	if lexer.wouldStartIdentifier() {
		lexer.Token.UnitOffset = uint16(lexer.Token.Range.Len)
		lexer.consumeName()
		return TDimension
	}
	if lexer.codePoint == '%' {
		lexer.step()
		return TPercentage
	}
	return TNumber
}
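
// Classification examples for the scanner above: "3.14" is TNumber, "50%" is
// TPercentage, and "12px" is TDimension with UnitOffset 2. The exponent
// lookahead means "1e2" is TNumber while "1em" is TDimension ("1" plus the
// unit "em"), since "e" only counts as an exponent when digits follow it.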

func IsNameStart(c rune) bool {
	return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '_' || c >= 0x80 || c == '\x00'
}

func IsNameContinue(c rune) bool {
	return IsNameStart(c) || (c >= '0' && c <= '9') || c == '-'
}

func isNewline(c rune) bool {
	switch c {
	case '\n', '\r', '\f':
		return true
	}
	return false
}

func isWhitespace(c rune) bool {
	switch c {
	case ' ', '\t', '\n', '\r', '\f':
		return true
	}
	return false
}

func isHex(c rune) (int, bool) {
	if c >= '0' && c <= '9' {
		return int(c - '0'), true
	}
	if c >= 'a' && c <= 'f' {
		return int(c + (10 - 'a')), true
	}
	if c >= 'A' && c <= 'F' {
		return int(c + (10 - 'A')), true
	}
	return 0, false
}

func isNonPrintable(c rune) bool {
	return c <= 0x08 || c == 0x0B || (c >= 0x0E && c <= 0x1F) || c == 0x7F
}

func decodeEscapesInToken(inner string) string {
	i := 0

	for i < len(inner) {
		if c := inner[i]; c == '\\' || c == '\x00' {
			break
		}
		i++
	}

	if i == len(inner) {
		return inner
	}

	sb := strings.Builder{}
	sb.WriteString(inner[:i])
	inner = inner[i:]

	for len(inner) > 0 {
		c, width := utf8.DecodeRuneInString(inner)
		inner = inner[width:]

		if c != '\\' {
			if c == '\x00' {
				c = utf8.RuneError
			}
			sb.WriteRune(c)
			continue
		}

		if len(inner) == 0 {
			sb.WriteRune(utf8.RuneError)
			continue
		}

		c, width = utf8.DecodeRuneInString(inner)
		inner = inner[width:]
		hex, ok := isHex(c)

		if !ok {
			if c == '\n' || c == '\f' {
				continue
			}

			// Handle Windows CRLF
			if c == '\r' {
				c, width = utf8.DecodeRuneInString(inner)
				if c == '\n' {
					inner = inner[width:]
				}
				continue
			}

			// If we get here, this is not a valid escape. However, this is still
			// allowed. In this case the backslash is just ignored.
			sb.WriteRune(c)
			continue
		}

		// Parse up to five additional hex characters (so six in total)
		for i := 0; i < 5 && len(inner) > 0; i++ {
			c, width = utf8.DecodeRuneInString(inner)
			if next, ok := isHex(c); ok {
				inner = inner[width:]
				hex = hex*16 + next
			} else {
				break
			}
		}

		if len(inner) > 0 {
			c, width = utf8.DecodeRuneInString(inner)
			if isWhitespace(c) {
				inner = inner[width:]
			}
		}

		if hex == 0 || (hex >= 0xD800 && hex <= 0xDFFF) || hex > 0x10FFFF {
			sb.WriteRune(utf8.RuneError)
			continue
		}

		sb.WriteRune(rune(hex))
	}

	return sb.String()
}
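
// A few illustrative inputs for the decoder above (a documentation sketch,
// not part of esbuild's API):
//
//	decodeEscapesInToken(`\66 oo`)  // "foo": hex escape for 'f' plus its whitespace terminator
//	decodeEscapesInToken("a\x00b")  // "a\uFFFDb": NULL is replaced with U+FFFD
//	decodeEscapesInToken("plain")   // "plain": fast path, returned without allocation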