github.com/insionng/yougam@v0.0.0-20170714101924-2bc18d833463/libraries/flosch/pongo2.v3/lexer.go

package pongo2

import (
	"fmt"
	"strings"
	"unicode/utf8"
)

// Token types emitted by the lexer. EOF doubles as the sentinel rune
// returned by next() once the input is exhausted.
const (
	TokenError = iota
	EOF

	TokenHTML

	TokenKeyword
	TokenIdentifier
	TokenString
	TokenNumber
	TokenSymbol
)

var (
	tokenSpaceChars                = " \n\r\t"
	tokenIdentifierChars           = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_"
	tokenIdentifierCharsWithDigits = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_0123456789"
	tokenDigits                    = "0123456789"

	// TokenSymbols lists the symbols available in pongo2 (within filters/tags).
	// Longer symbols must be listed before shorter ones so that e.g. "==" is
	// matched before "=".
	TokenSymbols = []string{
		// 3-Char symbols

		// 2-Char symbols
		"==", ">=", "<=", "&&", "||", "{{", "}}", "{%", "%}", "!=", "<>",

		// 1-Char symbols
		"(", ")", "+", "-", "*", "<", ">", "/", "^", ",", ".", "!", "|", ":", "=", "%",
	}

	// TokenKeywords lists the keywords available in pongo2.
	TokenKeywords = []string{"in", "and", "or", "not", "true", "false", "as", "export"}
)
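
// Illustrative sketch (not part of the original file): lexing the template
// fragment `{{ name|lower }}` yields roughly this token stream:
//
//	Symbol "{{", Identifier "name", Symbol "|", Identifier "lower", Symbol "}}"
//
// while words listed in TokenKeywords ("in", "and", ...) would be emitted
// as TokenKeyword instead of TokenIdentifier.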

// TokenType identifies the kind of a lexed token.
type TokenType int

// Token is a single lexed token together with its position in the input.
type Token struct {
	Filename string
	Typ      TokenType
	Val      string
	Line     int
	Col      int
}

// lexerStateFn is a state of the lexer's state machine; it returns the
// next state, or nil to stop.
type lexerStateFn func() lexerStateFn

type lexer struct {
	name      string
	input     string
	start     int // start pos of the item
	pos       int // current pos
	width     int // width of last rune
	tokens    []*Token
	errored   bool
	startline int
	startcol  int
	line      int
	col       int

	inVerbatim   bool
	verbatimName string
}

// String returns a human-readable description of the token; long values
// are truncated for display.
func (t *Token) String() string {
	val := t.Val
	if len(val) > 1000 {
		val = fmt.Sprintf("%s...%s", val[:10], val[len(val)-5:])
	}

	typ := ""
	switch t.Typ {
	case TokenHTML:
		typ = "HTML"
	case TokenError:
		typ = "Error"
	case TokenIdentifier:
		typ = "Identifier"
	case TokenKeyword:
		typ = "Keyword"
	case TokenNumber:
		typ = "Number"
	case TokenString:
		typ = "String"
	case TokenSymbol:
		typ = "Symbol"
	default:
		typ = "Unknown"
	}

	return fmt.Sprintf("<Token Typ=%s (%d) Val='%s' Line=%d Col=%d>",
		typ, t.Typ, val, t.Line, t.Col)
}
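
// For example (illustrative), a token for the identifier "name" on line 1,
// column 4 prints as:
//
//	<Token Typ=Identifier (4) Val='name' Line=1 Col=4>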

// lex tokenizes input and returns the token stream, or an *Error describing
// the first lexing failure.
func lex(name string, input string) ([]*Token, *Error) {
	l := &lexer{
		name:      name,
		input:     input,
		tokens:    make([]*Token, 0, 100),
		line:      1,
		col:       1,
		startline: 1,
		startcol:  1,
	}
	l.run()
	if l.errored {
		errtoken := l.tokens[len(l.tokens)-1]
		return nil, &Error{
			Filename: name,
			Line:     errtoken.Line,
			Column:   errtoken.Col,
			Sender:   "lexer",
			ErrorMsg: errtoken.Val,
		}
	}
	return l.tokens, nil
}
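
// A minimal usage sketch (illustrative, inputs assumed):
//
//	tokens, err := lex("example.html", "Hello {{ name }}!")
//	if err != nil {
//		return // err carries filename, line and column of the failure
//	}
//	for _, t := range tokens {
//		fmt.Println(t) // e.g. <Token Typ=HTML (2) Val='Hello ' Line=1 Col=1>
//	}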

// value returns the input consumed for the current item.
func (l *lexer) value() string {
	return l.input[l.start:l.pos]
}

// length returns the byte length of the current item.
func (l *lexer) length() int {
	return l.pos - l.start
}

// emit appends a token of type t holding the current item's value and
// starts a new item.
func (l *lexer) emit(t TokenType) {
	tok := &Token{
		Filename: l.name,
		Typ:      t,
		Val:      l.value(),
		Line:     l.startline,
		Col:      l.startcol,
	}

	if t == TokenString {
		// Resolve the escape sequences \" and \\ in strings
		tok.Val = strings.Replace(tok.Val, `\"`, `"`, -1)
		tok.Val = strings.Replace(tok.Val, `\\`, `\`, -1)
	}

	l.tokens = append(l.tokens, tok)
	l.start = l.pos
	l.startline = l.line
	l.startcol = l.col
}

// next consumes and returns the next rune, or EOF once the input is
// exhausted.
func (l *lexer) next() rune {
	if l.pos >= len(l.input) {
		l.width = 0
		return EOF
	}
	r, w := utf8.DecodeRuneInString(l.input[l.pos:])
	l.width = w
	l.pos += l.width
	l.col += l.width
	return r
}

// backup steps back one rune; it may only be called once per call to next.
func (l *lexer) backup() {
	l.pos -= l.width
	l.col -= l.width
}

// peek returns the next rune without consuming it.
func (l *lexer) peek() rune {
	r := l.next()
	l.backup()
	return r
}

// ignore discards the current item.
func (l *lexer) ignore() {
	l.start = l.pos
	l.startline = l.line
	l.startcol = l.col
}

// accept consumes the next rune if it is contained in what.
func (l *lexer) accept(what string) bool {
	if strings.IndexRune(what, l.next()) >= 0 {
		return true
	}
	l.backup()
	return false
}

// acceptRun consumes runes as long as they are contained in what.
func (l *lexer) acceptRun(what string) {
	for strings.IndexRune(what, l.next()) >= 0 {
	}
	l.backup()
}
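
// For instance (illustrative): with the remaining input "123abc",
// acceptRun(tokenDigits) consumes "123" and stops at 'a'; a subsequent
// accept(tokenDigits) returns false without consuming anything.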

// errorf appends a TokenError carrying the formatted message and aborts the
// state machine by returning nil as the next state.
func (l *lexer) errorf(format string, args ...interface{}) lexerStateFn {
	t := &Token{
		Filename: l.name,
		Typ:      TokenError,
		Val:      fmt.Sprintf(format, args...),
		Line:     l.startline,
		Col:      l.startcol,
	}
	l.tokens = append(l.tokens, t)
	l.errored = true
	l.startline = l.line
	l.startcol = l.col
	return nil
}

// eof reports whether the whole input has been consumed.
func (l *lexer) eof() bool {
	return l.start >= len(l.input)
}

// run is the lexer's top-level loop: it copies plain HTML through, skips
// {# ... #} comments, honors {% verbatim %} blocks and hands tag/variable
// contents over to tokenize().
func (l *lexer) run() {
	for {
		// TODO: Support verbatim tag names
		// https://docs.djangoproject.com/en/dev/ref/templates/builtins/#verbatim
		if l.inVerbatim {
			name := l.verbatimName
			if name != "" {
				name += " "
			}
			endtag := fmt.Sprintf("{%% endverbatim %s%%}", name)
			if strings.HasPrefix(l.input[l.pos:], endtag) { // end verbatim
				if l.pos > l.start {
					l.emit(TokenHTML)
				}
				// Skip the whole matched end tag, including any tag name
				w := len(endtag)
				l.pos += w
				l.col += w
				l.ignore()
				l.inVerbatim = false
			}
		} else if strings.HasPrefix(l.input[l.pos:], "{% verbatim %}") { // tag
			if l.pos > l.start {
				l.emit(TokenHTML)
			}
			l.inVerbatim = true
			w := len("{% verbatim %}")
			l.pos += w
			l.col += w
			l.ignore()
		}

		if !l.inVerbatim {
			// Ignore single-line comments {# ... #}
			if strings.HasPrefix(l.input[l.pos:], "{#") {
				if l.pos > l.start {
					l.emit(TokenHTML)
				}

				l.pos += 2 // pass '{#'
				l.col += 2

				for {
					switch l.peek() {
					case EOF:
						l.errorf("Single-line comment not closed.")
						return
					case '\n':
						l.errorf("Newline not permitted in a single-line comment.")
						return
					}

					if strings.HasPrefix(l.input[l.pos:], "#}") {
						l.pos += 2 // pass '#}'
						l.col += 2
						break
					}

					l.next()
				}
				l.ignore() // ignore whole comment

				// Comment skipped
				continue // next token
			}

			if strings.HasPrefix(l.input[l.pos:], "{{") || // variable
				strings.HasPrefix(l.input[l.pos:], "{%") { // tag
				if l.pos > l.start {
					l.emit(TokenHTML)
				}
				l.tokenize()
				if l.errored {
					return
				}
				continue
			}
		}

		switch l.peek() {
		case '\n':
			l.line++
			l.col = 0
		}
		if l.next() == EOF {
			break
		}
	}

	if l.pos > l.start {
		l.emit(TokenHTML)
	}

	if l.inVerbatim {
		l.errorf("verbatim-tag not closed, got EOF.")
	}
}
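
// Illustrative sketch: given the input
//
//	{% verbatim %}{{ not parsed }}{% endverbatim %}
//
// run() emits a single TokenHTML with the value "{{ not parsed }}" and never
// hands the inner braces to tokenize().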

// tokenize drives the state machine for the contents of a single
// tag/variable block.
func (l *lexer) tokenize() {
	for state := l.stateCode; state != nil; {
		state = state()
	}
}

// stateCode lexes the inside of a tag/variable: it skips whitespace,
// dispatches to the identifier, number and string states and emits symbols
// directly.
func (l *lexer) stateCode() lexerStateFn {
outerLoop:
	for {
		switch {
		case l.accept(tokenSpaceChars):
			if l.value() == "\n" {
				return l.errorf("Newline not allowed within tag/variable.")
			}
			l.ignore()
			continue
		case l.accept(tokenIdentifierChars):
			return l.stateIdentifier
		case l.accept(tokenDigits):
			return l.stateNumber
		case l.accept(`"`):
			return l.stateString
		}

		// Check for symbol
		for _, sym := range TokenSymbols {
			if strings.HasPrefix(l.input[l.start:], sym) {
				l.pos += len(sym)
				l.col += l.length()
				l.emit(TokenSymbol)

				if sym == "%}" || sym == "}}" {
					// Tag/variable end, return after emit
					return nil
				}

				continue outerLoop
			}
		}

		if l.pos < len(l.input) {
			return l.errorf("Unknown character: %q (%d)", l.peek(), l.peek())
		}

		break
	}

	// Normal shutdown
	return nil
}

// stateIdentifier lexes an identifier and emits it as a TokenKeyword if it
// matches one of TokenKeywords, as a TokenIdentifier otherwise.
func (l *lexer) stateIdentifier() lexerStateFn {
	l.acceptRun(tokenIdentifierChars)
	l.acceptRun(tokenIdentifierCharsWithDigits)
	for _, kw := range TokenKeywords {
		if kw == l.value() {
			l.emit(TokenKeyword)
			return l.stateCode
		}
	}
	l.emit(TokenIdentifier)
	return l.stateCode
}
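
// Illustrative: in `{% if user in admins %}`, "if", "user" and "admins" are
// emitted as TokenIdentifier, while "in" is emitted as TokenKeyword ("if" is
// a tag name, not a lexer keyword).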

// stateNumber lexes an integer literal (digits only).
func (l *lexer) stateNumber() lexerStateFn {
	l.acceptRun(tokenDigits)
	/*
		Maybe context-sensitive number lexing?
		* comments.0.Text // first comment
		* usercomments.1.0 // second user, first comment
		* if (score >= 8.5) // 8.5 as a number

		if l.peek() == '.' {
			l.accept(".")
			if !l.accept(tokenDigits) {
				return l.errorf("Malformed number.")
			}
			l.acceptRun(tokenDigits)
		}
	*/
	l.emit(TokenNumber)
	return l.stateCode
}
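
// stateString lexes a double-quoted string literal; the surrounding quotes
// are not part of the emitted value. Illustrative example: the literal
// `"say \"hi\""` is emitted as a TokenString whose Val is `say "hi"`
// (emit resolves the \" and \\ escape sequences).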
func (l *lexer) stateString() lexerStateFn {
	l.ignore()
	l.startcol-- // we're starting the position at the first "
	for !l.accept(`"`) {
		switch l.next() {
		case '\\':
			// escape sequence
			switch l.peek() {
			case '"', '\\':
				l.next()
			default:
				return l.errorf("Unknown escape sequence: \\%c", l.peek())
			}
		case EOF:
			return l.errorf("Unexpected EOF, string not closed.")
		case '\n':
			return l.errorf("Newline in string is not allowed.")
		}
	}
	l.backup()
	l.emit(TokenString)

	l.next()
	l.ignore()

	return l.stateCode
}