github.com/kristoff-it/hugo@v0.47.1/hugolib/shortcodeparser.go (about)

     1  // Copyright 2015 The Hugo Authors. All rights reserved.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  // http://www.apache.org/licenses/LICENSE-2.0
     7  //
     8  // Unless required by applicable law or agreed to in writing, software
     9  // distributed under the License is distributed on an "AS IS" BASIS,
    10  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    11  // See the License for the specific language governing permissions and
    12  // limitations under the License.
    13  
    14  package hugolib
    15  
    16  import (
    17  	"fmt"
    18  	"strings"
    19  	"unicode"
    20  	"unicode/utf8"
    21  )
    22  
    23  // The lexical scanning below is highly inspired by the great talk given by
    24  // Rob Pike called "Lexical Scanning in Go" (it's on YouTube, Google it!).
    25  // See slides here: http://cuddle.googlecode.com/hg/talk/lex.html
    26  
    27  // parsing
    28  
// pageTokens wraps a pagelexer and adds a small push-back buffer, so that
// callers can peek at upcoming items or back up over items already read.
type pageTokens struct {
	lexer     *pagelexer // source of items (read through nextItem)
	token     [3]item    // 3-item look-ahead is what we currently need
	peekCount int        // number of items in token waiting to be re-read
}
    34  
    35  func (t *pageTokens) next() item {
    36  	if t.peekCount > 0 {
    37  		t.peekCount--
    38  	} else {
    39  		t.token[0] = t.lexer.nextItem()
    40  	}
    41  	return t.token[t.peekCount]
    42  }
    43  
    44  // backs up one token.
    45  func (t *pageTokens) backup() {
    46  	t.peekCount++
    47  }
    48  
    49  // backs up two tokens.
    50  func (t *pageTokens) backup2(t1 item) {
    51  	t.token[1] = t1
    52  	t.peekCount = 2
    53  }
    54  
    55  // backs up three tokens.
    56  func (t *pageTokens) backup3(t2, t1 item) {
    57  	t.token[1] = t1
    58  	t.token[2] = t2
    59  	t.peekCount = 3
    60  }
    61  
    62  // check for non-error and non-EOF types coming next
    63  func (t *pageTokens) isValueNext() bool {
    64  	i := t.peek()
    65  	return i.typ != tError && i.typ != tEOF
    66  }
    67  
    68  // look at, but do not consume, the next item
    69  // repeated, sequential calls will return the same item
    70  func (t *pageTokens) peek() item {
    71  	if t.peekCount > 0 {
    72  		return t.token[t.peekCount-1]
    73  	}
    74  	t.peekCount = 1
    75  	t.token[0] = t.lexer.nextItem()
    76  	return t.token[0]
    77  }
    78  
    79  // convencience method to consume the next n tokens, but back off Errors and EOF
    80  func (t *pageTokens) consume(cnt int) {
    81  	for i := 0; i < cnt; i++ {
    82  		token := t.next()
    83  		if token.typ == tError || token.typ == tEOF {
    84  			t.backup()
    85  			break
    86  		}
    87  	}
    88  }
    89  
    90  // lexical scanning
    91  
// pos is a position (in bytes) within the lexer input; it is also used
// for byte widths.
type pos int
    94  
// item is a single token produced by the lexer.
// NOTE: the lexer builds items with positional literals, so the field
// order must not change.
type item struct {
	typ itemType // kind of token
	pos pos      // byte offset in the input where the token starts
	val string   // token text, or the message for tError items
}
   100  
   101  func (i item) String() string {
   102  	switch {
   103  	case i.typ == tEOF:
   104  		return "EOF"
   105  	case i.typ == tError:
   106  		return i.val
   107  	case i.typ > tKeywordMarker:
   108  		return fmt.Sprintf("<%s>", i.val)
   109  	case len(i.val) > 20:
   110  		return fmt.Sprintf("%.20q...", i.val)
   111  	}
   112  	return fmt.Sprintf("[%s]", i.val)
   113  }
   114  
// itemType identifies the kind of a lexed item.
type itemType int

// The item types. The values come from iota, so the order is significant:
// item.String treats every type greater than tKeywordMarker as a keyword.
const (
	tError itemType = iota
	tEOF

	// shortcode items
	tLeftDelimScNoMarkup
	tRightDelimScNoMarkup
	tLeftDelimScWithMarkup
	tRightDelimScWithMarkup
	tScClose
	tScName
	tScParam
	tScParamVal

	//itemIdentifier
	tText // plain text, used for everything outside the shortcodes

	// preserved for later - keywords come after this
	tKeywordMarker
)
   137  
// eof is the sentinel returned by pagelexer.next when the input is exhausted.
const eof = -1
   139  
// stateFunc represents one state of the scanner: it does its work on the
// lexer and returns the next state. A nil return stops the main loop.
type stateFunc func(*pagelexer) stateFunc
   142  
// pagelexer holds the scanner state while splitting the input into text
// and shortcode items. All items are collected in the items slice.
type pagelexer struct {
	name    string    // name passed to newShortcodeLexer
	input   string    // the string being scanned
	state   stateFunc // current state function; nil once scanning has stopped
	pos     pos       // input position
	start   pos       // item start position
	width   pos       // width of last element
	lastPos pos       // position of the last item returned by nextItem

	// shortcode state
	currLeftDelimItem  itemType
	currRightDelimItem itemType
	currShortcodeName  string          // is only set when a shortcode is in opened state
	closingState       int             // > 0 = on its way to be closed
	elementStepNum     int             // step number in element
	paramElements      int             // number of elements (name + value = 2) found first
	openShortcodes     map[string]bool // set of shortcodes in open state

	// items delivered to client
	items []item
}
   164  
   165  // note: the input position here is normally 0 (start), but
   166  // can be set if position of first shortcode is known
   167  func newShortcodeLexer(name, input string, inputPosition pos) *pagelexer {
   168  	lexer := &pagelexer{
   169  		name:               name,
   170  		input:              input,
   171  		currLeftDelimItem:  tLeftDelimScNoMarkup,
   172  		currRightDelimItem: tRightDelimScNoMarkup,
   173  		pos:                inputPosition,
   174  		openShortcodes:     make(map[string]bool),
   175  		items:              make([]item, 0, 5),
   176  	}
   177  	lexer.runShortcodeLexer()
   178  	return lexer
   179  }
   180  
   181  // main loop
   182  // this looks kind of funky, but it works
   183  func (l *pagelexer) runShortcodeLexer() {
   184  	for l.state = lexTextOutsideShortcodes; l.state != nil; {
   185  		l.state = l.state(l)
   186  	}
   187  }
   188  
   189  // state functions
   190  
// Delimiters recognised by the shortcode lexer.
const (
	leftDelimScNoMarkup    = "{{<"
	rightDelimScNoMarkup   = ">}}"
	leftDelimScWithMarkup  = "{{%"
	rightDelimScWithMarkup = "%}}"
	leftComment            = "/*" // comments in this context are used to mark shortcodes as "not really a shortcode"
	rightComment           = "*/"
)
   199  
   200  func (l *pagelexer) next() rune {
   201  	if int(l.pos) >= len(l.input) {
   202  		l.width = 0
   203  		return eof
   204  	}
   205  
   206  	// looks expensive, but should produce the same iteration sequence as the string range loop
   207  	// see: http://blog.golang.org/strings
   208  	runeValue, runeWidth := utf8.DecodeRuneInString(l.input[l.pos:])
   209  	l.width = pos(runeWidth)
   210  	l.pos += l.width
   211  	return runeValue
   212  }
   213  
   214  // peek, but no consume
   215  func (l *pagelexer) peek() rune {
   216  	r := l.next()
   217  	l.backup()
   218  	return r
   219  }
   220  
   221  // steps back one
   222  func (l *pagelexer) backup() {
   223  	l.pos -= l.width
   224  }
   225  
   226  // sends an item back to the client.
   227  func (l *pagelexer) emit(t itemType) {
   228  	l.items = append(l.items, item{t, l.start, l.input[l.start:l.pos]})
   229  	l.start = l.pos
   230  }
   231  
   232  // special case, do not send '\\' back to client
   233  func (l *pagelexer) ignoreEscapesAndEmit(t itemType) {
   234  	val := strings.Map(func(r rune) rune {
   235  		if r == '\\' {
   236  			return -1
   237  		}
   238  		return r
   239  	}, l.input[l.start:l.pos])
   240  	l.items = append(l.items, item{t, l.start, val})
   241  	l.start = l.pos
   242  }
   243  
   244  // gets the current value (for debugging and error handling)
   245  func (l *pagelexer) current() string {
   246  	return l.input[l.start:l.pos]
   247  }
   248  
// ignore skips the current element: the text scanned since the last item
// start is dropped by moving the item start up to the current position.
func (l *pagelexer) ignore() {
	l.start = l.pos
}
   253  
   254  // nice to have in error logs
   255  func (l *pagelexer) lineNum() int {
   256  	return strings.Count(l.input[:l.lastPos], "\n") + 1
   257  }
   258  
   259  // nil terminates the parser
   260  func (l *pagelexer) errorf(format string, args ...interface{}) stateFunc {
   261  	l.items = append(l.items, item{tError, l.start, fmt.Sprintf(format, args...)})
   262  	return nil
   263  }
   264  
   265  // consumes and returns the next item
   266  func (l *pagelexer) nextItem() item {
   267  	item := l.items[0]
   268  	l.items = l.items[1:]
   269  	l.lastPos = item.pos
   270  	return item
   271  }
   272  
   273  // scans until an opening shortcode opening bracket.
   274  // if no shortcodes, it will keep on scanning until EOF
   275  func lexTextOutsideShortcodes(l *pagelexer) stateFunc {
   276  	for {
   277  		if strings.HasPrefix(l.input[l.pos:], leftDelimScWithMarkup) || strings.HasPrefix(l.input[l.pos:], leftDelimScNoMarkup) {
   278  			if l.pos > l.start {
   279  				l.emit(tText)
   280  			}
   281  			if strings.HasPrefix(l.input[l.pos:], leftDelimScWithMarkup) {
   282  				l.currLeftDelimItem = tLeftDelimScWithMarkup
   283  				l.currRightDelimItem = tRightDelimScWithMarkup
   284  			} else {
   285  				l.currLeftDelimItem = tLeftDelimScNoMarkup
   286  				l.currRightDelimItem = tRightDelimScNoMarkup
   287  			}
   288  			return lexShortcodeLeftDelim
   289  
   290  		}
   291  		if l.next() == eof {
   292  			break
   293  		}
   294  	}
   295  	// Done!
   296  	if l.pos > l.start {
   297  		l.emit(tText)
   298  	}
   299  	l.emit(tEOF)
   300  	return nil
   301  }
   302  
   303  func lexShortcodeLeftDelim(l *pagelexer) stateFunc {
   304  	l.pos += pos(len(l.currentLeftShortcodeDelim()))
   305  	if strings.HasPrefix(l.input[l.pos:], leftComment) {
   306  		return lexShortcodeComment
   307  	}
   308  	l.emit(l.currentLeftShortcodeDelimItem())
   309  	l.elementStepNum = 0
   310  	l.paramElements = 0
   311  	return lexInsideShortcode
   312  }
   313  
   314  func lexShortcodeComment(l *pagelexer) stateFunc {
   315  	posRightComment := strings.Index(l.input[l.pos:], rightComment+l.currentRightShortcodeDelim())
   316  	if posRightComment <= 1 {
   317  		return l.errorf("comment must be closed")
   318  	}
   319  	// we emit all as text, except the comment markers
   320  	l.emit(tText)
   321  	l.pos += pos(len(leftComment))
   322  	l.ignore()
   323  	l.pos += pos(posRightComment - len(leftComment))
   324  	l.emit(tText)
   325  	l.pos += pos(len(rightComment))
   326  	l.ignore()
   327  	l.pos += pos(len(l.currentRightShortcodeDelim()))
   328  	l.emit(tText)
   329  	return lexTextOutsideShortcodes
   330  }
   331  
   332  func lexShortcodeRightDelim(l *pagelexer) stateFunc {
   333  	l.closingState = 0
   334  	l.pos += pos(len(l.currentRightShortcodeDelim()))
   335  	l.emit(l.currentRightShortcodeDelimItem())
   336  	return lexTextOutsideShortcodes
   337  }
   338  
   339  // either:
   340  // 1. param
   341  // 2. "param" or "param\"
   342  // 3. param="123" or param="123\"
   343  // 4. param="Some \"escaped\" text"
   344  func lexShortcodeParam(l *pagelexer, escapedQuoteStart bool) stateFunc {
   345  
   346  	first := true
   347  	nextEq := false
   348  
   349  	var r rune
   350  
   351  	for {
   352  		r = l.next()
   353  		if first {
   354  			if r == '"' {
   355  				// a positional param with quotes
   356  				if l.paramElements == 2 {
   357  					return l.errorf("got quoted positional parameter. Cannot mix named and positional parameters")
   358  				}
   359  				l.paramElements = 1
   360  				l.backup()
   361  				return lexShortcodeQuotedParamVal(l, !escapedQuoteStart, tScParam)
   362  			}
   363  			first = false
   364  		} else if r == '=' {
   365  			// a named param
   366  			l.backup()
   367  			nextEq = true
   368  			break
   369  		}
   370  
   371  		if !isAlphaNumericOrHyphen(r) {
   372  			l.backup()
   373  			break
   374  		}
   375  	}
   376  
   377  	if l.paramElements == 0 {
   378  		l.paramElements++
   379  
   380  		if nextEq {
   381  			l.paramElements++
   382  		}
   383  	} else {
   384  		if nextEq && l.paramElements == 1 {
   385  			return l.errorf("got named parameter '%s'. Cannot mix named and positional parameters", l.current())
   386  		} else if !nextEq && l.paramElements == 2 {
   387  			return l.errorf("got positional parameter '%s'. Cannot mix named and positional parameters", l.current())
   388  		}
   389  	}
   390  
   391  	l.emit(tScParam)
   392  	return lexInsideShortcode
   393  
   394  }
   395  
   396  func lexShortcodeQuotedParamVal(l *pagelexer, escapedQuotedValuesAllowed bool, typ itemType) stateFunc {
   397  	openQuoteFound := false
   398  	escapedInnerQuoteFound := false
   399  	escapedQuoteState := 0
   400  
   401  Loop:
   402  	for {
   403  		switch r := l.next(); {
   404  		case r == '\\':
   405  			if l.peek() == '"' {
   406  				if openQuoteFound && !escapedQuotedValuesAllowed {
   407  					l.backup()
   408  					break Loop
   409  				} else if openQuoteFound {
   410  					// the coming quoute is inside
   411  					escapedInnerQuoteFound = true
   412  					escapedQuoteState = 1
   413  				}
   414  			}
   415  		case r == eof, r == '\n':
   416  			return l.errorf("unterminated quoted string in shortcode parameter-argument: '%s'", l.current())
   417  		case r == '"':
   418  			if escapedQuoteState == 0 {
   419  				if openQuoteFound {
   420  					l.backup()
   421  					break Loop
   422  
   423  				} else {
   424  					openQuoteFound = true
   425  					l.ignore()
   426  				}
   427  			} else {
   428  				escapedQuoteState = 0
   429  			}
   430  
   431  		}
   432  	}
   433  
   434  	if escapedInnerQuoteFound {
   435  		l.ignoreEscapesAndEmit(typ)
   436  	} else {
   437  		l.emit(typ)
   438  	}
   439  
   440  	r := l.next()
   441  
   442  	if r == '\\' {
   443  		if l.peek() == '"' {
   444  			// ignore the escaped closing quote
   445  			l.ignore()
   446  			l.next()
   447  			l.ignore()
   448  		}
   449  	} else if r == '"' {
   450  		// ignore closing quote
   451  		l.ignore()
   452  	} else {
   453  		// handled by next state
   454  		l.backup()
   455  	}
   456  
   457  	return lexInsideShortcode
   458  }
   459  
   460  // scans an alphanumeric inside shortcode
   461  func lexIdentifierInShortcode(l *pagelexer) stateFunc {
   462  	lookForEnd := false
   463  Loop:
   464  	for {
   465  		switch r := l.next(); {
   466  		case isAlphaNumericOrHyphen(r):
   467  		// Allow forward slash inside names to make it possible to create namespaces.
   468  		case r == '/':
   469  		default:
   470  			l.backup()
   471  			word := l.input[l.start:l.pos]
   472  			if l.closingState > 0 && !l.openShortcodes[word] {
   473  				return l.errorf("closing tag for shortcode '%s' does not match start tag", word)
   474  			} else if l.closingState > 0 {
   475  				l.openShortcodes[word] = false
   476  				lookForEnd = true
   477  			}
   478  
   479  			l.closingState = 0
   480  			l.currShortcodeName = word
   481  			l.openShortcodes[word] = true
   482  			l.elementStepNum++
   483  			l.emit(tScName)
   484  			break Loop
   485  		}
   486  	}
   487  
   488  	if lookForEnd {
   489  		return lexEndOfShortcode
   490  	}
   491  	return lexInsideShortcode
   492  }
   493  
   494  func lexEndOfShortcode(l *pagelexer) stateFunc {
   495  	if strings.HasPrefix(l.input[l.pos:], l.currentRightShortcodeDelim()) {
   496  		return lexShortcodeRightDelim
   497  	}
   498  	switch r := l.next(); {
   499  	case isSpace(r):
   500  		l.ignore()
   501  	default:
   502  		return l.errorf("unclosed shortcode")
   503  	}
   504  	return lexEndOfShortcode
   505  }
   506  
   507  // scans the elements inside shortcode tags
   508  func lexInsideShortcode(l *pagelexer) stateFunc {
   509  	if strings.HasPrefix(l.input[l.pos:], l.currentRightShortcodeDelim()) {
   510  		return lexShortcodeRightDelim
   511  	}
   512  	switch r := l.next(); {
   513  	case r == eof:
   514  		// eol is allowed inside shortcodes; this may go to end of document before it fails
   515  		return l.errorf("unclosed shortcode action")
   516  	case isSpace(r), isEndOfLine(r):
   517  		l.ignore()
   518  	case r == '=':
   519  		l.ignore()
   520  		return lexShortcodeQuotedParamVal(l, l.peek() != '\\', tScParamVal)
   521  	case r == '/':
   522  		if l.currShortcodeName == "" {
   523  			return l.errorf("got closing shortcode, but none is open")
   524  		}
   525  		l.closingState++
   526  		l.emit(tScClose)
   527  	case r == '\\':
   528  		l.ignore()
   529  		if l.peek() == '"' {
   530  			return lexShortcodeParam(l, true)
   531  		}
   532  	case l.elementStepNum > 0 && (isAlphaNumericOrHyphen(r) || r == '"'): // positional params can have quotes
   533  		l.backup()
   534  		return lexShortcodeParam(l, false)
   535  	case isAlphaNumeric(r):
   536  		l.backup()
   537  		return lexIdentifierInShortcode
   538  	default:
   539  		return l.errorf("unrecognized character in shortcode action: %#U. Note: Parameters with non-alphanumeric args must be quoted", r)
   540  	}
   541  	return lexInsideShortcode
   542  }
   543  
   544  // state helpers
   545  
// currentLeftShortcodeDelimItem returns the item type of the left delimiter
// recorded for the shortcode currently being scanned.
func (l *pagelexer) currentLeftShortcodeDelimItem() itemType {
	return l.currLeftDelimItem
}
   549  
// currentRightShortcodeDelimItem returns the item type of the right
// delimiter recorded for the shortcode currently being scanned.
func (l *pagelexer) currentRightShortcodeDelimItem() itemType {
	return l.currRightDelimItem
}
   553  
   554  func (l *pagelexer) currentLeftShortcodeDelim() string {
   555  	if l.currLeftDelimItem == tLeftDelimScWithMarkup {
   556  		return leftDelimScWithMarkup
   557  	}
   558  	return leftDelimScNoMarkup
   559  
   560  }
   561  
   562  func (l *pagelexer) currentRightShortcodeDelim() string {
   563  	if l.currRightDelimItem == tRightDelimScWithMarkup {
   564  		return rightDelimScWithMarkup
   565  	}
   566  	return rightDelimScNoMarkup
   567  }
   568  
   569  // helper functions
   570  
   571  func isSpace(r rune) bool {
   572  	return r == ' ' || r == '\t'
   573  }
   574  
   575  func isAlphaNumericOrHyphen(r rune) bool {
   576  	// let unquoted YouTube ids as positional params slip through (they contain hyphens)
   577  	return isAlphaNumeric(r) || r == '-'
   578  }
   579  
   580  func isEndOfLine(r rune) bool {
   581  	return r == '\r' || r == '\n'
   582  }
   583  
   584  func isAlphaNumeric(r rune) bool {
   585  	return r == '_' || unicode.IsLetter(r) || unicode.IsDigit(r)
   586  }