github.com/kovansky/hugo@v0.92.3-0.20220224232819-63076e4ff19f/parser/pageparser/pagelexer.go

// Copyright 2018 The Hugo Authors. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package pageparser

import (
	"bytes"
	"fmt"
	"unicode"
	"unicode/utf8"
)

const eof = -1

// A stateFunc returns the next state of the scanner.
type stateFunc func(*pageLexer) stateFunc

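// pageLexer holds the state of the scanner while it lexes a page's content.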
type pageLexer struct {
	input      []byte
	stateStart stateFunc
	state      stateFunc
	pos        int // input position
	start      int // item start position
	width      int // width of the last rune read

	// Contains lexers for shortcodes and other main section
	// elements.
	sectionHandlers *sectionHandlers

	cfg Config

	// The summary divider to look for.
	summaryDivider []byte
	// Set when a summary divider has been parsed.
	summaryDividerChecked bool
	// Whether we're in an HTML comment.
	isInHTMLComment bool

	lexerShortcodeState

	// items delivered to the client
	items Items
}

// Implement the Result interface
func (l *pageLexer) Iterator() *Iterator {
	return l.newIterator()
}

func (l *pageLexer) Input() []byte {
	return l.input
}

type Config struct {
	EnableEmoji bool
}

// Note: the input position here is normally 0 (the start), but it
// can be set if the position of the first shortcode is known.
func newPageLexer(input []byte, stateStart stateFunc, cfg Config) *pageLexer {
	lexer := &pageLexer{
		input:      input,
		stateStart: stateStart,
		cfg:        cfg,
		lexerShortcodeState: lexerShortcodeState{
			currLeftDelimItem:  tLeftDelimScNoMarkup,
			currRightDelimItem: tRightDelimScNoMarkup,
			openShortcodes:     make(map[string]bool),
		},
		items: make([]Item, 0, 5),
	}

	lexer.sectionHandlers = createSectionHandlers(lexer)

	return lexer
}

func (l *pageLexer) newIterator() *Iterator {
	return &Iterator{l: l, lastPos: -1}
}

// run runs the lexer's main loop until a state function returns nil.
func (l *pageLexer) run() *pageLexer {
	for l.state = l.stateStart; l.state != nil; {
		l.state = l.state(l)
	}
	return l
}
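
// Illustrative usage sketch: construct the lexer with a start state such as
// lexMainSection and drain the emitted items through the Iterator.
// Iterator.Next, Item.IsError and Item.IsEOF are assumed to be defined
// elsewhere in this package.
//
//	l := newPageLexer(content, lexMainSection, Config{EnableEmoji: true})
//	l.run()
//	it := l.Iterator()
//	for {
//		item := it.Next()
//		if item.IsError() || item.IsEOF() {
//			break
//		}
//		// handle item ...
//	}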

// Page syntax
var (
	byteOrderMark     = '\ufeff'
	summaryDivider    = []byte("<!--more-->")
	summaryDividerOrg = []byte("# more")
	delimTOML         = []byte("+++")
	delimYAML         = []byte("---")
	delimOrg          = []byte("#+")
	htmlCommentStart  = []byte("<!--")
	htmlCommentEnd    = []byte("-->")

	emojiDelim = byte(':')
)

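// next returns the next rune in the input and advances the position past it.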
func (l *pageLexer) next() rune {
	if l.pos >= len(l.input) {
		l.width = 0
		return eof
	}

	runeValue, runeWidth := utf8.DecodeRune(l.input[l.pos:])
	l.width = runeWidth
	l.pos += l.width
	return runeValue
}

// peek returns, but does not consume, the next rune in the input.
func (l *pageLexer) peek() rune {
	r := l.next()
	l.backup()
	return r
}

// backup steps back one rune; it can only be used once per call of next.
func (l *pageLexer) backup() {
	l.pos -= l.width
}

// sends an item back to the client.
func (l *pageLexer) emit(t ItemType) {
	l.items = append(l.items, Item{t, l.start, l.input[l.start:l.pos], false})
	l.start = l.pos
}

// sends a string item back to the client.
func (l *pageLexer) emitString(t ItemType) {
	l.items = append(l.items, Item{t, l.start, l.input[l.start:l.pos], true})
	l.start = l.pos
}

func (l *pageLexer) isEOF() bool {
	return l.pos >= len(l.input)
}

// special case: emit the current value, but strip any escape characters
// ('\\') so they are not sent back to the client.
func (l *pageLexer) ignoreEscapesAndEmit(t ItemType, isString bool) {
	val := bytes.Map(func(r rune) rune {
		if r == '\\' {
			return -1
		}
		return r
	}, l.input[l.start:l.pos])
	l.items = append(l.items, Item{t, l.start, val, isString})
	l.start = l.pos
}

// gets the current value (for debugging and error handling)
func (l *pageLexer) current() []byte {
	return l.input[l.start:l.pos]
}

// ignore current element
func (l *pageLexer) ignore() {
	l.start = l.pos
}

var lf = []byte("\n")

// errorf emits an error item and returns nil, which terminates the lexer.
func (l *pageLexer) errorf(format string, args ...interface{}) stateFunc {
	l.items = append(l.items, Item{tError, l.start, []byte(fmt.Sprintf(format, args...)), true})
	return nil
}

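// consumeCRLF consumes an optional carriage return followed by an optional
// newline and reports whether anything was consumed.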
func (l *pageLexer) consumeCRLF() bool {
	var consumed bool
	for _, r := range crLf {
		if l.next() != r {
			l.backup()
		} else {
			consumed = true
		}
	}
	return consumed
}

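// consumeToNextLine consumes the input up to and including the next
// end-of-line (or EOF).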
func (l *pageLexer) consumeToNextLine() {
	for {
		r := l.next()
		if r == eof || isEndOfLine(r) {
			return
		}
	}
}

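// consumeToSpace consumes the input up to, but not including, the next
// whitespace rune (or EOF).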
func (l *pageLexer) consumeToSpace() {
	for {
		r := l.next()
		if r == eof || unicode.IsSpace(r) {
			l.backup()
			return
		}
	}
}

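// consumeSpace consumes any run of whitespace at the current position.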
func (l *pageLexer) consumeSpace() {
	for {
		r := l.next()
		if r == eof || !unicode.IsSpace(r) {
			l.backup()
			return
		}
	}
}

// lexEmoji lexes an emoji string starting at ":", e.g. ":smile:".
func lexEmoji(l *pageLexer) stateFunc {
	pos := l.pos + 1
	valid := false

	for i := pos; i < len(l.input); i++ {
		if i > pos && l.input[i] == emojiDelim {
			pos = i + 1
			valid = true
			break
		}
		r, _ := utf8.DecodeRune(l.input[i:])
		if !(isAlphaNumericOrHyphen(r) || r == '+') {
			break
		}
	}

	if valid {
		l.pos = pos
		l.emit(TypeEmoji)
	} else {
		l.pos++
		l.emit(tText)
	}

	return lexMainSection
}

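// sectionHandlers holds the set of handlers used to find and lex the
// "sections" of the main content: shortcodes, the summary divider and,
// optionally, emoji.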
type sectionHandlers struct {
	l *pageLexer

	// Set when none of the sections are found so we
	// can safely stop looking and skip to the end.
	skipAll bool

	handlers    []*sectionHandler
	skipIndexes []int
}

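// skip returns the distance from the current position to the closest
// upcoming section, or -1 if no handler can find one.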
func (s *sectionHandlers) skip() int {
	if s.skipAll {
		return -1
	}

	s.skipIndexes = s.skipIndexes[:0]
	var shouldSkip bool
	for _, skipper := range s.handlers {
		idx := skipper.skip()
		if idx != -1 {
			shouldSkip = true
			s.skipIndexes = append(s.skipIndexes, idx)
		}
	}

	if !shouldSkip {
		s.skipAll = true
		return -1
	}

	return minIndex(s.skipIndexes...)
}

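// createSectionHandlers sets up the handlers for shortcodes, the summary
// divider and, if enabled in the config, emoji.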
func createSectionHandlers(l *pageLexer) *sectionHandlers {
	shortCodeHandler := &sectionHandler{
		l: l,
		skipFunc: func(l *pageLexer) int {
			return l.index(leftDelimSc)
		},
		lexFunc: func(origin stateFunc, l *pageLexer) (stateFunc, bool) {
			if !l.isShortCodeStart() {
				return origin, false
			}

			if l.isInline {
				// If we're inside an inline shortcode, the only valid shortcode markup is
				// the markup which closes it.
				b := l.input[l.pos+3:]
				end := indexNonWhiteSpace(b, '/')
				if end != len(l.input)-1 {
					b = bytes.TrimSpace(b[end+1:])
					if end == -1 || !bytes.HasPrefix(b, []byte(l.currShortcodeName+" ")) {
						return l.errorf("inline shortcodes do not support nesting"), true
					}
				}
			}

			if l.hasPrefix(leftDelimScWithMarkup) {
				l.currLeftDelimItem = tLeftDelimScWithMarkup
				l.currRightDelimItem = tRightDelimScWithMarkup
			} else {
				l.currLeftDelimItem = tLeftDelimScNoMarkup
				l.currRightDelimItem = tRightDelimScNoMarkup
			}

			return lexShortcodeLeftDelim, true
		},
	}

	summaryDividerHandler := &sectionHandler{
		l: l,
		skipFunc: func(l *pageLexer) int {
			if l.summaryDividerChecked || l.summaryDivider == nil {
				return -1
			}
			return l.index(l.summaryDivider)
		},
		lexFunc: func(origin stateFunc, l *pageLexer) (stateFunc, bool) {
			if !l.hasPrefix(l.summaryDivider) {
				return origin, false
			}

			l.summaryDividerChecked = true
			l.pos += len(l.summaryDivider)
			// This makes it a little easier to reason about later.
			l.consumeSpace()
			l.emit(TypeLeadSummaryDivider)

			return origin, true
		},
	}

	handlers := []*sectionHandler{shortCodeHandler, summaryDividerHandler}

	if l.cfg.EnableEmoji {
		emojiHandler := &sectionHandler{
			l: l,
			skipFunc: func(l *pageLexer) int {
				return l.indexByte(emojiDelim)
			},
			lexFunc: func(origin stateFunc, l *pageLexer) (stateFunc, bool) {
				return lexEmoji, true
			},
		}

		handlers = append(handlers, emojiHandler)
	}

	return &sectionHandlers{
		l:           l,
		handlers:    handlers,
		skipIndexes: make([]int, len(handlers)),
	}
}

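// lex emits any pending text and then gives each active handler a chance to
// lex the current position; if none of them handle it, the position is
// advanced by one byte and control returns to origin.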
func (s *sectionHandlers) lex(origin stateFunc) stateFunc {
	if s.skipAll {
		return nil
	}

	if s.l.pos > s.l.start {
		s.l.emit(tText)
	}

	for _, handler := range s.handlers {
		if handler.skipAll {
			continue
		}

		next, handled := handler.lexFunc(origin, handler.l)
		if next == nil || handled {
			return next
		}
	}

	// Not handled by the above.
	s.l.pos++

	return origin
}

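// sectionHandler finds and lexes a single type of section
// (shortcode, summary divider or emoji).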
type sectionHandler struct {
	l *pageLexer

	// No more sections of this type.
	skipAll bool

	// Returns the index of the next match, -1 if none found.
	skipFunc func(l *pageLexer) int

	// Lex lexes the current section and returns the next state func and
	// a bool telling if this section was handled.
	// Note that returning nil as the next state will terminate the
	// lexer.
	lexFunc func(origin stateFunc, l *pageLexer) (stateFunc, bool)
}

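// skip returns the index of the next section of this type, or -1 if there is
// none, in which case the handler is disabled for the rest of the input.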
func (s *sectionHandler) skip() int {
	if s.skipAll {
		return -1
	}

	idx := s.skipFunc(s.l)
	if idx == -1 {
		s.skipAll = true
	}
	return idx
}

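// lexMainSection is the main state function: it fast-forwards to the next
// position of interest and dispatches to the section handlers, emitting the
// remaining input as text when nothing more is found.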
func lexMainSection(l *pageLexer) stateFunc {
	if l.isEOF() {
		return lexDone
	}

	if l.isInHTMLComment {
		return lexEndFrontMatterHTMLComment
	}

	// Fast forward as far as possible.
	skip := l.sectionHandlers.skip()

	if skip == -1 {
		l.pos = len(l.input)
		return lexDone
	} else if skip > 0 {
		l.pos += skip
	}

	next := l.sectionHandlers.lex(lexMainSection)
	if next != nil {
		return next
	}

	l.pos = len(l.input)
	return lexDone
}

func lexDone(l *pageLexer) stateFunc {
	// Done!
	if l.pos > l.start {
		l.emit(tText)
	}
	l.emit(tEOF)
	return nil
}

func (l *pageLexer) printCurrentInput() {
	fmt.Printf("input[%d:]: %q", l.pos, string(l.input[l.pos:]))
}

// state helpers

func (l *pageLexer) index(sep []byte) int {
	return bytes.Index(l.input[l.pos:], sep)
}

func (l *pageLexer) indexByte(sep byte) int {
	return bytes.IndexByte(l.input[l.pos:], sep)
}

func (l *pageLexer) hasPrefix(prefix []byte) bool {
	return bytes.HasPrefix(l.input[l.pos:], prefix)
}

// helper functions

// minIndex returns the smallest non-negative index, or -1 if there is none.
func minIndex(indices ...int) int {
	min := -1

	for _, j := range indices {
		if j < 0 {
			continue
		}
		if min == -1 {
			min = j
		} else if j < min {
			min = j
		}
	}
	return min
}

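// indexNonWhiteSpace returns the index of the first non-whitespace rune in s
// if that rune equals in, otherwise -1.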
func indexNonWhiteSpace(s []byte, in rune) int {
	idx := bytes.IndexFunc(s, func(r rune) bool {
		return !unicode.IsSpace(r)
	})

	if idx == -1 {
		return -1
	}

	r, _ := utf8.DecodeRune(s[idx:])
	if r == in {
		return idx
	}
	return -1
}

func isSpace(r rune) bool {
	return r == ' ' || r == '\t'
}

func isAlphaNumericOrHyphen(r rune) bool {
	// lets unquoted YouTube ids used as positional params slip through (they contain hyphens)
	return isAlphaNumeric(r) || r == '-'
}

var crLf = []rune{'\r', '\n'}

func isEndOfLine(r rune) bool {
	return r == '\r' || r == '\n'
}

func isAlphaNumeric(r rune) bool {
	return r == '_' || unicode.IsLetter(r) || unicode.IsDigit(r)
}