github.com/SuCicada/su-hugo@v1.0.0/parser/pageparser/pagelexer.go (about)

     1  // Copyright 2018 The Hugo Authors. All rights reserved.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  // http://www.apache.org/licenses/LICENSE-2.0
     7  //
     8  // Unless required by applicable law or agreed to in writing, software
     9  // distributed under the License is distributed on an "AS IS" BASIS,
    10  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    11  // See the License for the specific language governing permissions and
    12  // limitations under the License.
    13  
    14  package pageparser
    15  
    16  import (
    17  	"bytes"
    18  	"fmt"
    19  	"unicode"
    20  	"unicode/utf8"
    21  )
    22  
// eof is the sentinel rune returned by next when the input is exhausted.
const eof = -1

// stateFunc is a lexer state; it performs some work and
// returns the next state in scanner (nil terminates the lexer).
type stateFunc func(*pageLexer) stateFunc
    27  
// pageLexer scans page content and produces a flat list of Items that
// clients consume through an Iterator.
type pageLexer struct {
	input      []byte    // the content being scanned
	stateStart stateFunc // state to start scanning in
	state      stateFunc // current state
	pos        int // input position
	start      int // item start position
	width      int // width of last element

	// Contains lexers for shortcodes and other main section
	// elements.
	sectionHandlers *sectionHandlers

	cfg Config

	// The summary divider to look for.
	summaryDivider []byte
	// Set when we have parsed any summary divider
	summaryDividerChecked bool
	// Whether we're in a HTML comment.
	isInHTMLComment bool

	// Embedded shortcode-specific state (delimiters, open shortcodes etc.).
	lexerShortcodeState

	// items delivered to client
	items Items
}
    54  
    55  // Implement the Result interface
    56  func (l *pageLexer) Iterator() *Iterator {
    57  	return NewIterator(l.items)
    58  }
    59  
    60  func (l *pageLexer) Input() []byte {
    61  	return l.input
    62  }
    63  
// Config holds the lexer options.
type Config struct {
	// EnableEmoji makes the lexer look for :emoji: sequences.
	EnableEmoji bool
}
    67  
    68  // note: the input position here is normally 0 (start), but
    69  // can be set if position of first shortcode is known
    70  func newPageLexer(input []byte, stateStart stateFunc, cfg Config) *pageLexer {
    71  	lexer := &pageLexer{
    72  		input:      input,
    73  		stateStart: stateStart,
    74  		cfg:        cfg,
    75  		lexerShortcodeState: lexerShortcodeState{
    76  			currLeftDelimItem:  tLeftDelimScNoMarkup,
    77  			currRightDelimItem: tRightDelimScNoMarkup,
    78  			openShortcodes:     make(map[string]bool),
    79  		},
    80  		items: make([]Item, 0, 5),
    81  	}
    82  
    83  	lexer.sectionHandlers = createSectionHandlers(lexer)
    84  
    85  	return lexer
    86  }
    87  
    88  // main loop
    89  func (l *pageLexer) run() *pageLexer {
    90  	for l.state = l.stateStart; l.state != nil; {
    91  		l.state = l.state(l)
    92  	}
    93  	return l
    94  }
    95  
// Page syntax
var (
	byteOrderMark     = '\ufeff'
	summaryDivider    = []byte("<!--more-->")
	summaryDividerOrg = []byte("# more")
	delimTOML         = []byte("+++")
	delimYAML         = []byte("---")
	delimOrg          = []byte("#+")
	htmlCommentStart  = []byte("<!--")
	htmlCommentEnd    = []byte("-->")

	// Delimiter on both sides of an emoji name, e.g. :smile:.
	emojiDelim = byte(':')
)
   109  
   110  func (l *pageLexer) next() rune {
   111  	if l.pos >= len(l.input) {
   112  		l.width = 0
   113  		return eof
   114  	}
   115  
   116  	runeValue, runeWidth := utf8.DecodeRune(l.input[l.pos:])
   117  	l.width = runeWidth
   118  	l.pos += l.width
   119  
   120  	return runeValue
   121  }
   122  
   123  // peek, but no consume
   124  func (l *pageLexer) peek() rune {
   125  	r := l.next()
   126  	l.backup()
   127  	return r
   128  }
   129  
   130  // steps back one
   131  func (l *pageLexer) backup() {
   132  	l.pos -= l.width
   133  }
   134  
   135  func (l *pageLexer) append(item Item) {
   136  	if item.Pos() < len(l.input) {
   137  		item.firstByte = l.input[item.Pos()]
   138  	}
   139  	l.items = append(l.items, item)
   140  }
   141  
// emit sends an item of type t spanning [l.start, l.pos) back to the
// client, then advances start past it. For text items, any trailing
// whitespace run is split off into a separate tIndentation item.
func (l *pageLexer) emit(t ItemType) {
	defer func() {
		l.start = l.pos
	}()

	if t == tText {
		// Identify any trailing whitespace/indentation.
		// We currently only care about the last one.
		for i := l.pos - 1; i >= l.start; i-- {
			b := l.input[i]
			if b != ' ' && b != '\t' && b != '\r' && b != '\n' {
				break
			}
			if i == l.start && b != '\n' {
				// The whole span is whitespace with no newline:
				// emit it as indentation only.
				l.append(Item{Type: tIndentation, low: l.start, high: l.pos})
				return
			} else if b == '\n' && i < l.pos-1 {
				// Split: text up to and including the newline, then the
				// trailing indentation that follows it.
				l.append(Item{Type: t, low: l.start, high: i + 1})
				l.append(Item{Type: tIndentation, low: i + 1, high: l.pos})
				return
			} else if b == '\n' && i == l.pos-1 {
				// Span ends in a newline: no trailing indentation to split.
				break
			}

		}
	}

	l.append(Item{Type: t, low: l.start, high: l.pos})

}
   173  
   174  // sends a string item back to the client.
   175  func (l *pageLexer) emitString(t ItemType) {
   176  	l.append(Item{Type: t, low: l.start, high: l.pos, isString: true})
   177  	l.start = l.pos
   178  }
   179  
   180  func (l *pageLexer) isEOF() bool {
   181  	return l.pos >= len(l.input)
   182  }
   183  
// ignoreEscapesAndEmit sends the current span back to the client as a
// list of segments with the escaping backslashes removed.
// Special case: '\\' is not sent back to the client.
// NOTE(review): the isString parameter is currently unused here and the
// emitted item is never marked as a string — confirm whether intended.
func (l *pageLexer) ignoreEscapesAndEmit(t ItemType, isString bool) {
	i := l.start
	k := i // start of the current backslash-free segment

	var segments []lowHigh

	for i < l.pos {
		r, w := utf8.DecodeRune(l.input[i:l.pos])
		if r == '\\' {
			if i > k {
				segments = append(segments, lowHigh{k, i})
			}
			// See issue #10236.
			// We don't send the backslash back to the client,
			// which makes the end parsing simpler.
			// This means that we cannot render the AST back to be
			// exactly the same as the input,
			// but that was also the situation before we introduced the issue in #10236.
			k = i + w
		}
		i += w
	}

	// Flush the tail segment, if any.
	if k < l.pos {
		segments = append(segments, lowHigh{k, l.pos})
	}

	if len(segments) > 0 {
		l.append(Item{Type: t, segments: segments})
	}

	l.start = l.pos

}
   219  
   220  // gets the current value (for debugging and error handling)
   221  func (l *pageLexer) current() []byte {
   222  	return l.input[l.start:l.pos]
   223  }
   224  
// ignore discards the current element by advancing start up to pos.
func (l *pageLexer) ignore() {
	l.start = l.pos
}
   229  
   230  var lf = []byte("\n")
   231  
   232  // nil terminates the parser
   233  func (l *pageLexer) errorf(format string, args ...any) stateFunc {
   234  	l.append(Item{Type: tError, Err: fmt.Errorf(format, args...)})
   235  	return nil
   236  }
   237  
// consumeCRLF consumes an optional '\r' followed by an optional '\n',
// reporting whether at least one of them was consumed. A non-matching
// rune is stepped back over (backup is a no-op at EOF since width is 0).
func (l *pageLexer) consumeCRLF() bool {
	var consumed bool
	for _, r := range crLf {
		if l.next() != r {
			l.backup()
		} else {
			consumed = true
		}
	}
	return consumed
}
   249  
   250  func (l *pageLexer) consumeToNextLine() {
   251  	for {
   252  		r := l.next()
   253  		if r == eof || isEndOfLine(r) {
   254  			return
   255  		}
   256  	}
   257  }
   258  
   259  func (l *pageLexer) consumeToSpace() {
   260  	for {
   261  		r := l.next()
   262  		if r == eof || unicode.IsSpace(r) {
   263  			l.backup()
   264  			return
   265  		}
   266  	}
   267  }
   268  
   269  func (l *pageLexer) consumeSpace() {
   270  	for {
   271  		r := l.next()
   272  		if r == eof || !unicode.IsSpace(r) {
   273  			l.backup()
   274  			return
   275  		}
   276  	}
   277  }
   278  
   279  // lex a string starting at ":"
   280  func lexEmoji(l *pageLexer) stateFunc {
   281  	pos := l.pos + 1
   282  	valid := false
   283  
   284  	for i := pos; i < len(l.input); i++ {
   285  		if i > pos && l.input[i] == emojiDelim {
   286  			pos = i + 1
   287  			valid = true
   288  			break
   289  		}
   290  		r, _ := utf8.DecodeRune(l.input[i:])
   291  		if !(isAlphaNumericOrHyphen(r) || r == '+') {
   292  			break
   293  		}
   294  	}
   295  
   296  	if valid {
   297  		l.pos = pos
   298  		l.emit(TypeEmoji)
   299  	} else {
   300  		l.pos++
   301  		l.emit(tText)
   302  	}
   303  
   304  	return lexMainSection
   305  }
   306  
// sectionHandlers bundles the individual sectionHandler instances and
// aggregates their skip results.
type sectionHandlers struct {
	l *pageLexer

	// Set when none of the sections are found so we
	// can safely stop looking and skip to the end.
	skipAll bool

	handlers    []*sectionHandler
	skipIndexes []int // scratch buffer reused by skip
}
   317  
   318  func (s *sectionHandlers) skip() int {
   319  	if s.skipAll {
   320  		return -1
   321  	}
   322  
   323  	s.skipIndexes = s.skipIndexes[:0]
   324  	var shouldSkip bool
   325  	for _, skipper := range s.handlers {
   326  		idx := skipper.skip()
   327  		if idx != -1 {
   328  			shouldSkip = true
   329  			s.skipIndexes = append(s.skipIndexes, idx)
   330  		}
   331  	}
   332  
   333  	if !shouldSkip {
   334  		s.skipAll = true
   335  		return -1
   336  	}
   337  
   338  	return minIndex(s.skipIndexes...)
   339  }
   340  
// createSectionHandlers wires up the handlers that find and lex the
// main section elements: shortcodes, the summary divider, and (when
// enabled) emojis.
func createSectionHandlers(l *pageLexer) *sectionHandlers {
	// Handles shortcode delimiters ({{< ... >}} / {{% ... %}}).
	shortCodeHandler := &sectionHandler{
		l: l,
		skipFunc: func(l *pageLexer) int {
			return l.index(leftDelimSc)
		},
		lexFunc: func(origin stateFunc, l *pageLexer) (stateFunc, bool) {
			if !l.isShortCodeStart() {
				return origin, false
			}

			if l.isInline {
				// If we're inside an inline shortcode, the only valid shortcode markup is
				// the markup which closes it.
				b := l.input[l.pos+3:]
				end := indexNonWhiteSpace(b, '/')
				// NOTE(review): end indexes into b but is compared against
				// len(l.input)-1 here — looks suspicious; confirm intended.
				if end != len(l.input)-1 {
					b = bytes.TrimSpace(b[end+1:])
					if end == -1 || !bytes.HasPrefix(b, []byte(l.currShortcodeName+" ")) {
						return l.errorf("inline shortcodes do not support nesting"), true
					}
				}
			}

			// Pick the delimiter item pair matching the markup variant.
			if l.hasPrefix(leftDelimScWithMarkup) {
				l.currLeftDelimItem = tLeftDelimScWithMarkup
				l.currRightDelimItem = tRightDelimScWithMarkup
			} else {
				l.currLeftDelimItem = tLeftDelimScNoMarkup
				l.currRightDelimItem = tRightDelimScNoMarkup
			}

			return lexShortcodeLeftDelim, true
		},
	}

	// Handles the summary divider (e.g. <!--more-->); matched at most once.
	summaryDividerHandler := &sectionHandler{
		l: l,
		skipFunc: func(l *pageLexer) int {
			if l.summaryDividerChecked || l.summaryDivider == nil {
				return -1
			}
			return l.index(l.summaryDivider)
		},
		lexFunc: func(origin stateFunc, l *pageLexer) (stateFunc, bool) {
			if !l.hasPrefix(l.summaryDivider) {
				return origin, false
			}

			l.summaryDividerChecked = true
			l.pos += len(l.summaryDivider)
			// This makes it a little easier to reason about later.
			l.consumeSpace()
			l.emit(TypeLeadSummaryDivider)

			return origin, true
		},
	}

	handlers := []*sectionHandler{shortCodeHandler, summaryDividerHandler}

	if l.cfg.EnableEmoji {
		// Handles :emoji: sequences; registered only when enabled.
		emojiHandler := &sectionHandler{
			l: l,
			skipFunc: func(l *pageLexer) int {
				return l.indexByte(emojiDelim)
			},
			lexFunc: func(origin stateFunc, l *pageLexer) (stateFunc, bool) {
				return lexEmoji, true
			},
		}

		handlers = append(handlers, emojiHandler)
	}

	return &sectionHandlers{
		l:           l,
		handlers:    handlers,
		skipIndexes: make([]int, len(handlers)),
	}
}
   422  
   423  func (s *sectionHandlers) lex(origin stateFunc) stateFunc {
   424  	if s.skipAll {
   425  		return nil
   426  	}
   427  
   428  	if s.l.pos > s.l.start {
   429  		s.l.emit(tText)
   430  	}
   431  
   432  	for _, handler := range s.handlers {
   433  		if handler.skipAll {
   434  			continue
   435  		}
   436  
   437  		next, handled := handler.lexFunc(origin, handler.l)
   438  		if next == nil || handled {
   439  			return next
   440  		}
   441  	}
   442  
   443  	// Not handled by the above.
   444  	s.l.pos++
   445  
   446  	return origin
   447  }
   448  
// sectionHandler locates and lexes one kind of main-section element.
type sectionHandler struct {
	l *pageLexer

	// No more sections of this type.
	skipAll bool

	// Returns the index of the next match, -1 if none found.
	skipFunc func(l *pageLexer) int

	// Lex lexes the current section and returns the next state func and
	// a bool telling if this section was handled.
	// Note that returning nil as the next state will terminate the
	// lexer.
	lexFunc func(origin stateFunc, l *pageLexer) (stateFunc, bool)
}
   464  
   465  func (s *sectionHandler) skip() int {
   466  	if s.skipAll {
   467  		return -1
   468  	}
   469  
   470  	idx := s.skipFunc(s.l)
   471  	if idx == -1 {
   472  		s.skipAll = true
   473  	}
   474  	return idx
   475  }
   476  
   477  func lexMainSection(l *pageLexer) stateFunc {
   478  	if l.isEOF() {
   479  		return lexDone
   480  	}
   481  
   482  	if l.isInHTMLComment {
   483  		return lexEndFrontMatterHTMLComment
   484  	}
   485  
   486  	// Fast forward as far as possible.
   487  	skip := l.sectionHandlers.skip()
   488  
   489  	if skip == -1 {
   490  		l.pos = len(l.input)
   491  		return lexDone
   492  	} else if skip > 0 {
   493  		l.pos += skip
   494  	}
   495  
   496  	next := l.sectionHandlers.lex(lexMainSection)
   497  	if next != nil {
   498  		return next
   499  	}
   500  
   501  	l.pos = len(l.input)
   502  	return lexDone
   503  }
   504  
   505  func lexDone(l *pageLexer) stateFunc {
   506  	// Done!
   507  	if l.pos > l.start {
   508  		l.emit(tText)
   509  	}
   510  	l.emit(tEOF)
   511  	return nil
   512  }
   513  
   514  func (l *pageLexer) printCurrentInput() {
   515  	fmt.Printf("input[%d:]: %q", l.pos, string(l.input[l.pos:]))
   516  }
   517  
   518  // state helpers
   519  
   520  func (l *pageLexer) index(sep []byte) int {
   521  	return bytes.Index(l.input[l.pos:], sep)
   522  }
   523  
   524  func (l *pageLexer) indexByte(sep byte) int {
   525  	return bytes.IndexByte(l.input[l.pos:], sep)
   526  }
   527  
   528  func (l *pageLexer) hasPrefix(prefix []byte) bool {
   529  	return bytes.HasPrefix(l.input[l.pos:], prefix)
   530  }
   531  
   532  // helper functions
   533  
   534  // returns the min index >= 0
   535  func minIndex(indices ...int) int {
   536  	min := -1
   537  
   538  	for _, j := range indices {
   539  		if j < 0 {
   540  			continue
   541  		}
   542  		if min == -1 {
   543  			min = j
   544  		} else if j < min {
   545  			min = j
   546  		}
   547  	}
   548  	return min
   549  }
   550  
   551  func indexNonWhiteSpace(s []byte, in rune) int {
   552  	idx := bytes.IndexFunc(s, func(r rune) bool {
   553  		return !unicode.IsSpace(r)
   554  	})
   555  
   556  	if idx == -1 {
   557  		return -1
   558  	}
   559  
   560  	r, _ := utf8.DecodeRune(s[idx:])
   561  	if r == in {
   562  		return idx
   563  	}
   564  	return -1
   565  }
   566  
   567  func isSpace(r rune) bool {
   568  	return r == ' ' || r == '\t'
   569  }
   570  
   571  func isAlphaNumericOrHyphen(r rune) bool {
   572  	// let unquoted YouTube ids as positional params slip through (they contain hyphens)
   573  	return isAlphaNumeric(r) || r == '-'
   574  }
   575  
   576  var crLf = []rune{'\r', '\n'}
   577  
   578  func isEndOfLine(r rune) bool {
   579  	return r == '\r' || r == '\n'
   580  }
   581  
   582  func isAlphaNumeric(r rune) bool {
   583  	return r == '_' || unicode.IsLetter(r) || unicode.IsDigit(r)
   584  }