// Copyright 2018 The Hugo Authors. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package pageparser

import (
	"bytes"
	"fmt"
	"unicode"
	"unicode/utf8"
)

// eof is the sentinel rune returned by next() when the input is exhausted.
const eof = -1

// stateFunc returns the next state in scanner.
// Returning nil terminates the lexer's main loop (see run).
type stateFunc func(*pageLexer) stateFunc

// pageLexer holds all scanning state for one page source.
type pageLexer struct {
	input      []byte    // the content being scanned
	stateStart stateFunc // initial state used by run
	state      stateFunc // current state
	pos        int       // input position
	start      int       // item start position
	width      int       // width of last element (bytes of the last rune read; 0 at EOF)

	// Contains lexers for shortcodes and other main section
	// elements.
	sectionHandlers *sectionHandlers

	cfg Config

	// The summary divider to look for.
	summaryDivider []byte
	// Set when we have parsed any summary divider
	summaryDividerChecked bool
	// Whether we're in a HTML comment.
	isInHTMLComment bool

	lexerShortcodeState

	// items delivered to client
	items Items
}

// Iterator implements the Result interface.
func (l *pageLexer) Iterator() *Iterator {
	return l.newIterator()
}

// Input implements the Result interface; it returns the raw page source.
func (l *pageLexer) Input() []byte {
	return l.input
}

// Config holds the feature toggles the lexer honors.
type Config struct {
	EnableEmoji bool
}

// newPageLexer creates a lexer for input starting in stateStart.
// note: the input position here is normally 0 (start), but
// can be set if position of first shortcode is known
func newPageLexer(input []byte, stateStart stateFunc, cfg Config) *pageLexer {
	lexer := &pageLexer{
		input:      input,
		stateStart: stateStart,
		cfg:        cfg,
		lexerShortcodeState: lexerShortcodeState{
			currLeftDelimItem:  tLeftDelimScNoMarkup,
			currRightDelimItem: tRightDelimScNoMarkup,
			openShortcodes:     make(map[string]bool),
		},
		// Small initial capacity; most pages emit only a handful of items.
		items: make([]Item, 0, 5),
	}

	lexer.sectionHandlers = createSectionHandlers(lexer)

	return lexer
}

func (l *pageLexer) newIterator() *Iterator {
	return &Iterator{l: l, lastPos: -1}
}

// run is the main loop: it executes state functions until one returns nil.
func (l *pageLexer) run() *pageLexer {
	for l.state = l.stateStart; l.state != nil; {
		l.state = l.state(l)
	}
	return l
}

// Page syntax
var (
	byteOrderMark     = '\ufeff'
	summaryDivider    = []byte("<!--more-->")
	summaryDividerOrg = []byte("# more")
	delimTOML         = []byte("+++")
	delimYAML         = []byte("---")
	delimOrg          = []byte("#+")
	htmlCommentStart  = []byte("<!--")
	htmlCommentEnd    = []byte("-->")

	emojiDelim = byte(':')
)

// next consumes and returns the next rune, or eof at end of input.
func (l *pageLexer) next() rune {
	if l.pos >= len(l.input) {
		// Zero width so a subsequent backup() is a no-op at EOF.
		l.width = 0
		return eof
	}

	runeValue, runeWidth := utf8.DecodeRune(l.input[l.pos:])
	l.width = runeWidth
	l.pos += l.width
	return runeValue
}

// peek, but no consume
func (l *pageLexer) peek() rune {
	r := l.next()
	l.backup()
	return r
}

// steps back one rune; valid only once per call of next.
func (l *pageLexer) backup() {
	l.pos -= l.width
}
item back to the client. 139 func (l *pageLexer) emit(t ItemType) { 140 l.items = append(l.items, Item{t, l.start, l.input[l.start:l.pos], false}) 141 l.start = l.pos 142 } 143 144 // sends a string item back to the client. 145 func (l *pageLexer) emitString(t ItemType) { 146 l.items = append(l.items, Item{t, l.start, l.input[l.start:l.pos], true}) 147 l.start = l.pos 148 } 149 150 func (l *pageLexer) isEOF() bool { 151 return l.pos >= len(l.input) 152 } 153 154 // special case, do not send '\\' back to client 155 func (l *pageLexer) ignoreEscapesAndEmit(t ItemType, isString bool) { 156 val := bytes.Map(func(r rune) rune { 157 if r == '\\' { 158 return -1 159 } 160 return r 161 }, l.input[l.start:l.pos]) 162 l.items = append(l.items, Item{t, l.start, val, isString}) 163 l.start = l.pos 164 } 165 166 // gets the current value (for debugging and error handling) 167 func (l *pageLexer) current() []byte { 168 return l.input[l.start:l.pos] 169 } 170 171 // ignore current element 172 func (l *pageLexer) ignore() { 173 l.start = l.pos 174 } 175 176 var lf = []byte("\n") 177 178 // nil terminates the parser 179 func (l *pageLexer) errorf(format string, args ...interface{}) stateFunc { 180 l.items = append(l.items, Item{tError, l.start, []byte(fmt.Sprintf(format, args...)), true}) 181 return nil 182 } 183 184 func (l *pageLexer) consumeCRLF() bool { 185 var consumed bool 186 for _, r := range crLf { 187 if l.next() != r { 188 l.backup() 189 } else { 190 consumed = true 191 } 192 } 193 return consumed 194 } 195 196 func (l *pageLexer) consumeToNextLine() { 197 for { 198 r := l.next() 199 if r == eof || isEndOfLine(r) { 200 return 201 } 202 } 203 } 204 205 func (l *pageLexer) consumeToSpace() { 206 for { 207 r := l.next() 208 if r == eof || unicode.IsSpace(r) { 209 l.backup() 210 return 211 } 212 } 213 } 214 215 func (l *pageLexer) consumeSpace() { 216 for { 217 r := l.next() 218 if r == eof || !unicode.IsSpace(r) { 219 l.backup() 220 return 221 } 222 } 223 } 224 225 // lex a 
// lexEmoji lexes a candidate emoji string starting at ":".
// On a match like ":smile:" it emits TypeEmoji (delimiters included);
// otherwise it emits the single ":" as plain text.
func lexEmoji(l *pageLexer) stateFunc {
	pos := l.pos + 1 // skip the opening ':' (emojiDelim is a single byte)
	valid := false

	for i := pos; i < len(l.input); i++ {
		if i > pos && l.input[i] == emojiDelim {
			// Closing ':' found; include it in the emitted item.
			pos = i + 1
			valid = true
			break
		}
		// NOTE(review): this loop advances byte-wise but decodes a rune at
		// every byte offset, so any multi-byte rune ends the scan on its
		// continuation bytes — presumably intended (emoji names are ASCII).
		r, _ := utf8.DecodeRune(l.input[i:])
		if !(isAlphaNumericOrHyphen(r) || r == '+') {
			break
		}
	}

	if valid {
		l.pos = pos
		l.emit(TypeEmoji)
	} else {
		// Not an emoji after all; emit just the ':' as text.
		l.pos++
		l.emit(tText)
	}

	return lexMainSection
}

// sectionHandlers multiplexes the individual main-section lexers
// (shortcodes, the summary divider and, when enabled, emoji).
type sectionHandlers struct {
	l *pageLexer

	// Set when none of the sections are found so we
	// can safely stop looking and skip to the end.
	skipAll bool

	handlers    []*sectionHandler
	skipIndexes []int
}

// skip returns the distance (relative to the current position) to the
// nearest upcoming section match, or -1 when no handler will ever match
// again (skipAll is then latched).
func (s *sectionHandlers) skip() int {
	if s.skipAll {
		return -1
	}

	s.skipIndexes = s.skipIndexes[:0]
	var shouldSkip bool
	for _, skipper := range s.handlers {
		idx := skipper.skip()
		if idx != -1 {
			shouldSkip = true
			s.skipIndexes = append(s.skipIndexes, idx)
		}
	}

	if !shouldSkip {
		s.skipAll = true
		return -1
	}

	return minIndex(s.skipIndexes...)
}

// createSectionHandlers wires up the per-section skip/lex function pairs
// for the given lexer.
func createSectionHandlers(l *pageLexer) *sectionHandlers {
	shortCodeHandler := &sectionHandler{
		l: l,
		skipFunc: func(l *pageLexer) int {
			return l.index(leftDelimSc)
		},
		lexFunc: func(origin stateFunc, l *pageLexer) (stateFunc, bool) {
			if !l.isShortCodeStart() {
				return origin, false
			}

			if l.isInline {
				// If we're inside an inline shortcode, the only valid shortcode markup is
				// the markup which closes it.
				b := l.input[l.pos+3:]
				end := indexNonWhiteSpace(b, '/')
				// NOTE(review): comparing end (an index into b) against
				// len(l.input)-1 looks like it was meant to be len(b)-1 —
				// confirm upstream intent before changing.
				if end != len(l.input)-1 {
					b = bytes.TrimSpace(b[end+1:])
					if end == -1 || !bytes.HasPrefix(b, []byte(l.currShortcodeName+" ")) {
						return l.errorf("inline shortcodes do not support nesting"), true
					}
				}
			}

			// Pick the delimiter pair: {{< >}} (no markup) vs {{% %}} (markup).
			if l.hasPrefix(leftDelimScWithMarkup) {
				l.currLeftDelimItem = tLeftDelimScWithMarkup
				l.currRightDelimItem = tRightDelimScWithMarkup
			} else {
				l.currLeftDelimItem = tLeftDelimScNoMarkup
				l.currRightDelimItem = tRightDelimScNoMarkup
			}

			return lexShortcodeLeftDelim, true
		},
	}

	summaryDividerHandler := &sectionHandler{
		l: l,
		skipFunc: func(l *pageLexer) int {
			// Only one summary divider per page; stop looking once found
			// (or when no divider is configured).
			if l.summaryDividerChecked || l.summaryDivider == nil {
				return -1
			}
			return l.index(l.summaryDivider)
		},
		lexFunc: func(origin stateFunc, l *pageLexer) (stateFunc, bool) {
			if !l.hasPrefix(l.summaryDivider) {
				return origin, false
			}

			l.summaryDividerChecked = true
			l.pos += len(l.summaryDivider)
			// This makes it a little easier to reason about later.
			l.consumeSpace()
			l.emit(TypeLeadSummaryDivider)

			return origin, true
		},
	}

	handlers := []*sectionHandler{shortCodeHandler, summaryDividerHandler}

	if l.cfg.EnableEmoji {
		emojiHandler := &sectionHandler{
			l: l,
			skipFunc: func(l *pageLexer) int {
				return l.indexByte(emojiDelim)
			},
			lexFunc: func(origin stateFunc, l *pageLexer) (stateFunc, bool) {
				return lexEmoji, true
			},
		}

		handlers = append(handlers, emojiHandler)
	}

	return &sectionHandlers{
		l:           l,
		handlers:    handlers,
		skipIndexes: make([]int, len(handlers)),
	}
}

// lex emits any pending text, then offers the current position to each
// live handler in order. If none handles it, the lexer advances one byte
// and stays in the origin state.
func (s *sectionHandlers) lex(origin stateFunc) stateFunc {
	if s.skipAll {
		return nil
	}

	if s.l.pos > s.l.start {
		s.l.emit(tText)
	}

	for _, handler := range s.handlers {
		if handler.skipAll {
			continue
		}

		next, handled := handler.lexFunc(origin, handler.l)
		if next == nil || handled {
			return next
		}
	}

	// Not handled by the above.
	s.l.pos++

	return origin
}

// sectionHandler pairs a cheap "where is the next match" scan with the
// lexer that consumes that match.
type sectionHandler struct {
	l *pageLexer

	// No more sections of this type.
	skipAll bool

	// Returns the index of the next match, -1 if none found.
	skipFunc func(l *pageLexer) int

	// Lex lexes the current section and returns the next state func and
	// a bool telling if this section was handled.
	// Note that returning nil as the next state will terminate the
	// lexer.
	lexFunc func(origin stateFunc, l *pageLexer) (stateFunc, bool)
}

// skip returns the index of this handler's next match, latching skipAll
// once no further match exists.
func (s *sectionHandler) skip() int {
	if s.skipAll {
		return -1
	}

	idx := s.skipFunc(s.l)
	if idx == -1 {
		s.skipAll = true
	}
	return idx
}

// lexMainSection scans the page body after front matter, fast-forwarding
// to the next interesting section or finishing at EOF.
func lexMainSection(l *pageLexer) stateFunc {
	if l.isEOF() {
		return lexDone
	}

	if l.isInHTMLComment {
		return lexEndFrontMatterHTMLComment
	}

	// Fast forward as far as possible.
	skip := l.sectionHandlers.skip()

	if skip == -1 {
		// Nothing left to find; emit the rest as text via lexDone.
		l.pos = len(l.input)
		return lexDone
	} else if skip > 0 {
		l.pos += skip
	}

	next := l.sectionHandlers.lex(lexMainSection)
	if next != nil {
		return next
	}

	l.pos = len(l.input)
	return lexDone
}

// lexDone emits any trailing text plus the final EOF item and stops the lexer.
func lexDone(l *pageLexer) stateFunc {
	// Done!
	if l.pos > l.start {
		l.emit(tText)
	}
	l.emit(tEOF)
	return nil
}

// printCurrentInput dumps the unconsumed input (debugging aid only).
func (l *pageLexer) printCurrentInput() {
	fmt.Printf("input[%d:]: %q", l.pos, string(l.input[l.pos:]))
}

// state helpers

// index returns the offset of sep relative to the current position, -1 if absent.
func (l *pageLexer) index(sep []byte) int {
	return bytes.Index(l.input[l.pos:], sep)
}

// indexByte is like index for a single byte.
func (l *pageLexer) indexByte(sep byte) int {
	return bytes.IndexByte(l.input[l.pos:], sep)
}

// hasPrefix reports whether the unconsumed input starts with prefix.
func (l *pageLexer) hasPrefix(prefix []byte) bool {
	return bytes.HasPrefix(l.input[l.pos:], prefix)
}

// helper functions

// returns the min index >= 0, or -1 when no index is non-negative
func minIndex(indices ...int) int {
	min := -1

	for _, j := range indices {
		if j < 0 {
			continue
		}
		if min == -1 {
			min = j
		} else if j < min {
			min = j
		}
	}
	return min
}

// indexNonWhiteSpace returns the index of the first non-space rune in s
// if that rune equals in, otherwise -1.
func indexNonWhiteSpace(s []byte, in rune) int {
	idx := bytes.IndexFunc(s, func(r rune) bool {
		return !unicode.IsSpace(r)
	})

	if idx == -1 {
		return -1
	}

	r, _ := utf8.DecodeRune(s[idx:])
	if r == in {
		return idx
	}
	return -1
}

// isSpace reports a plain space or tab (narrower than unicode.IsSpace).
func isSpace(r rune) bool {
	return r == ' ' || r == '\t'
}

func isAlphaNumericOrHyphen(r rune) bool {
	// let unquoted YouTube ids as positional params slip through (they contain hyphens)
	return isAlphaNumeric(r) || r == '-'
}

var crLf = []rune{'\r', '\n'}

// isEndOfLine reports a carriage return or line feed.
func isEndOfLine(r rune) bool {
	return r == '\r' || r == '\n'
}

// isAlphaNumeric reports letters, digits and underscore.
func isAlphaNumeric(r rune) bool {
	return r == '_' || unicode.IsLetter(r) || unicode.IsDigit(r)
}