github.com/anakojm/hugo-katex@v0.0.0-20231023141351-42d6f5de9c0b/parser/pageparser/pagelexer.go (about) 1 // Copyright 2018 The Hugo Authors. All rights reserved. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // http://www.apache.org/licenses/LICENSE-2.0 7 // 8 // Unless required by applicable law or agreed to in writing, software 9 // distributed under the License is distributed on an "AS IS" BASIS, 10 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 // See the License for the specific language governing permissions and 12 // limitations under the License. 13 14 package pageparser 15 16 import ( 17 "bytes" 18 "fmt" 19 "unicode" 20 "unicode/utf8" 21 ) 22 23 const eof = -1 24 25 // returns the next state in scanner. 26 type stateFunc func(*pageLexer) stateFunc 27 28 type pageLexer struct { 29 input []byte 30 stateStart stateFunc 31 state stateFunc 32 pos int // input position 33 start int // item start position 34 width int // width of last element 35 36 // Contains lexers for shortcodes and other main section 37 // elements. 38 sectionHandlers *sectionHandlers 39 40 cfg Config 41 42 // The summary divider to look for. 43 summaryDivider []byte 44 // Set when we have parsed any summary divider 45 summaryDividerChecked bool 46 // Whether we're in a HTML comment. 47 isInHTMLComment bool 48 49 lexerShortcodeState 50 51 // items delivered to client 52 items Items 53 } 54 55 // Implement the Result interface 56 func (l *pageLexer) Iterator() *Iterator { 57 return NewIterator(l.items) 58 } 59 60 func (l *pageLexer) Input() []byte { 61 return l.input 62 } 63 64 type Config struct { 65 EnableEmoji bool 66 } 67 68 // note: the input position here is normally 0 (start), but 69 // can be set if position of first shortcode is known 70 func newPageLexer(input []byte, stateStart stateFunc, cfg Config) *pageLexer { 71 lexer := &pageLexer{ 72 input: input, 73 stateStart: stateStart, 74 cfg: cfg, 75 lexerShortcodeState: lexerShortcodeState{ 76 currLeftDelimItem: tLeftDelimScNoMarkup, 77 currRightDelimItem: tRightDelimScNoMarkup, 78 openShortcodes: make(map[string]bool), 79 }, 80 items: make([]Item, 0, 5), 81 } 82 83 lexer.sectionHandlers = createSectionHandlers(lexer) 84 85 return lexer 86 } 87 88 // main loop 89 func (l *pageLexer) run() *pageLexer { 90 for l.state = l.stateStart; l.state != nil; { 91 l.state = l.state(l) 92 } 93 return l 94 } 95 96 // Page syntax 97 var ( 98 byteOrderMark = '\ufeff' 99 summaryDivider = []byte("<!--more-->") 100 summaryDividerOrg = []byte("# more") 101 delimTOML = []byte("+++") 102 delimYAML = []byte("---") 103 delimOrg = []byte("#+") 104 htmlCommentStart = []byte("<!--") 105 htmlCommentEnd = []byte("-->") 106 107 emojiDelim = byte(':') 108 ) 109 110 func (l *pageLexer) next() rune { 111 if l.pos >= len(l.input) { 112 l.width = 0 113 return eof 114 } 115 116 runeValue, runeWidth := utf8.DecodeRune(l.input[l.pos:]) 117 l.width = runeWidth 118 l.pos += l.width 119 120 return runeValue 121 } 122 123 // peek, but no consume 124 func (l *pageLexer) peek() rune { 125 r := l.next() 126 l.backup() 127 return r 128 } 129 130 // steps back one 131 func (l *pageLexer) backup() { 132 l.pos -= l.width 133 } 134 135 func (l *pageLexer) append(item Item) { 136 if item.Pos() < len(l.input) { 137 item.firstByte = l.input[item.Pos()] 138 } 139 l.items = append(l.items, item) 140 } 141 142 // sends an item back to the client. 143 func (l *pageLexer) emit(t ItemType) { 144 defer func() { 145 l.start = l.pos 146 }() 147 148 if t == tText { 149 // Identify any trailing whitespace/intendation. 150 // We currently only care about the last one. 151 for i := l.pos - 1; i >= l.start; i-- { 152 b := l.input[i] 153 if b != ' ' && b != '\t' && b != '\r' && b != '\n' { 154 break 155 } 156 if i == l.start && b != '\n' { 157 l.append(Item{Type: tIndentation, low: l.start, high: l.pos}) 158 return 159 } else if b == '\n' && i < l.pos-1 { 160 l.append(Item{Type: t, low: l.start, high: i + 1}) 161 l.append(Item{Type: tIndentation, low: i + 1, high: l.pos}) 162 return 163 } else if b == '\n' && i == l.pos-1 { 164 break 165 } 166 167 } 168 } 169 170 l.append(Item{Type: t, low: l.start, high: l.pos}) 171 172 } 173 174 // sends a string item back to the client. 175 func (l *pageLexer) emitString(t ItemType) { 176 l.append(Item{Type: t, low: l.start, high: l.pos, isString: true}) 177 l.start = l.pos 178 } 179 180 func (l *pageLexer) isEOF() bool { 181 return l.pos >= len(l.input) 182 } 183 184 // special case, do not send '\\' back to client 185 func (l *pageLexer) ignoreEscapesAndEmit(t ItemType, isString bool) { 186 i := l.start 187 k := i 188 189 var segments []lowHigh 190 191 for i < l.pos { 192 r, w := utf8.DecodeRune(l.input[i:l.pos]) 193 if r == '\\' { 194 if i > k { 195 segments = append(segments, lowHigh{k, i}) 196 } 197 // See issue #10236. 198 // We don't send the backslash back to the client, 199 // which makes the end parsing simpler. 200 // This means that we cannot render the AST back to be 201 // exactly the same as the input, 202 // but that was also the situation before we introduced the issue in #10236. 203 k = i + w 204 } 205 i += w 206 } 207 208 if k < l.pos { 209 segments = append(segments, lowHigh{k, l.pos}) 210 } 211 212 if len(segments) > 0 { 213 l.append(Item{Type: t, segments: segments}) 214 } 215 216 l.start = l.pos 217 218 } 219 220 // gets the current value (for debugging and error handling) 221 func (l *pageLexer) current() []byte { 222 return l.input[l.start:l.pos] 223 } 224 225 // ignore current element 226 func (l *pageLexer) ignore() { 227 l.start = l.pos 228 } 229 230 var lf = []byte("\n") 231 232 // nil terminates the parser 233 func (l *pageLexer) errorf(format string, args ...any) stateFunc { 234 l.append(Item{Type: tError, Err: fmt.Errorf(format, args...)}) 235 return nil 236 } 237 238 func (l *pageLexer) consumeCRLF() bool { 239 var consumed bool 240 for _, r := range crLf { 241 if l.next() != r { 242 l.backup() 243 } else { 244 consumed = true 245 } 246 } 247 return consumed 248 } 249 250 func (l *pageLexer) consumeToNextLine() { 251 for { 252 r := l.next() 253 if r == eof || isEndOfLine(r) { 254 return 255 } 256 } 257 } 258 259 func (l *pageLexer) consumeToSpace() { 260 for { 261 r := l.next() 262 if r == eof || unicode.IsSpace(r) { 263 l.backup() 264 return 265 } 266 } 267 } 268 269 func (l *pageLexer) consumeSpace() { 270 for { 271 r := l.next() 272 if r == eof || !unicode.IsSpace(r) { 273 l.backup() 274 return 275 } 276 } 277 } 278 279 // lex a string starting at ":" 280 func lexEmoji(l *pageLexer) stateFunc { 281 pos := l.pos + 1 282 valid := false 283 284 for i := pos; i < len(l.input); i++ { 285 if i > pos && l.input[i] == emojiDelim { 286 pos = i + 1 287 valid = true 288 break 289 } 290 r, _ := utf8.DecodeRune(l.input[i:]) 291 if !(isAlphaNumericOrHyphen(r) || r == '+') { 292 break 293 } 294 } 295 296 if valid { 297 l.pos = pos 298 l.emit(TypeEmoji) 299 } else { 300 l.pos++ 301 l.emit(tText) 302 } 303 304 return lexMainSection 305 } 306 307 type sectionHandlers struct { 308 l *pageLexer 309 310 // Set when none of the sections are found so we 311 // can safely stop looking and skip to the end. 312 skipAll bool 313 314 handlers []*sectionHandler 315 skipIndexes []int 316 } 317 318 func (s *sectionHandlers) skip() int { 319 if s.skipAll { 320 return -1 321 } 322 323 s.skipIndexes = s.skipIndexes[:0] 324 var shouldSkip bool 325 for _, skipper := range s.handlers { 326 idx := skipper.skip() 327 if idx != -1 { 328 shouldSkip = true 329 s.skipIndexes = append(s.skipIndexes, idx) 330 } 331 } 332 333 if !shouldSkip { 334 s.skipAll = true 335 return -1 336 } 337 338 return minIndex(s.skipIndexes...) 339 } 340 341 func createSectionHandlers(l *pageLexer) *sectionHandlers { 342 shortCodeHandler := §ionHandler{ 343 l: l, 344 skipFunc: func(l *pageLexer) int { 345 return l.index(leftDelimSc) 346 }, 347 lexFunc: func(origin stateFunc, l *pageLexer) (stateFunc, bool) { 348 if !l.isShortCodeStart() { 349 return origin, false 350 } 351 352 if l.isInline { 353 // If we're inside an inline shortcode, the only valid shortcode markup is 354 // the markup which closes it. 355 b := l.input[l.pos+3:] 356 end := indexNonWhiteSpace(b, '/') 357 if end != len(l.input)-1 { 358 b = bytes.TrimSpace(b[end+1:]) 359 if end == -1 || !bytes.HasPrefix(b, []byte(l.currShortcodeName+" ")) { 360 return l.errorf("inline shortcodes do not support nesting"), true 361 } 362 } 363 } 364 365 if l.hasPrefix(leftDelimScWithMarkup) { 366 l.currLeftDelimItem = tLeftDelimScWithMarkup 367 l.currRightDelimItem = tRightDelimScWithMarkup 368 } else { 369 l.currLeftDelimItem = tLeftDelimScNoMarkup 370 l.currRightDelimItem = tRightDelimScNoMarkup 371 } 372 373 return lexShortcodeLeftDelim, true 374 }, 375 } 376 377 summaryDividerHandler := §ionHandler{ 378 l: l, 379 skipFunc: func(l *pageLexer) int { 380 if l.summaryDividerChecked || l.summaryDivider == nil { 381 return -1 382 } 383 return l.index(l.summaryDivider) 384 }, 385 lexFunc: func(origin stateFunc, l *pageLexer) (stateFunc, bool) { 386 if !l.hasPrefix(l.summaryDivider) { 387 return origin, false 388 } 389 390 l.summaryDividerChecked = true 391 l.pos += len(l.summaryDivider) 392 // This makes it a little easier to reason about later. 393 l.consumeSpace() 394 l.emit(TypeLeadSummaryDivider) 395 396 return origin, true 397 }, 398 } 399 400 handlers := []*sectionHandler{shortCodeHandler, summaryDividerHandler} 401 402 if l.cfg.EnableEmoji { 403 emojiHandler := §ionHandler{ 404 l: l, 405 skipFunc: func(l *pageLexer) int { 406 return l.indexByte(emojiDelim) 407 }, 408 lexFunc: func(origin stateFunc, l *pageLexer) (stateFunc, bool) { 409 return lexEmoji, true 410 }, 411 } 412 413 handlers = append(handlers, emojiHandler) 414 } 415 416 return §ionHandlers{ 417 l: l, 418 handlers: handlers, 419 skipIndexes: make([]int, len(handlers)), 420 } 421 } 422 423 func (s *sectionHandlers) lex(origin stateFunc) stateFunc { 424 if s.skipAll { 425 return nil 426 } 427 428 if s.l.pos > s.l.start { 429 s.l.emit(tText) 430 } 431 432 for _, handler := range s.handlers { 433 if handler.skipAll { 434 continue 435 } 436 437 next, handled := handler.lexFunc(origin, handler.l) 438 if next == nil || handled { 439 return next 440 } 441 } 442 443 // Not handled by the above. 444 s.l.pos++ 445 446 return origin 447 } 448 449 type sectionHandler struct { 450 l *pageLexer 451 452 // No more sections of this type. 453 skipAll bool 454 455 // Returns the index of the next match, -1 if none found. 456 skipFunc func(l *pageLexer) int 457 458 // Lex lexes the current section and returns the next state func and 459 // a bool telling if this section was handled. 460 // Note that returning nil as the next state will terminate the 461 // lexer. 462 lexFunc func(origin stateFunc, l *pageLexer) (stateFunc, bool) 463 } 464 465 func (s *sectionHandler) skip() int { 466 if s.skipAll { 467 return -1 468 } 469 470 idx := s.skipFunc(s.l) 471 if idx == -1 { 472 s.skipAll = true 473 } 474 return idx 475 } 476 477 func lexMainSection(l *pageLexer) stateFunc { 478 if l.isEOF() { 479 return lexDone 480 } 481 482 if l.isInHTMLComment { 483 return lexEndFrontMatterHTMLComment 484 } 485 486 // Fast forward as far as possible. 487 skip := l.sectionHandlers.skip() 488 489 if skip == -1 { 490 l.pos = len(l.input) 491 return lexDone 492 } else if skip > 0 { 493 l.pos += skip 494 } 495 496 next := l.sectionHandlers.lex(lexMainSection) 497 if next != nil { 498 return next 499 } 500 501 l.pos = len(l.input) 502 return lexDone 503 } 504 505 func lexDone(l *pageLexer) stateFunc { 506 // Done! 507 if l.pos > l.start { 508 l.emit(tText) 509 } 510 l.emit(tEOF) 511 return nil 512 } 513 514 func (l *pageLexer) printCurrentInput() { 515 fmt.Printf("input[%d:]: %q", l.pos, string(l.input[l.pos:])) 516 } 517 518 // state helpers 519 520 func (l *pageLexer) index(sep []byte) int { 521 return bytes.Index(l.input[l.pos:], sep) 522 } 523 524 func (l *pageLexer) indexByte(sep byte) int { 525 return bytes.IndexByte(l.input[l.pos:], sep) 526 } 527 528 func (l *pageLexer) hasPrefix(prefix []byte) bool { 529 return bytes.HasPrefix(l.input[l.pos:], prefix) 530 } 531 532 // helper functions 533 534 // returns the min index >= 0 535 func minIndex(indices ...int) int { 536 min := -1 537 538 for _, j := range indices { 539 if j < 0 { 540 continue 541 } 542 if min == -1 { 543 min = j 544 } else if j < min { 545 min = j 546 } 547 } 548 return min 549 } 550 551 func indexNonWhiteSpace(s []byte, in rune) int { 552 idx := bytes.IndexFunc(s, func(r rune) bool { 553 return !unicode.IsSpace(r) 554 }) 555 556 if idx == -1 { 557 return -1 558 } 559 560 r, _ := utf8.DecodeRune(s[idx:]) 561 if r == in { 562 return idx 563 } 564 return -1 565 } 566 567 func isSpace(r rune) bool { 568 return r == ' ' || r == '\t' 569 } 570 571 func isAlphaNumericOrHyphen(r rune) bool { 572 // let unquoted YouTube ids as positional params slip through (they contain hyphens) 573 return isAlphaNumeric(r) || r == '-' 574 } 575 576 var crLf = []rune{'\r', '\n'} 577 578 func isEndOfLine(r rune) bool { 579 return r == '\r' || r == '\n' 580 } 581 582 func isAlphaNumeric(r rune) bool { 583 return r == '_' || unicode.IsLetter(r) || unicode.IsDigit(r) 584 }