github.com/whatlly/hugo@v0.47.1/hugolib/shortcodeparser.go (about) 1 // Copyright 2015 The Hugo Authors. All rights reserved. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // http://www.apache.org/licenses/LICENSE-2.0 7 // 8 // Unless required by applicable law or agreed to in writing, software 9 // distributed under the License is distributed on an "AS IS" BASIS, 10 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 // See the License for the specific language governing permissions and 12 // limitations under the License. 13 14 package hugolib 15 16 import ( 17 "fmt" 18 "strings" 19 "unicode" 20 "unicode/utf8" 21 ) 22 23 // The lexical scanning below is highly inspired by the great talk given by 24 // Rob Pike called "Lexical Scanning in Go" (it's on YouTube, Google it!). 25 // See slides here: http://cuddle.googlecode.com/hg/talk/lex.html 26 27 // parsing 28 29 type pageTokens struct { 30 lexer *pagelexer 31 token [3]item // 3-item look-ahead is what we currently need 32 peekCount int 33 } 34 35 func (t *pageTokens) next() item { 36 if t.peekCount > 0 { 37 t.peekCount-- 38 } else { 39 t.token[0] = t.lexer.nextItem() 40 } 41 return t.token[t.peekCount] 42 } 43 44 // backs up one token. 45 func (t *pageTokens) backup() { 46 t.peekCount++ 47 } 48 49 // backs up two tokens. 50 func (t *pageTokens) backup2(t1 item) { 51 t.token[1] = t1 52 t.peekCount = 2 53 } 54 55 // backs up three tokens. 56 func (t *pageTokens) backup3(t2, t1 item) { 57 t.token[1] = t1 58 t.token[2] = t2 59 t.peekCount = 3 60 } 61 62 // check for non-error and non-EOF types coming next 63 func (t *pageTokens) isValueNext() bool { 64 i := t.peek() 65 return i.typ != tError && i.typ != tEOF 66 } 67 68 // look at, but do not consume, the next item 69 // repeated, sequential calls will return the same item 70 func (t *pageTokens) peek() item { 71 if t.peekCount > 0 { 72 return t.token[t.peekCount-1] 73 } 74 t.peekCount = 1 75 t.token[0] = t.lexer.nextItem() 76 return t.token[0] 77 } 78 79 // convencience method to consume the next n tokens, but back off Errors and EOF 80 func (t *pageTokens) consume(cnt int) { 81 for i := 0; i < cnt; i++ { 82 token := t.next() 83 if token.typ == tError || token.typ == tEOF { 84 t.backup() 85 break 86 } 87 } 88 } 89 90 // lexical scanning 91 92 // position (in bytes) 93 type pos int 94 95 type item struct { 96 typ itemType 97 pos pos 98 val string 99 } 100 101 func (i item) String() string { 102 switch { 103 case i.typ == tEOF: 104 return "EOF" 105 case i.typ == tError: 106 return i.val 107 case i.typ > tKeywordMarker: 108 return fmt.Sprintf("<%s>", i.val) 109 case len(i.val) > 20: 110 return fmt.Sprintf("%.20q...", i.val) 111 } 112 return fmt.Sprintf("[%s]", i.val) 113 } 114 115 type itemType int 116 117 const ( 118 tError itemType = iota 119 tEOF 120 121 // shortcode items 122 tLeftDelimScNoMarkup 123 tRightDelimScNoMarkup 124 tLeftDelimScWithMarkup 125 tRightDelimScWithMarkup 126 tScClose 127 tScName 128 tScParam 129 tScParamVal 130 131 //itemIdentifier 132 tText // plain text, used for everything outside the shortcodes 133 134 // preserved for later - keywords come after this 135 tKeywordMarker 136 ) 137 138 const eof = -1 139 140 // returns the next state in scanner. 141 type stateFunc func(*pagelexer) stateFunc 142 143 type pagelexer struct { 144 name string 145 input string 146 state stateFunc 147 pos pos // input position 148 start pos // item start position 149 width pos // width of last element 150 lastPos pos // position of the last item returned by nextItem 151 152 // shortcode state 153 currLeftDelimItem itemType 154 currRightDelimItem itemType 155 currShortcodeName string // is only set when a shortcode is in opened state 156 closingState int // > 0 = on its way to be closed 157 elementStepNum int // step number in element 158 paramElements int // number of elements (name + value = 2) found first 159 openShortcodes map[string]bool // set of shortcodes in open state 160 161 // items delivered to client 162 items []item 163 } 164 165 // note: the input position here is normally 0 (start), but 166 // can be set if position of first shortcode is known 167 func newShortcodeLexer(name, input string, inputPosition pos) *pagelexer { 168 lexer := &pagelexer{ 169 name: name, 170 input: input, 171 currLeftDelimItem: tLeftDelimScNoMarkup, 172 currRightDelimItem: tRightDelimScNoMarkup, 173 pos: inputPosition, 174 openShortcodes: make(map[string]bool), 175 items: make([]item, 0, 5), 176 } 177 lexer.runShortcodeLexer() 178 return lexer 179 } 180 181 // main loop 182 // this looks kind of funky, but it works 183 func (l *pagelexer) runShortcodeLexer() { 184 for l.state = lexTextOutsideShortcodes; l.state != nil; { 185 l.state = l.state(l) 186 } 187 } 188 189 // state functions 190 191 const ( 192 leftDelimScNoMarkup = "{{<" 193 rightDelimScNoMarkup = ">}}" 194 leftDelimScWithMarkup = "{{%" 195 rightDelimScWithMarkup = "%}}" 196 leftComment = "/*" // comments in this context us used to to mark shortcodes as "not really a shortcode" 197 rightComment = "*/" 198 ) 199 200 func (l *pagelexer) next() rune { 201 if int(l.pos) >= len(l.input) { 202 l.width = 0 203 return eof 204 } 205 206 // looks expensive, but should produce the same iteration sequence as the string range loop 207 // see: http://blog.golang.org/strings 208 runeValue, runeWidth := utf8.DecodeRuneInString(l.input[l.pos:]) 209 l.width = pos(runeWidth) 210 l.pos += l.width 211 return runeValue 212 } 213 214 // peek, but no consume 215 func (l *pagelexer) peek() rune { 216 r := l.next() 217 l.backup() 218 return r 219 } 220 221 // steps back one 222 func (l *pagelexer) backup() { 223 l.pos -= l.width 224 } 225 226 // sends an item back to the client. 227 func (l *pagelexer) emit(t itemType) { 228 l.items = append(l.items, item{t, l.start, l.input[l.start:l.pos]}) 229 l.start = l.pos 230 } 231 232 // special case, do not send '\\' back to client 233 func (l *pagelexer) ignoreEscapesAndEmit(t itemType) { 234 val := strings.Map(func(r rune) rune { 235 if r == '\\' { 236 return -1 237 } 238 return r 239 }, l.input[l.start:l.pos]) 240 l.items = append(l.items, item{t, l.start, val}) 241 l.start = l.pos 242 } 243 244 // gets the current value (for debugging and error handling) 245 func (l *pagelexer) current() string { 246 return l.input[l.start:l.pos] 247 } 248 249 // ignore current element 250 func (l *pagelexer) ignore() { 251 l.start = l.pos 252 } 253 254 // nice to have in error logs 255 func (l *pagelexer) lineNum() int { 256 return strings.Count(l.input[:l.lastPos], "\n") + 1 257 } 258 259 // nil terminates the parser 260 func (l *pagelexer) errorf(format string, args ...interface{}) stateFunc { 261 l.items = append(l.items, item{tError, l.start, fmt.Sprintf(format, args...)}) 262 return nil 263 } 264 265 // consumes and returns the next item 266 func (l *pagelexer) nextItem() item { 267 item := l.items[0] 268 l.items = l.items[1:] 269 l.lastPos = item.pos 270 return item 271 } 272 273 // scans until an opening shortcode opening bracket. 274 // if no shortcodes, it will keep on scanning until EOF 275 func lexTextOutsideShortcodes(l *pagelexer) stateFunc { 276 for { 277 if strings.HasPrefix(l.input[l.pos:], leftDelimScWithMarkup) || strings.HasPrefix(l.input[l.pos:], leftDelimScNoMarkup) { 278 if l.pos > l.start { 279 l.emit(tText) 280 } 281 if strings.HasPrefix(l.input[l.pos:], leftDelimScWithMarkup) { 282 l.currLeftDelimItem = tLeftDelimScWithMarkup 283 l.currRightDelimItem = tRightDelimScWithMarkup 284 } else { 285 l.currLeftDelimItem = tLeftDelimScNoMarkup 286 l.currRightDelimItem = tRightDelimScNoMarkup 287 } 288 return lexShortcodeLeftDelim 289 290 } 291 if l.next() == eof { 292 break 293 } 294 } 295 // Done! 296 if l.pos > l.start { 297 l.emit(tText) 298 } 299 l.emit(tEOF) 300 return nil 301 } 302 303 func lexShortcodeLeftDelim(l *pagelexer) stateFunc { 304 l.pos += pos(len(l.currentLeftShortcodeDelim())) 305 if strings.HasPrefix(l.input[l.pos:], leftComment) { 306 return lexShortcodeComment 307 } 308 l.emit(l.currentLeftShortcodeDelimItem()) 309 l.elementStepNum = 0 310 l.paramElements = 0 311 return lexInsideShortcode 312 } 313 314 func lexShortcodeComment(l *pagelexer) stateFunc { 315 posRightComment := strings.Index(l.input[l.pos:], rightComment+l.currentRightShortcodeDelim()) 316 if posRightComment <= 1 { 317 return l.errorf("comment must be closed") 318 } 319 // we emit all as text, except the comment markers 320 l.emit(tText) 321 l.pos += pos(len(leftComment)) 322 l.ignore() 323 l.pos += pos(posRightComment - len(leftComment)) 324 l.emit(tText) 325 l.pos += pos(len(rightComment)) 326 l.ignore() 327 l.pos += pos(len(l.currentRightShortcodeDelim())) 328 l.emit(tText) 329 return lexTextOutsideShortcodes 330 } 331 332 func lexShortcodeRightDelim(l *pagelexer) stateFunc { 333 l.closingState = 0 334 l.pos += pos(len(l.currentRightShortcodeDelim())) 335 l.emit(l.currentRightShortcodeDelimItem()) 336 return lexTextOutsideShortcodes 337 } 338 339 // either: 340 // 1. param 341 // 2. "param" or "param\" 342 // 3. param="123" or param="123\" 343 // 4. param="Some \"escaped\" text" 344 func lexShortcodeParam(l *pagelexer, escapedQuoteStart bool) stateFunc { 345 346 first := true 347 nextEq := false 348 349 var r rune 350 351 for { 352 r = l.next() 353 if first { 354 if r == '"' { 355 // a positional param with quotes 356 if l.paramElements == 2 { 357 return l.errorf("got quoted positional parameter. Cannot mix named and positional parameters") 358 } 359 l.paramElements = 1 360 l.backup() 361 return lexShortcodeQuotedParamVal(l, !escapedQuoteStart, tScParam) 362 } 363 first = false 364 } else if r == '=' { 365 // a named param 366 l.backup() 367 nextEq = true 368 break 369 } 370 371 if !isAlphaNumericOrHyphen(r) { 372 l.backup() 373 break 374 } 375 } 376 377 if l.paramElements == 0 { 378 l.paramElements++ 379 380 if nextEq { 381 l.paramElements++ 382 } 383 } else { 384 if nextEq && l.paramElements == 1 { 385 return l.errorf("got named parameter '%s'. Cannot mix named and positional parameters", l.current()) 386 } else if !nextEq && l.paramElements == 2 { 387 return l.errorf("got positional parameter '%s'. Cannot mix named and positional parameters", l.current()) 388 } 389 } 390 391 l.emit(tScParam) 392 return lexInsideShortcode 393 394 } 395 396 func lexShortcodeQuotedParamVal(l *pagelexer, escapedQuotedValuesAllowed bool, typ itemType) stateFunc { 397 openQuoteFound := false 398 escapedInnerQuoteFound := false 399 escapedQuoteState := 0 400 401 Loop: 402 for { 403 switch r := l.next(); { 404 case r == '\\': 405 if l.peek() == '"' { 406 if openQuoteFound && !escapedQuotedValuesAllowed { 407 l.backup() 408 break Loop 409 } else if openQuoteFound { 410 // the coming quoute is inside 411 escapedInnerQuoteFound = true 412 escapedQuoteState = 1 413 } 414 } 415 case r == eof, r == '\n': 416 return l.errorf("unterminated quoted string in shortcode parameter-argument: '%s'", l.current()) 417 case r == '"': 418 if escapedQuoteState == 0 { 419 if openQuoteFound { 420 l.backup() 421 break Loop 422 423 } else { 424 openQuoteFound = true 425 l.ignore() 426 } 427 } else { 428 escapedQuoteState = 0 429 } 430 431 } 432 } 433 434 if escapedInnerQuoteFound { 435 l.ignoreEscapesAndEmit(typ) 436 } else { 437 l.emit(typ) 438 } 439 440 r := l.next() 441 442 if r == '\\' { 443 if l.peek() == '"' { 444 // ignore the escaped closing quote 445 l.ignore() 446 l.next() 447 l.ignore() 448 } 449 } else if r == '"' { 450 // ignore closing quote 451 l.ignore() 452 } else { 453 // handled by next state 454 l.backup() 455 } 456 457 return lexInsideShortcode 458 } 459 460 // scans an alphanumeric inside shortcode 461 func lexIdentifierInShortcode(l *pagelexer) stateFunc { 462 lookForEnd := false 463 Loop: 464 for { 465 switch r := l.next(); { 466 case isAlphaNumericOrHyphen(r): 467 // Allow forward slash inside names to make it possible to create namespaces. 468 case r == '/': 469 default: 470 l.backup() 471 word := l.input[l.start:l.pos] 472 if l.closingState > 0 && !l.openShortcodes[word] { 473 return l.errorf("closing tag for shortcode '%s' does not match start tag", word) 474 } else if l.closingState > 0 { 475 l.openShortcodes[word] = false 476 lookForEnd = true 477 } 478 479 l.closingState = 0 480 l.currShortcodeName = word 481 l.openShortcodes[word] = true 482 l.elementStepNum++ 483 l.emit(tScName) 484 break Loop 485 } 486 } 487 488 if lookForEnd { 489 return lexEndOfShortcode 490 } 491 return lexInsideShortcode 492 } 493 494 func lexEndOfShortcode(l *pagelexer) stateFunc { 495 if strings.HasPrefix(l.input[l.pos:], l.currentRightShortcodeDelim()) { 496 return lexShortcodeRightDelim 497 } 498 switch r := l.next(); { 499 case isSpace(r): 500 l.ignore() 501 default: 502 return l.errorf("unclosed shortcode") 503 } 504 return lexEndOfShortcode 505 } 506 507 // scans the elements inside shortcode tags 508 func lexInsideShortcode(l *pagelexer) stateFunc { 509 if strings.HasPrefix(l.input[l.pos:], l.currentRightShortcodeDelim()) { 510 return lexShortcodeRightDelim 511 } 512 switch r := l.next(); { 513 case r == eof: 514 // eol is allowed inside shortcodes; this may go to end of document before it fails 515 return l.errorf("unclosed shortcode action") 516 case isSpace(r), isEndOfLine(r): 517 l.ignore() 518 case r == '=': 519 l.ignore() 520 return lexShortcodeQuotedParamVal(l, l.peek() != '\\', tScParamVal) 521 case r == '/': 522 if l.currShortcodeName == "" { 523 return l.errorf("got closing shortcode, but none is open") 524 } 525 l.closingState++ 526 l.emit(tScClose) 527 case r == '\\': 528 l.ignore() 529 if l.peek() == '"' { 530 return lexShortcodeParam(l, true) 531 } 532 case l.elementStepNum > 0 && (isAlphaNumericOrHyphen(r) || r == '"'): // positional params can have quotes 533 l.backup() 534 return lexShortcodeParam(l, false) 535 case isAlphaNumeric(r): 536 l.backup() 537 return lexIdentifierInShortcode 538 default: 539 return l.errorf("unrecognized character in shortcode action: %#U. Note: Parameters with non-alphanumeric args must be quoted", r) 540 } 541 return lexInsideShortcode 542 } 543 544 // state helpers 545 546 func (l *pagelexer) currentLeftShortcodeDelimItem() itemType { 547 return l.currLeftDelimItem 548 } 549 550 func (l *pagelexer) currentRightShortcodeDelimItem() itemType { 551 return l.currRightDelimItem 552 } 553 554 func (l *pagelexer) currentLeftShortcodeDelim() string { 555 if l.currLeftDelimItem == tLeftDelimScWithMarkup { 556 return leftDelimScWithMarkup 557 } 558 return leftDelimScNoMarkup 559 560 } 561 562 func (l *pagelexer) currentRightShortcodeDelim() string { 563 if l.currRightDelimItem == tRightDelimScWithMarkup { 564 return rightDelimScWithMarkup 565 } 566 return rightDelimScNoMarkup 567 } 568 569 // helper functions 570 571 func isSpace(r rune) bool { 572 return r == ' ' || r == '\t' 573 } 574 575 func isAlphaNumericOrHyphen(r rune) bool { 576 // let unquoted YouTube ids as positional params slip through (they contain hyphens) 577 return isAlphaNumeric(r) || r == '-' 578 } 579 580 func isEndOfLine(r rune) bool { 581 return r == '\r' || r == '\n' 582 } 583 584 func isAlphaNumeric(r rune) bool { 585 return r == '_' || unicode.IsLetter(r) || unicode.IsDigit(r) 586 }