github.com/olliephillips/hugo@v0.42.2/parser/page.go (about) 1 // Copyright 2016n The Hugo Authors. All rights reserved. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // http://www.apache.org/licenses/LICENSE-2.0 7 // 8 // Unless required by applicable law or agreed to in writing, software 9 // distributed under the License is distributed on an "AS IS" BASIS, 10 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 // See the License for the specific language governing permissions and 12 // limitations under the License. 13 14 package parser 15 16 import ( 17 "bufio" 18 "bytes" 19 "fmt" 20 "io" 21 "regexp" 22 "strings" 23 "unicode" 24 25 "github.com/chaseadamsio/goorgeous" 26 ) 27 28 const ( 29 // TODO(bep) Do we really have to export these? 30 31 // HTMLLead identifies the start of HTML documents. 32 HTMLLead = "<" 33 // YAMLLead identifies the start of YAML frontmatter. 34 YAMLLead = "-" 35 // YAMLDelimUnix identifies the end of YAML front matter on Unix. 36 YAMLDelimUnix = "---\n" 37 // YAMLDelimDOS identifies the end of YAML front matter on Windows. 38 YAMLDelimDOS = "---\r\n" 39 // YAMLDelim identifies the YAML front matter delimiter. 40 YAMLDelim = "---" 41 // TOMLLead identifies the start of TOML front matter. 42 TOMLLead = "+" 43 // TOMLDelimUnix identifies the end of TOML front matter on Unix. 44 TOMLDelimUnix = "+++\n" 45 // TOMLDelimDOS identifies the end of TOML front matter on Windows. 46 TOMLDelimDOS = "+++\r\n" 47 // TOMLDelim identifies the TOML front matter delimiter. 48 TOMLDelim = "+++" 49 // JSONLead identifies the start of JSON frontmatter. 50 JSONLead = "{" 51 // HTMLCommentStart identifies the start of HTML comment. 52 HTMLCommentStart = "<!--" 53 // HTMLCommentEnd identifies the end of HTML comment. 54 HTMLCommentEnd = "-->" 55 // BOM Unicode byte order marker 56 BOM = '\ufeff' 57 ) 58 59 var ( 60 delims = regexp.MustCompile( 61 "^(" + regexp.QuoteMeta(YAMLDelim) + `\s*\n|` + regexp.QuoteMeta(TOMLDelim) + `\s*\n|` + regexp.QuoteMeta(JSONLead) + ")", 62 ) 63 ) 64 65 // Page represents a parsed content page. 66 type Page interface { 67 // FrontMatter contains the raw frontmatter with relevant delimiters. 68 FrontMatter() []byte 69 70 // Content contains the raw page content. 71 Content() []byte 72 73 // IsRenderable denotes that the page should be rendered. 74 IsRenderable() bool 75 76 // Metadata returns the unmarshalled frontmatter data. 77 Metadata() (map[string]interface{}, error) 78 } 79 80 // page implements the Page interface. 81 type page struct { 82 render bool 83 frontmatter []byte 84 content []byte 85 } 86 87 // Content returns the raw page content. 88 func (p *page) Content() []byte { 89 return p.content 90 } 91 92 // FrontMatter contains the raw frontmatter with relevant delimiters. 93 func (p *page) FrontMatter() []byte { 94 return p.frontmatter 95 } 96 97 // IsRenderable denotes that the page should be rendered. 98 func (p *page) IsRenderable() bool { 99 return p.render 100 } 101 102 // Metadata returns the unmarshalled frontmatter data. 103 func (p *page) Metadata() (meta map[string]interface{}, err error) { 104 frontmatter := p.FrontMatter() 105 106 if len(frontmatter) != 0 { 107 fm := DetectFrontMatter(rune(frontmatter[0])) 108 if fm != nil { 109 meta, err = fm.Parse(frontmatter) 110 } 111 } 112 return 113 } 114 115 // ReadFrom reads the content from an io.Reader and constructs a page. 116 func ReadFrom(r io.Reader) (p Page, err error) { 117 reader := bufio.NewReader(r) 118 119 // chomp BOM and assume UTF-8 120 if err = chompBOM(reader); err != nil && err != io.EOF { 121 return 122 } 123 if err = chompWhitespace(reader); err != nil && err != io.EOF { 124 return 125 } 126 if err = chompFrontmatterStartComment(reader); err != nil && err != io.EOF { 127 return 128 } 129 130 firstLine, err := peekLine(reader) 131 if err != nil && err != io.EOF { 132 return 133 } 134 135 newp := new(page) 136 newp.render = shouldRender(firstLine) 137 138 if newp.render && isFrontMatterDelim(firstLine) { 139 left, right := determineDelims(firstLine) 140 fm, err := extractFrontMatterDelims(reader, left, right) 141 if err != nil { 142 return nil, err 143 } 144 newp.frontmatter = fm 145 } else if newp.render && goorgeous.IsKeyword(firstLine) { 146 fm, err := goorgeous.ExtractOrgHeaders(reader) 147 if err != nil { 148 return nil, err 149 } 150 newp.frontmatter = fm 151 } 152 153 content, err := extractContent(reader) 154 if err != nil { 155 return nil, err 156 } 157 158 newp.content = content 159 160 return newp, nil 161 } 162 163 // chompBOM scans any leading Unicode Byte Order Markers from r. 164 func chompBOM(r io.RuneScanner) (err error) { 165 for { 166 c, _, err := r.ReadRune() 167 if err != nil { 168 return err 169 } 170 if c != BOM { 171 r.UnreadRune() 172 return nil 173 } 174 } 175 } 176 177 // chompWhitespace scans any leading Unicode whitespace from r. 178 func chompWhitespace(r io.RuneScanner) (err error) { 179 for { 180 c, _, err := r.ReadRune() 181 if err != nil { 182 return err 183 } 184 if !unicode.IsSpace(c) { 185 r.UnreadRune() 186 return nil 187 } 188 } 189 } 190 191 // chompFrontmatterStartComment checks r for a leading HTML comment. If a 192 // comment is found, it is read from r and then whitespace is trimmed from the 193 // beginning of r. 194 func chompFrontmatterStartComment(r *bufio.Reader) (err error) { 195 candidate, err := r.Peek(32) 196 if err != nil { 197 return err 198 } 199 200 str := string(candidate) 201 if strings.HasPrefix(str, HTMLCommentStart) { 202 lineEnd := strings.IndexAny(str, "\n") 203 if lineEnd == -1 { 204 //TODO: if we can't find it, Peek more? 205 return nil 206 } 207 testStr := strings.TrimSuffix(str[0:lineEnd], "\r") 208 if strings.Contains(testStr, HTMLCommentEnd) { 209 return nil 210 } 211 buf := make([]byte, lineEnd) 212 if _, err = r.Read(buf); err != nil { 213 return 214 } 215 if err = chompWhitespace(r); err != nil { 216 return err 217 } 218 } 219 220 return nil 221 } 222 223 // chompFrontmatterEndComment checks r for a trailing HTML comment. 224 func chompFrontmatterEndComment(r *bufio.Reader) (err error) { 225 candidate, err := r.Peek(32) 226 if err != nil { 227 return err 228 } 229 230 str := string(candidate) 231 lineEnd := strings.IndexAny(str, "\n") 232 if lineEnd == -1 { 233 return nil 234 } 235 testStr := strings.TrimSuffix(str[0:lineEnd], "\r") 236 if strings.Contains(testStr, HTMLCommentStart) { 237 return nil 238 } 239 240 //TODO: if we can't find it, Peek more? 241 if strings.HasSuffix(testStr, HTMLCommentEnd) { 242 buf := make([]byte, lineEnd) 243 if _, err = r.Read(buf); err != nil { 244 return 245 } 246 if err = chompWhitespace(r); err != nil { 247 return err 248 } 249 } 250 251 return nil 252 } 253 254 func peekLine(r *bufio.Reader) (line []byte, err error) { 255 firstFive, err := r.Peek(5) 256 if err != nil { 257 return 258 } 259 idx := bytes.IndexByte(firstFive, '\n') 260 if idx == -1 { 261 return firstFive, nil 262 } 263 idx++ // include newline. 264 return firstFive[:idx], nil 265 } 266 267 func shouldRender(lead []byte) (frontmatter bool) { 268 if len(lead) <= 0 { 269 return 270 } 271 272 if bytes.Equal(lead[:1], []byte(HTMLLead)) { 273 return 274 } 275 return true 276 } 277 278 func isFrontMatterDelim(data []byte) bool { 279 return delims.Match(data) 280 } 281 282 func determineDelims(firstLine []byte) (left, right []byte) { 283 switch firstLine[0] { 284 case YAMLLead[0]: 285 return []byte(YAMLDelim), []byte(YAMLDelim) 286 case TOMLLead[0]: 287 return []byte(TOMLDelim), []byte(TOMLDelim) 288 case JSONLead[0]: 289 return []byte(JSONLead), []byte("}") 290 default: 291 panic(fmt.Sprintf("Unable to determine delims from %q", firstLine)) 292 } 293 } 294 295 // extractFrontMatterDelims takes a frontmatter from the content bufio.Reader. 296 // Beginning white spaces of the bufio.Reader must be trimmed before call this 297 // function. 298 func extractFrontMatterDelims(r *bufio.Reader, left, right []byte) (fm []byte, err error) { 299 var ( 300 c byte 301 buf bytes.Buffer 302 level int 303 sameDelim = bytes.Equal(left, right) 304 inQuote bool 305 escapeState int 306 ) 307 // Frontmatter must start with a delimiter. To check it first, 308 // pre-reads beginning delimiter length - 1 bytes from Reader 309 for i := 0; i < len(left)-1; i++ { 310 if c, err = r.ReadByte(); err != nil { 311 return nil, fmt.Errorf("unable to read frontmatter at filepos %d: %s\n%.100s...", buf.Len(), err, buf.String()) 312 } 313 if err = buf.WriteByte(c); err != nil { 314 return nil, err 315 } 316 } 317 318 // Reads a character from Reader one by one and checks it matches the 319 // last character of one of delimiters to find the last character of 320 // frontmatter. If it matches, makes sure it contains the delimiter 321 // and if so, also checks it is followed by CR+LF or LF when YAML, 322 // TOML case. In JSON case, nested delimiters must be parsed and it 323 // is expected that the delimiter only contains one character. 324 for { 325 if c, err = r.ReadByte(); err != nil { 326 return nil, fmt.Errorf("unable to read frontmatter at filepos %d: %s\n%.100s...", buf.Len(), err, buf.String()) 327 } 328 if err = buf.WriteByte(c); err != nil { 329 return nil, err 330 } 331 332 switch c { 333 case '"': 334 if escapeState != 1 { 335 inQuote = !inQuote 336 } 337 case '\\': 338 escapeState++ 339 case left[len(left)-1]: 340 if sameDelim { // YAML, TOML case 341 if bytes.HasSuffix(buf.Bytes(), left) && (buf.Len() == len(left) || buf.Bytes()[buf.Len()-len(left)-1] == '\n') { 342 nextByte: 343 c, err = r.ReadByte() 344 if err != nil { 345 // It is ok that the end delimiter ends with EOF 346 if err != io.EOF || level != 1 { 347 return nil, fmt.Errorf("unable to read frontmatter at filepos %d: %s\n%.100s...", buf.Len(), err, buf.String()) 348 } 349 } else { 350 switch c { 351 case '\n': 352 // ok 353 case ' ': 354 // Consume this byte and try to match again 355 goto nextByte 356 case '\r': 357 if err = buf.WriteByte(c); err != nil { 358 return nil, err 359 } 360 if c, err = r.ReadByte(); err != nil { 361 return nil, fmt.Errorf("unable to read frontmatter at filepos %d: %s\n%.100s...", buf.Len(), err, buf.String()) 362 } 363 if c != '\n' { 364 return nil, fmt.Errorf("frontmatter delimiter must be followed by CR+LF or LF but those can't be found at filepos %d", buf.Len()) 365 } 366 default: 367 return nil, fmt.Errorf("frontmatter delimiter must be followed by CR+LF or LF but those can't be found at filepos %d", buf.Len()) 368 } 369 if err = buf.WriteByte(c); err != nil { 370 return nil, err 371 } 372 } 373 if level == 0 { 374 level = 1 375 } else { 376 level = 0 377 } 378 } 379 } else { // JSON case 380 if !inQuote { 381 level++ 382 } 383 } 384 case right[len(right)-1]: // JSON case only reaches here 385 if !inQuote { 386 level-- 387 } 388 } 389 390 if level == 0 { 391 // Consumes white spaces immediately behind frontmatter 392 if err = chompWhitespace(r); err != nil && err != io.EOF { 393 return nil, err 394 } 395 if err = chompFrontmatterEndComment(r); err != nil && err != io.EOF { 396 return nil, err 397 } 398 399 return buf.Bytes(), nil 400 } 401 402 if c != '\\' { 403 escapeState = 0 404 } 405 406 } 407 } 408 409 func extractContent(r io.Reader) (content []byte, err error) { 410 wr := new(bytes.Buffer) 411 if _, err = wr.ReadFrom(r); err != nil { 412 return 413 } 414 return wr.Bytes(), nil 415 }