github.com/lyeb/hugo@v0.47.1/parser/page.go (about)

     1  // Copyright 2016 The Hugo Authors. All rights reserved.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  // http://www.apache.org/licenses/LICENSE-2.0
     7  //
     8  // Unless required by applicable law or agreed to in writing, software
     9  // distributed under the License is distributed on an "AS IS" BASIS,
    10  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    11  // See the License for the specific language governing permissions and
    12  // limitations under the License.
    13  
    14  package parser
    15  
    16  import (
    17  	"bufio"
    18  	"bytes"
    19  	"fmt"
    20  	"io"
    21  	"regexp"
    22  	"strings"
    23  	"unicode"
    24  
    25  	"github.com/chaseadamsio/goorgeous"
    26  )
    27  
    28  const (
    29  	// TODO(bep) Do we really have to export these?
    30  
    31  	// HTMLLead identifies the start of HTML documents.
    32  	HTMLLead = "<"
    33  	// YAMLLead identifies the start of YAML frontmatter.
    34  	YAMLLead = "-"
    35  	// YAMLDelimUnix identifies the end of YAML front matter on Unix.
    36  	YAMLDelimUnix = "---\n"
    37  	// YAMLDelimDOS identifies the end of YAML front matter on Windows.
    38  	YAMLDelimDOS = "---\r\n"
    39  	// YAMLDelim identifies the YAML front matter delimiter.
    40  	YAMLDelim = "---"
    41  	// TOMLLead identifies the start of TOML front matter.
    42  	TOMLLead = "+"
    43  	// TOMLDelimUnix identifies the end of TOML front matter on Unix.
    44  	TOMLDelimUnix = "+++\n"
    45  	// TOMLDelimDOS identifies the end of TOML front matter on Windows.
    46  	TOMLDelimDOS = "+++\r\n"
    47  	// TOMLDelim identifies the TOML front matter delimiter.
    48  	TOMLDelim = "+++"
    49  	// JSONLead identifies the start of JSON frontmatter.
    50  	JSONLead = "{"
    51  	// HTMLCommentStart identifies the start of HTML comment.
    52  	HTMLCommentStart = "<!--"
    53  	// HTMLCommentEnd identifies the end of HTML comment.
    54  	HTMLCommentEnd = "-->"
    55  	// BOM Unicode byte order marker
    56  	BOM = '\ufeff'
    57  )
    58  
    59  var (
    60  	delims = regexp.MustCompile(
    61  		"^(" + regexp.QuoteMeta(YAMLDelim) + `\s*\n|` + regexp.QuoteMeta(TOMLDelim) + `\s*\n|` + regexp.QuoteMeta(JSONLead) + ")",
    62  	)
    63  )
    64  
    65  // Page represents a parsed content page.
    66  type Page interface {
    67  	// FrontMatter contains the raw frontmatter with relevant delimiters.
    68  	FrontMatter() []byte
    69  
    70  	// Content contains the raw page content.
    71  	Content() []byte
    72  
    73  	// IsRenderable denotes that the page should be rendered.
    74  	IsRenderable() bool
    75  
    76  	// Metadata returns the unmarshalled frontmatter data.
    77  	Metadata() (map[string]interface{}, error)
    78  }
    79  
    80  // page implements the Page interface.
    81  type page struct {
    82  	render      bool
    83  	frontmatter []byte
    84  	content     []byte
    85  }
    86  
    87  // Content returns the raw page content.
    88  func (p *page) Content() []byte {
    89  	return p.content
    90  }
    91  
    92  // FrontMatter contains the raw frontmatter with relevant delimiters.
    93  func (p *page) FrontMatter() []byte {
    94  	return p.frontmatter
    95  }
    96  
    97  // IsRenderable denotes that the page should be rendered.
    98  func (p *page) IsRenderable() bool {
    99  	return p.render
   100  }
   101  
   102  // Metadata returns the unmarshalled frontmatter data.
   103  func (p *page) Metadata() (meta map[string]interface{}, err error) {
   104  	frontmatter := p.FrontMatter()
   105  
   106  	if len(frontmatter) != 0 {
   107  		fm := DetectFrontMatter(rune(frontmatter[0]))
   108  		if fm != nil {
   109  			meta, err = fm.Parse(frontmatter)
   110  		}
   111  	}
   112  	return
   113  }
   114  
   115  // ReadFrom reads the content from an io.Reader and constructs a page.
   116  func ReadFrom(r io.Reader) (p Page, err error) {
   117  	reader := bufio.NewReader(r)
   118  
   119  	// chomp BOM and assume UTF-8
   120  	if err = chompBOM(reader); err != nil && err != io.EOF {
   121  		return
   122  	}
   123  	if err = chompWhitespace(reader); err != nil && err != io.EOF {
   124  		return
   125  	}
   126  	if err = chompFrontmatterStartComment(reader); err != nil && err != io.EOF {
   127  		return
   128  	}
   129  
   130  	firstLine, err := peekLine(reader)
   131  	if err != nil && err != io.EOF {
   132  		return
   133  	}
   134  
   135  	newp := new(page)
   136  	newp.render = shouldRender(firstLine)
   137  
   138  	if newp.render && isFrontMatterDelim(firstLine) {
   139  		left, right := determineDelims(firstLine)
   140  		fm, err := extractFrontMatterDelims(reader, left, right)
   141  		if err != nil {
   142  			return nil, err
   143  		}
   144  		newp.frontmatter = fm
   145  	} else if newp.render && goorgeous.IsKeyword(firstLine) {
   146  		fm, err := goorgeous.ExtractOrgHeaders(reader)
   147  		if err != nil {
   148  			return nil, err
   149  		}
   150  		newp.frontmatter = fm
   151  	}
   152  
   153  	content, err := extractContent(reader)
   154  	if err != nil {
   155  		return nil, err
   156  	}
   157  
   158  	newp.content = content
   159  
   160  	return newp, nil
   161  }
   162  
   163  // chompBOM scans any leading Unicode Byte Order Markers from r.
   164  func chompBOM(r io.RuneScanner) (err error) {
   165  	for {
   166  		c, _, err := r.ReadRune()
   167  		if err != nil {
   168  			return err
   169  		}
   170  		if c != BOM {
   171  			r.UnreadRune()
   172  			return nil
   173  		}
   174  	}
   175  }
   176  
   177  // chompWhitespace scans any leading Unicode whitespace from r.
   178  func chompWhitespace(r io.RuneScanner) (err error) {
   179  	for {
   180  		c, _, err := r.ReadRune()
   181  		if err != nil {
   182  			return err
   183  		}
   184  		if !unicode.IsSpace(c) {
   185  			r.UnreadRune()
   186  			return nil
   187  		}
   188  	}
   189  }
   190  
   191  // chompFrontmatterStartComment checks r for a leading HTML comment.  If a
   192  // comment is found, it is read from r and then whitespace is trimmed from the
   193  // beginning of r.
   194  func chompFrontmatterStartComment(r *bufio.Reader) (err error) {
   195  	candidate, err := r.Peek(32)
   196  	if err != nil {
   197  		return err
   198  	}
   199  
   200  	str := string(candidate)
   201  	if strings.HasPrefix(str, HTMLCommentStart) {
   202  		lineEnd := strings.IndexAny(str, "\n")
   203  		if lineEnd == -1 {
   204  			//TODO: if we can't find it, Peek more?
   205  			return nil
   206  		}
   207  		testStr := strings.TrimSuffix(str[0:lineEnd], "\r")
   208  		if strings.Contains(testStr, HTMLCommentEnd) {
   209  			return nil
   210  		}
   211  		buf := make([]byte, lineEnd)
   212  		if _, err = r.Read(buf); err != nil {
   213  			return
   214  		}
   215  		if err = chompWhitespace(r); err != nil {
   216  			return err
   217  		}
   218  	}
   219  
   220  	return nil
   221  }
   222  
   223  // chompFrontmatterEndComment checks r for a trailing HTML comment.
   224  func chompFrontmatterEndComment(r *bufio.Reader) (err error) {
   225  	candidate, err := r.Peek(32)
   226  	if err != nil {
   227  		return err
   228  	}
   229  
   230  	str := string(candidate)
   231  	lineEnd := strings.IndexAny(str, "\n")
   232  	if lineEnd == -1 {
   233  		return nil
   234  	}
   235  	testStr := strings.TrimSuffix(str[0:lineEnd], "\r")
   236  	if strings.Contains(testStr, HTMLCommentStart) {
   237  		return nil
   238  	}
   239  
   240  	//TODO: if we can't find it, Peek more?
   241  	if strings.HasSuffix(testStr, HTMLCommentEnd) {
   242  		buf := make([]byte, lineEnd)
   243  		if _, err = r.Read(buf); err != nil {
   244  			return
   245  		}
   246  		if err = chompWhitespace(r); err != nil {
   247  			return err
   248  		}
   249  	}
   250  
   251  	return nil
   252  }
   253  
   254  func peekLine(r *bufio.Reader) (line []byte, err error) {
   255  	firstFive, err := r.Peek(5)
   256  	if err != nil {
   257  		return
   258  	}
   259  	idx := bytes.IndexByte(firstFive, '\n')
   260  	if idx == -1 {
   261  		return firstFive, nil
   262  	}
   263  	idx++ // include newline.
   264  	return firstFive[:idx], nil
   265  }
   266  
   267  func shouldRender(lead []byte) (frontmatter bool) {
   268  	if len(lead) <= 0 {
   269  		return
   270  	}
   271  
   272  	if bytes.Equal(lead[:1], []byte(HTMLLead)) {
   273  		return
   274  	}
   275  	return true
   276  }
   277  
   278  func isFrontMatterDelim(data []byte) bool {
   279  	return delims.Match(data)
   280  }
   281  
   282  func determineDelims(firstLine []byte) (left, right []byte) {
   283  	switch firstLine[0] {
   284  	case YAMLLead[0]:
   285  		return []byte(YAMLDelim), []byte(YAMLDelim)
   286  	case TOMLLead[0]:
   287  		return []byte(TOMLDelim), []byte(TOMLDelim)
   288  	case JSONLead[0]:
   289  		return []byte(JSONLead), []byte("}")
   290  	default:
   291  		panic(fmt.Sprintf("Unable to determine delims from %q", firstLine))
   292  	}
   293  }
   294  
   295  // extractFrontMatterDelims takes a frontmatter from the content bufio.Reader.
   296  // Beginning white spaces of the bufio.Reader must be trimmed before call this
   297  // function.
   298  func extractFrontMatterDelims(r *bufio.Reader, left, right []byte) (fm []byte, err error) {
   299  	var (
   300  		c           byte
   301  		buf         bytes.Buffer
   302  		level       int
   303  		sameDelim   = bytes.Equal(left, right)
   304  		inQuote     bool
   305  		escapeState int
   306  	)
   307  	// Frontmatter must start with a delimiter. To check it first,
   308  	// pre-reads beginning delimiter length - 1 bytes from Reader
   309  	for i := 0; i < len(left)-1; i++ {
   310  		if c, err = r.ReadByte(); err != nil {
   311  			return nil, fmt.Errorf("unable to read frontmatter at filepos %d: %s\n%.100s...", buf.Len(), err, buf.String())
   312  		}
   313  		if err = buf.WriteByte(c); err != nil {
   314  			return nil, err
   315  		}
   316  	}
   317  
   318  	// Reads a character from Reader one by one and checks it matches the
   319  	// last character of one of delimiters to find the last character of
   320  	// frontmatter. If it matches, makes sure it contains the delimiter
   321  	// and if so, also checks it is followed by CR+LF or LF when YAML,
   322  	// TOML case. In JSON case, nested delimiters must be parsed and it
   323  	// is expected that the delimiter only contains one character.
   324  	for {
   325  		if c, err = r.ReadByte(); err != nil {
   326  			return nil, fmt.Errorf("unable to read frontmatter at filepos %d: %s\n%.100s...", buf.Len(), err, buf.String())
   327  		}
   328  		if err = buf.WriteByte(c); err != nil {
   329  			return nil, err
   330  		}
   331  
   332  		switch c {
   333  		case '"':
   334  			if escapeState != 1 {
   335  				inQuote = !inQuote
   336  			}
   337  		case '\\':
   338  			escapeState++
   339  		case left[len(left)-1]:
   340  			if sameDelim { // YAML, TOML case
   341  				if bytes.HasSuffix(buf.Bytes(), left) && (buf.Len() == len(left) || buf.Bytes()[buf.Len()-len(left)-1] == '\n') {
   342  				nextByte:
   343  					c, err = r.ReadByte()
   344  					if err != nil {
   345  						// It is ok that the end delimiter ends with EOF
   346  						if err != io.EOF || level != 1 {
   347  							return nil, fmt.Errorf("unable to read frontmatter at filepos %d: %s\n%.100s...", buf.Len(), err, buf.String())
   348  						}
   349  					} else {
   350  						switch c {
   351  						case '\n':
   352  							// ok
   353  						case ' ':
   354  							// Consume this byte and try to match again
   355  							goto nextByte
   356  						case '\r':
   357  							if err = buf.WriteByte(c); err != nil {
   358  								return nil, err
   359  							}
   360  							if c, err = r.ReadByte(); err != nil {
   361  								return nil, fmt.Errorf("unable to read frontmatter at filepos %d: %s\n%.100s...", buf.Len(), err, buf.String())
   362  							}
   363  							if c != '\n' {
   364  								return nil, fmt.Errorf("frontmatter delimiter must be followed by CR+LF or LF but those can't be found at filepos %d", buf.Len())
   365  							}
   366  						default:
   367  							return nil, fmt.Errorf("frontmatter delimiter must be followed by CR+LF or LF but those can't be found at filepos %d", buf.Len())
   368  						}
   369  						if err = buf.WriteByte(c); err != nil {
   370  							return nil, err
   371  						}
   372  					}
   373  					if level == 0 {
   374  						level = 1
   375  					} else {
   376  						level = 0
   377  					}
   378  				}
   379  			} else { // JSON case
   380  				if !inQuote {
   381  					level++
   382  				}
   383  			}
   384  		case right[len(right)-1]: // JSON case only reaches here
   385  			if !inQuote {
   386  				level--
   387  			}
   388  		}
   389  
   390  		if level == 0 {
   391  			// Consumes white spaces immediately behind frontmatter
   392  			if err = chompWhitespace(r); err != nil && err != io.EOF {
   393  				return nil, err
   394  			}
   395  			if err = chompFrontmatterEndComment(r); err != nil && err != io.EOF {
   396  				return nil, err
   397  			}
   398  
   399  			return buf.Bytes(), nil
   400  		}
   401  
   402  		if c != '\\' {
   403  			escapeState = 0
   404  		}
   405  
   406  	}
   407  }
   408  
   409  func extractContent(r io.Reader) (content []byte, err error) {
   410  	wr := new(bytes.Buffer)
   411  	if _, err = wr.ReadFrom(r); err != nil {
   412  		return
   413  	}
   414  	return wr.Bytes(), nil
   415  }