github.com/linchen2chris/hugo@v0.0.0-20230307053224-cec209389705/publisher/htmlElementsCollector.go (about)

     1  // Copyright 2020 The Hugo Authors. All rights reserved.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  // http://www.apache.org/licenses/LICENSE-2.0
     7  //
     8  // Unless required by applicable law or agreed to in writing, software
     9  // distributed under the License is distributed on an "AS IS" BASIS,
    10  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    11  // See the License for the specific language governing permissions and
    12  // limitations under the License.
    13  
    14  package publisher
    15  
    16  import (
    17  	"bytes"
    18  	"regexp"
    19  	"sort"
    20  	"strings"
    21  	"sync"
    22  	"unicode"
    23  	"unicode/utf8"
    24  
    25  	"golang.org/x/net/html"
    26  
    27  	"github.com/gohugoio/hugo/helpers"
    28  )
    29  
    30  const eof = -1
    31  
    32  var (
    33  	htmlJsonFixer = strings.NewReplacer(", ", "\n")
    34  	jsonAttrRe    = regexp.MustCompile(`'?(.*?)'?:.*`)
    35  	classAttrRe   = regexp.MustCompile(`(?i)^class$|transition`)
    36  
    37  	skipInnerElementRe = regexp.MustCompile(`(?i)^(pre|textarea|script|style)`)
    38  	skipAllElementRe   = regexp.MustCompile(`(?i)^!DOCTYPE`)
    39  
    40  	exceptionList = map[string]bool{
    41  		"thead": true,
    42  		"tbody": true,
    43  		"tfoot": true,
    44  		"td":    true,
    45  		"tr":    true,
    46  	}
    47  )
    48  
    49  func newHTMLElementsCollector() *htmlElementsCollector {
    50  	return &htmlElementsCollector{
    51  		elementSet: make(map[string]bool),
    52  	}
    53  }
    54  
    55  func newHTMLElementsCollectorWriter(collector *htmlElementsCollector) *htmlElementsCollectorWriter {
    56  	w := &htmlElementsCollectorWriter{
    57  		collector: collector,
    58  		state:     htmlLexStart,
    59  	}
    60  
    61  	w.defaultLexElementInside = w.lexElementInside(htmlLexStart)
    62  
    63  	return w
    64  }
    65  
// HTMLElements holds the tag names and the class and id attribute values
// that were collected. It is serialized to JSON with the keys "tags",
// "classes" and "ids".
type HTMLElements struct {
	Tags    []string `json:"tags"`
	Classes []string `json:"classes"`
	IDs     []string `json:"ids"`
}
    72  
    73  func (h *HTMLElements) Merge(other HTMLElements) {
    74  	h.Tags = append(h.Tags, other.Tags...)
    75  	h.Classes = append(h.Classes, other.Classes...)
    76  	h.IDs = append(h.IDs, other.IDs...)
    77  
    78  	h.Tags = helpers.UniqueStringsReuse(h.Tags)
    79  	h.Classes = helpers.UniqueStringsReuse(h.Classes)
    80  	h.IDs = helpers.UniqueStringsReuse(h.IDs)
    81  }
    82  
    83  func (h *HTMLElements) Sort() {
    84  	sort.Strings(h.Tags)
    85  	sort.Strings(h.Classes)
    86  	sort.Strings(h.IDs)
    87  }
    88  
// htmlElement is the result of parsing a single start tag: its lower-cased
// tag name plus the class names and id values found in its attributes.
type htmlElement struct {
	Tag     string
	Classes []string
	IDs     []string
}
    94  
// htmlElementsCollector aggregates the elements found by the collector
// writers.
type htmlElementsCollector struct {
	// Contains the raw HTML string. We will get the same element
	// several times, and want to avoid costly reparsing when this
	// is used for aggregated data only.
	elementSet map[string]bool

	// One entry per distinct raw start tag that has been parsed.
	elements []htmlElement

	// Held by the writers while they read or update elementSet and elements.
	mu sync.RWMutex
}
   105  
   106  func (c *htmlElementsCollector) getHTMLElements() HTMLElements {
   107  	var (
   108  		classes []string
   109  		ids     []string
   110  		tags    []string
   111  	)
   112  
   113  	for _, el := range c.elements {
   114  		classes = append(classes, el.Classes...)
   115  		ids = append(ids, el.IDs...)
   116  		tags = append(tags, el.Tag)
   117  	}
   118  
   119  	classes = helpers.UniqueStringsSorted(classes)
   120  	ids = helpers.UniqueStringsSorted(ids)
   121  	tags = helpers.UniqueStringsSorted(tags)
   122  
   123  	els := HTMLElements{
   124  		Classes: classes,
   125  		IDs:     ids,
   126  		Tags:    tags,
   127  	}
   128  
   129  	return els
   130  }
   131  
// htmlElementsCollectorWriter is a rune-by-rune state machine that scans the
// bytes passed to Write for HTML start tags and hands them to collector.
// The lexer state and the buffer are kept between Write calls.
type htmlElementsCollectorWriter struct {
	// Receives the parsed elements.
	collector *htmlElementsCollector

	r     rune   // Current rune
	width int    // The width in bytes of r
	input []byte // The current slice written to Write
	pos   int    // The current position in input

	// Set when parsing a collected element fails.
	err error

	// The quote rune (' or ") that is currently open inside an element,
	// or 0 when not inside a quoted value.
	inQuote rune

	// Accumulates the runes of the element or comment being scanned.
	buff bytes.Buffer

	// Current state
	state htmlCollectorStateFunc

	// Precompiled state funcs
	defaultLexElementInside htmlCollectorStateFunc
}
   152  
   153  // Write collects HTML elements from p, which must contain complete runes.
   154  func (w *htmlElementsCollectorWriter) Write(p []byte) (int, error) {
   155  	if p == nil {
   156  		return 0, nil
   157  	}
   158  
   159  	w.input = p
   160  
   161  	for {
   162  		w.r = w.next()
   163  		if w.r == eof || w.r == utf8.RuneError {
   164  			break
   165  		}
   166  		w.state = w.state(w)
   167  	}
   168  
   169  	w.pos = 0
   170  	w.input = nil
   171  
   172  	return len(p), nil
   173  }
   174  
   175  func (l *htmlElementsCollectorWriter) backup() {
   176  	l.pos -= l.width
   177  	l.r, _ = utf8.DecodeRune(l.input[l.pos:])
   178  }
   179  
   180  func (w *htmlElementsCollectorWriter) consumeBuffUntil(condition func() bool, resolve htmlCollectorStateFunc) htmlCollectorStateFunc {
   181  	var s htmlCollectorStateFunc
   182  	s = func(*htmlElementsCollectorWriter) htmlCollectorStateFunc {
   183  		w.buff.WriteRune(w.r)
   184  		if condition() {
   185  			w.buff.Reset()
   186  			return resolve
   187  		}
   188  		return s
   189  	}
   190  	return s
   191  }
   192  
   193  func (w *htmlElementsCollectorWriter) consumeRuneUntil(condition func(r rune) bool, resolve htmlCollectorStateFunc) htmlCollectorStateFunc {
   194  	var s htmlCollectorStateFunc
   195  	s = func(*htmlElementsCollectorWriter) htmlCollectorStateFunc {
   196  		if condition(w.r) {
   197  			return resolve
   198  		}
   199  		return s
   200  	}
   201  	return s
   202  }
   203  
   204  // Starts with e.g. "<body " or "<div"
   205  func (w *htmlElementsCollectorWriter) lexElementInside(resolve htmlCollectorStateFunc) htmlCollectorStateFunc {
   206  	var s htmlCollectorStateFunc
   207  	s = func(w *htmlElementsCollectorWriter) htmlCollectorStateFunc {
   208  		w.buff.WriteRune(w.r)
   209  
   210  		// Skip any text inside a quote.
   211  		if w.r == '\'' || w.r == '"' {
   212  			if w.inQuote == w.r {
   213  				w.inQuote = 0
   214  			} else if w.inQuote == 0 {
   215  				w.inQuote = w.r
   216  			}
   217  		}
   218  
   219  		if w.inQuote != 0 {
   220  			return s
   221  		}
   222  
   223  		if w.r == '>' {
   224  
   225  			// Work with the bytes slice as long as it's practical,
   226  			// to save memory allocations.
   227  			b := w.buff.Bytes()
   228  
   229  			defer func() {
   230  				w.buff.Reset()
   231  			}()
   232  
   233  			// First check if we have processed this element before.
   234  			w.collector.mu.RLock()
   235  
   236  			seen := w.collector.elementSet[string(b)]
   237  			w.collector.mu.RUnlock()
   238  			if seen {
   239  				return resolve
   240  			}
   241  
   242  			s := w.buff.String()
   243  
   244  			if s == "" {
   245  				return resolve
   246  			}
   247  
   248  			// Parse each collected element.
   249  			el, err := parseHTMLElement(s)
   250  			if err != nil {
   251  				w.err = err
   252  				return resolve
   253  			}
   254  
   255  			// Write this tag to the element set.
   256  			w.collector.mu.Lock()
   257  			w.collector.elementSet[s] = true
   258  			w.collector.elements = append(w.collector.elements, el)
   259  			w.collector.mu.Unlock()
   260  
   261  			return resolve
   262  
   263  		}
   264  
   265  		return s
   266  	}
   267  
   268  	return s
   269  }
   270  
   271  func (l *htmlElementsCollectorWriter) next() rune {
   272  	if l.pos >= len(l.input) {
   273  		l.width = 0
   274  		return eof
   275  	}
   276  
   277  	runeValue, runeWidth := utf8.DecodeRune(l.input[l.pos:])
   278  
   279  	l.width = runeWidth
   280  	l.pos += l.width
   281  	return runeValue
   282  }
   283  
// htmlCollectorStateFunc is one state of the HTML element scanner. It
// handles the writer's current rune and returns the state to use for the
// next rune.
type htmlCollectorStateFunc func(*htmlElementsCollectorWriter) htmlCollectorStateFunc
   286  
   287  // At "<", buffer empty.
   288  // Potentially starting a HTML element.
   289  func htmlLexElementStart(w *htmlElementsCollectorWriter) htmlCollectorStateFunc {
   290  	if w.r == '>' || unicode.IsSpace(w.r) {
   291  		if w.buff.Len() < 2 || bytes.HasPrefix(w.buff.Bytes(), []byte("</")) {
   292  			w.buff.Reset()
   293  			return htmlLexStart
   294  		}
   295  
   296  		tagName := w.buff.Bytes()[1:]
   297  		isSelfClosing := tagName[len(tagName)-1] == '/'
   298  
   299  		switch {
   300  		case !isSelfClosing && skipInnerElementRe.Match(tagName):
   301  			// pre, script etc. We collect classes etc. on the surrounding
   302  			// element, but skip the inner content.
   303  			w.backup()
   304  
   305  			// tagName will be overwritten, so make a copy.
   306  			tagNameCopy := make([]byte, len(tagName))
   307  			copy(tagNameCopy, tagName)
   308  
   309  			return w.lexElementInside(
   310  				w.consumeBuffUntil(
   311  					func() bool {
   312  						if w.r != '>' {
   313  							return false
   314  						}
   315  						return isClosedByTag(w.buff.Bytes(), tagNameCopy)
   316  					},
   317  					htmlLexStart,
   318  				))
   319  		case skipAllElementRe.Match(tagName):
   320  			// E.g. "<!DOCTYPE ..."
   321  			w.buff.Reset()
   322  			return w.consumeRuneUntil(func(r rune) bool {
   323  				return r == '>'
   324  			}, htmlLexStart)
   325  		default:
   326  			w.backup()
   327  			return w.defaultLexElementInside
   328  		}
   329  	}
   330  
   331  	w.buff.WriteRune(w.r)
   332  
   333  	// If it's a comment, skip to its end.
   334  	if w.r == '-' && bytes.Equal(w.buff.Bytes(), []byte("<!--")) {
   335  		w.buff.Reset()
   336  		return htmlLexToEndOfComment
   337  	}
   338  
   339  	return htmlLexElementStart
   340  }
   341  
   342  // Entry state func.
   343  // Looks for a opening bracket, '<'.
   344  func htmlLexStart(w *htmlElementsCollectorWriter) htmlCollectorStateFunc {
   345  	if w.r == '<' {
   346  		w.backup()
   347  		w.buff.Reset()
   348  		return htmlLexElementStart
   349  	}
   350  
   351  	return htmlLexStart
   352  }
   353  
   354  // After "<!--", buff empty.
   355  func htmlLexToEndOfComment(w *htmlElementsCollectorWriter) htmlCollectorStateFunc {
   356  	w.buff.WriteRune(w.r)
   357  
   358  	if w.r == '>' && bytes.HasSuffix(w.buff.Bytes(), []byte("-->")) {
   359  		// Done, start looking for HTML elements again.
   360  		return htmlLexStart
   361  	}
   362  
   363  	return htmlLexToEndOfComment
   364  }
   365  
   366  func parseHTMLElement(elStr string) (el htmlElement, err error) {
   367  
   368  	tagName := parseStartTag(elStr)
   369  
   370  	el.Tag = strings.ToLower(tagName)
   371  	tagNameToParse := el.Tag
   372  
   373  	// The net/html parser does not handle single table elements as input, e.g. tbody.
   374  	// We only care about the element/class/ids, so just store away the original tag name
   375  	// and pretend it's a <div>.
   376  	if exceptionList[el.Tag] {
   377  		elStr = strings.Replace(elStr, tagName, "div", 1)
   378  		tagNameToParse = "div"
   379  	}
   380  
   381  	n, err := html.Parse(strings.NewReader(elStr))
   382  	if err != nil {
   383  		return
   384  	}
   385  
   386  	var walk func(*html.Node)
   387  	walk = func(n *html.Node) {
   388  		if n.Type == html.ElementNode && n.Data == tagNameToParse {
   389  			for _, a := range n.Attr {
   390  				switch {
   391  				case strings.EqualFold(a.Key, "id"):
   392  					// There should be only one, but one never knows...
   393  					el.IDs = append(el.IDs, a.Val)
   394  				default:
   395  					if classAttrRe.MatchString(a.Key) {
   396  						el.Classes = append(el.Classes, strings.Fields(a.Val)...)
   397  					} else {
   398  						key := strings.ToLower(a.Key)
   399  						val := strings.TrimSpace(a.Val)
   400  						if strings.Contains(key, "class") && strings.HasPrefix(val, "{") {
   401  							// This looks like a Vue or AlpineJS class binding.
   402  							val = htmlJsonFixer.Replace(strings.Trim(val, "{}"))
   403  							lines := strings.Split(val, "\n")
   404  							for i, l := range lines {
   405  								lines[i] = strings.TrimSpace(l)
   406  							}
   407  							val = strings.Join(lines, "\n")
   408  							val = jsonAttrRe.ReplaceAllString(val, "$1")
   409  							el.Classes = append(el.Classes, strings.Fields(val)...)
   410  						}
   411  					}
   412  				}
   413  			}
   414  		}
   415  
   416  		for c := n.FirstChild; c != nil; c = c.NextSibling {
   417  			walk(c)
   418  		}
   419  	}
   420  
   421  	walk(n)
   422  
   423  	return
   424  }
   425  
   426  // Variants of s
   427  //
   428  //	<body class="b a">
   429  //	<div>
   430  func parseStartTag(s string) string {
   431  	spaceIndex := strings.IndexFunc(s, func(r rune) bool {
   432  		return unicode.IsSpace(r)
   433  	})
   434  
   435  	if spaceIndex == -1 {
   436  		s = s[1 : len(s)-1]
   437  	} else {
   438  		s = s[1:spaceIndex]
   439  	}
   440  
   441  	if s[len(s)-1] == '/' {
   442  		// Self closing.
   443  		s = s[:len(s)-1]
   444  	}
   445  
   446  	return s
   447  
   448  }
   449  
   450  // isClosedByTag reports whether b ends with a closing tag for tagName.
   451  func isClosedByTag(b, tagName []byte) bool {
   452  	if len(b) == 0 {
   453  		return false
   454  	}
   455  
   456  	if b[len(b)-1] != '>' {
   457  		return false
   458  	}
   459  
   460  	var (
   461  		lo int
   462  		hi int
   463  
   464  		state  int
   465  		inWord bool
   466  	)
   467  
   468  LOOP:
   469  	for i := len(b) - 2; i >= 0; i-- {
   470  		switch {
   471  		case b[i] == '<':
   472  			if state != 1 {
   473  				return false
   474  			}
   475  			state = 2
   476  			break LOOP
   477  		case b[i] == '/':
   478  			if state != 0 {
   479  				return false
   480  			}
   481  			state++
   482  			if inWord {
   483  				lo = i + 1
   484  				inWord = false
   485  			}
   486  		case isSpace(b[i]):
   487  			if inWord {
   488  				lo = i + 1
   489  				inWord = false
   490  			}
   491  		default:
   492  			if !inWord {
   493  				hi = i + 1
   494  				inWord = true
   495  			}
   496  		}
   497  	}
   498  
   499  	if state != 2 || lo >= hi {
   500  		return false
   501  	}
   502  
   503  	return bytes.EqualFold(tagName, b[lo:hi])
   504  
   505  }
   506  
   507  func isSpace(b byte) bool {
   508  	return b == ' ' || b == '\t' || b == '\n'
   509  }