github.com/gohugoio/hugo@v0.88.1/publisher/htmlElementsCollector.go (about)

     1  // Copyright 2020 The Hugo Authors. All rights reserved.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  // http://www.apache.org/licenses/LICENSE-2.0
     7  //
     8  // Unless required by applicable law or agreed to in writing, software
     9  // distributed under the License is distributed on an "AS IS" BASIS,
    10  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    11  // See the License for the specific language governing permissions and
    12  // limitations under the License.
    13  
    14  package publisher
    15  
    16  import (
    17  	"bytes"
    18  	"regexp"
    19  	"sort"
    20  	"strings"
    21  	"sync"
    22  	"unicode"
    23  	"unicode/utf8"
    24  
    25  	"golang.org/x/net/html"
    26  
    27  	"github.com/gohugoio/hugo/helpers"
    28  )
    29  
// eof is the sentinel value returned by next once the current input slice
// has been fully consumed.
const eof = -1
    31  
var (
	// htmlJsonFixer rewrites ", " separators as newlines. parseHTMLElement
	// applies it to class-binding values so each entry ends up on its own line.
	htmlJsonFixer = strings.NewReplacer(", ", "\n")
	// jsonAttrRe captures the (optionally single-quoted) text in front of the
	// first ':' on a line; parseHTMLElement replaces the whole match with it.
	jsonAttrRe    = regexp.MustCompile(`'?(.*?)'?:.*`)
	// classAttrRe matches, case-insensitively, an attribute key that is
	// exactly "class" or that contains "transition".
	classAttrRe   = regexp.MustCompile(`(?i)^class$|transition`)

	// skipInnerElementRe matches tag names that start with pre, textarea,
	// script or style (prefix match only, case-insensitive).
	skipInnerElementRe = regexp.MustCompile(`(?i)^(pre|textarea|script|style)`)
	// skipAllElementRe matches tag names that start with "!DOCTYPE".
	skipAllElementRe   = regexp.MustCompile(`(?i)^!DOCTYPE`)
	// endTagRe matches a closing tag at the very end of the input and
	// captures its (ASCII letters only) name.
	endTagRe           = regexp.MustCompile(`(?i)<\/\s*([a-zA-Z]+)\s*>$`)

	// exceptionList holds the lower-case tag names that parseHTMLElement
	// swaps for "div" before handing the element to the HTML parser.
	exceptionList = map[string]bool{
		"thead": true,
		"tbody": true,
		"tfoot": true,
		"td":    true,
		"tr":    true,
	}
)
    49  
    50  func newHTMLElementsCollector() *htmlElementsCollector {
    51  	return &htmlElementsCollector{
    52  		elementSet: make(map[string]bool),
    53  	}
    54  }
    55  
    56  func newHTMLElementsCollectorWriter(collector *htmlElementsCollector) *htmlElementsCollectorWriter {
    57  	w := &htmlElementsCollectorWriter{
    58  		collector: collector,
    59  		state:     htmlLexStart,
    60  	}
    61  
    62  	w.defaultLexElementInside = w.lexElementInside(htmlLexStart)
    63  
    64  	return w
    65  }
    66  
// HTMLElements holds lists of tags and attribute values for classes and id.
// The struct tags define the JSON field names used when it is marshalled.
type HTMLElements struct {
	Tags    []string `json:"tags"`
	Classes []string `json:"classes"`
	IDs     []string `json:"ids"`
}
    73  
    74  func (h *HTMLElements) Merge(other HTMLElements) {
    75  	h.Tags = append(h.Tags, other.Tags...)
    76  	h.Classes = append(h.Classes, other.Classes...)
    77  	h.IDs = append(h.IDs, other.IDs...)
    78  
    79  	h.Tags = helpers.UniqueStringsReuse(h.Tags)
    80  	h.Classes = helpers.UniqueStringsReuse(h.Classes)
    81  	h.IDs = helpers.UniqueStringsReuse(h.IDs)
    82  }
    83  
    84  func (h *HTMLElements) Sort() {
    85  	sort.Strings(h.Tags)
    86  	sort.Strings(h.Classes)
    87  	sort.Strings(h.IDs)
    88  }
    89  
// htmlElement is the result of parsing a single start tag: its lower-cased
// tag name plus the class names and id values found on it.
type htmlElement struct {
	Tag     string
	Classes []string
	IDs     []string
}
    95  
// htmlElementsCollector accumulates the elements reported by its writers.
type htmlElementsCollector struct {
	// Contains the raw HTML string. We will get the same element
	// several times, and want to avoid costly reparsing when this
	// is used for aggregated data only.
	elementSet map[string]bool

	// elements holds one parsed entry per distinct raw start tag.
	elements []htmlElement

	// mu is held by the writers while they read elementSet and while they
	// add to elementSet and elements.
	mu sync.RWMutex
}
   106  
   107  func (c *htmlElementsCollector) getHTMLElements() HTMLElements {
   108  	var (
   109  		classes []string
   110  		ids     []string
   111  		tags    []string
   112  	)
   113  
   114  	for _, el := range c.elements {
   115  		classes = append(classes, el.Classes...)
   116  		ids = append(ids, el.IDs...)
   117  		tags = append(tags, el.Tag)
   118  	}
   119  
   120  	classes = helpers.UniqueStringsSorted(classes)
   121  	ids = helpers.UniqueStringsSorted(ids)
   122  	tags = helpers.UniqueStringsSorted(tags)
   123  
   124  	els := HTMLElements{
   125  		Classes: classes,
   126  		IDs:     ids,
   127  		Tags:    tags,
   128  	}
   129  
   130  	return els
   131  }
   132  
// htmlElementsCollectorWriter scans the bytes passed to Write rune by rune
// with a small state machine and reports the start tags it finds to collector.
type htmlElementsCollectorWriter struct {
	// collector receives every newly parsed element.
	collector *htmlElementsCollector

	r     rune   // Current rune
	width int    // The width in bytes of r
	input []byte // The current slice written to Write
	pos   int    // The current position in input

	// err is set by lexElementInside when parsing an element fails.
	err error

	// inQuote is the quote rune (' or ") that is currently open while lexing
	// inside an element, or 0 when not inside a quoted value.
	inQuote rune

	// buff accumulates the runes of the element (or skipped content)
	// currently being scanned.
	buff bytes.Buffer

	// Current state
	state htmlCollectorStateFunc

	// Precompiled state funcs
	defaultLexElementInside htmlCollectorStateFunc
}
   153  
   154  // Write collects HTML elements from p, which must contain complete runes.
   155  func (w *htmlElementsCollectorWriter) Write(p []byte) (int, error) {
   156  	if p == nil {
   157  		return 0, nil
   158  	}
   159  
   160  	w.input = p
   161  
   162  	for {
   163  		w.r = w.next()
   164  		if w.r == eof || w.r == utf8.RuneError {
   165  			break
   166  		}
   167  		w.state = w.state(w)
   168  	}
   169  
   170  	w.pos = 0
   171  	w.input = nil
   172  
   173  	return len(p), nil
   174  }
   175  
   176  func (l *htmlElementsCollectorWriter) backup() {
   177  	l.pos -= l.width
   178  	l.r, _ = utf8.DecodeRune(l.input[l.pos:])
   179  }
   180  
   181  func (w *htmlElementsCollectorWriter) consumeBuffUntil(condition func() bool, resolve htmlCollectorStateFunc) htmlCollectorStateFunc {
   182  	var s htmlCollectorStateFunc
   183  	s = func(*htmlElementsCollectorWriter) htmlCollectorStateFunc {
   184  		w.buff.WriteRune(w.r)
   185  		if condition() {
   186  			w.buff.Reset()
   187  			return resolve
   188  		}
   189  		return s
   190  	}
   191  	return s
   192  }
   193  
   194  func (w *htmlElementsCollectorWriter) consumeRuneUntil(condition func(r rune) bool, resolve htmlCollectorStateFunc) htmlCollectorStateFunc {
   195  	var s htmlCollectorStateFunc
   196  	s = func(*htmlElementsCollectorWriter) htmlCollectorStateFunc {
   197  		if condition(w.r) {
   198  			return resolve
   199  		}
   200  		return s
   201  	}
   202  	return s
   203  }
   204  
   205  // Starts with e.g. "<body " or "<div"
   206  func (w *htmlElementsCollectorWriter) lexElementInside(resolve htmlCollectorStateFunc) htmlCollectorStateFunc {
   207  	var s htmlCollectorStateFunc
   208  	s = func(w *htmlElementsCollectorWriter) htmlCollectorStateFunc {
   209  		w.buff.WriteRune(w.r)
   210  
   211  		// Skip any text inside a quote.
   212  		if w.r == '\'' || w.r == '"' {
   213  			if w.inQuote == w.r {
   214  				w.inQuote = 0
   215  			} else if w.inQuote == 0 {
   216  				w.inQuote = w.r
   217  			}
   218  		}
   219  
   220  		if w.inQuote != 0 {
   221  			return s
   222  		}
   223  
   224  		if w.r == '>' {
   225  
   226  			// Work with the bytes slice as long as it's practical,
   227  			// to save memory allocations.
   228  			b := w.buff.Bytes()
   229  
   230  			defer func() {
   231  				w.buff.Reset()
   232  			}()
   233  
   234  			// First check if we have processed this element before.
   235  			w.collector.mu.RLock()
   236  
   237  			seen := w.collector.elementSet[string(b)]
   238  			w.collector.mu.RUnlock()
   239  			if seen {
   240  				return resolve
   241  			}
   242  
   243  			s := w.buff.String()
   244  
   245  			if s == "" {
   246  				return resolve
   247  			}
   248  
   249  			// Parse each collected element.
   250  			el, err := parseHTMLElement(s)
   251  			if err != nil {
   252  				w.err = err
   253  				return resolve
   254  			}
   255  
   256  			// Write this tag to the element set.
   257  			w.collector.mu.Lock()
   258  			w.collector.elementSet[s] = true
   259  			w.collector.elements = append(w.collector.elements, el)
   260  			w.collector.mu.Unlock()
   261  
   262  			return resolve
   263  
   264  		}
   265  
   266  		return s
   267  	}
   268  
   269  	return s
   270  }
   271  
   272  func (l *htmlElementsCollectorWriter) next() rune {
   273  	if l.pos >= len(l.input) {
   274  		l.width = 0
   275  		return eof
   276  	}
   277  
   278  	runeValue, runeWidth := utf8.DecodeRune(l.input[l.pos:])
   279  
   280  	l.width = runeWidth
   281  	l.pos += l.width
   282  	return runeValue
   283  }
   284  
// htmlCollectorStateFunc is one state of the HTML element scanner. It is
// called with the writer positioned on the current rune and returns the
// state to use for the next rune.
type htmlCollectorStateFunc func(*htmlElementsCollectorWriter) htmlCollectorStateFunc
   287  
   288  // At "<", buffer empty.
   289  // Potentially starting a HTML element.
   290  func htmlLexElementStart(w *htmlElementsCollectorWriter) htmlCollectorStateFunc {
   291  	if w.r == '>' || unicode.IsSpace(w.r) {
   292  		if w.buff.Len() < 2 || bytes.HasPrefix(w.buff.Bytes(), []byte("</")) {
   293  			w.buff.Reset()
   294  			return htmlLexStart
   295  		}
   296  
   297  		tagName := w.buff.Bytes()[1:]
   298  
   299  		switch {
   300  		case skipInnerElementRe.Match(tagName):
   301  			// pre, script etc. We collect classes etc. on the surrounding
   302  			// element, but skip the inner content.
   303  			w.backup()
   304  
   305  			// tagName will be overwritten, so make a copy.
   306  			tagNameCopy := make([]byte, len(tagName))
   307  			copy(tagNameCopy, tagName)
   308  
   309  			return w.lexElementInside(
   310  				w.consumeBuffUntil(
   311  					func() bool {
   312  						if w.r != '>' {
   313  							return false
   314  						}
   315  						m := endTagRe.FindSubmatch(w.buff.Bytes())
   316  						if m == nil {
   317  							return false
   318  						}
   319  						return bytes.EqualFold(m[1], tagNameCopy)
   320  					},
   321  					htmlLexStart,
   322  				))
   323  		case skipAllElementRe.Match(tagName):
   324  			// E.g. "<!DOCTYPE ..."
   325  			w.buff.Reset()
   326  			return w.consumeRuneUntil(func(r rune) bool {
   327  				return r == '>'
   328  			}, htmlLexStart)
   329  		default:
   330  			w.backup()
   331  			return w.defaultLexElementInside
   332  		}
   333  	}
   334  
   335  	w.buff.WriteRune(w.r)
   336  
   337  	// If it's a comment, skip to its end.
   338  	if w.r == '-' && bytes.Equal(w.buff.Bytes(), []byte("<!--")) {
   339  		w.buff.Reset()
   340  		return htmlLexToEndOfComment
   341  	}
   342  
   343  	return htmlLexElementStart
   344  }
   345  
   346  // Entry state func.
   347  // Looks for a opening bracket, '<'.
   348  func htmlLexStart(w *htmlElementsCollectorWriter) htmlCollectorStateFunc {
   349  	if w.r == '<' {
   350  		w.backup()
   351  		w.buff.Reset()
   352  		return htmlLexElementStart
   353  	}
   354  
   355  	return htmlLexStart
   356  }
   357  
   358  // After "<!--", buff empty.
   359  func htmlLexToEndOfComment(w *htmlElementsCollectorWriter) htmlCollectorStateFunc {
   360  	w.buff.WriteRune(w.r)
   361  
   362  	if w.r == '>' && bytes.HasSuffix(w.buff.Bytes(), []byte("-->")) {
   363  		// Done, start looking for HTML elements again.
   364  		return htmlLexStart
   365  	}
   366  
   367  	return htmlLexToEndOfComment
   368  }
   369  
   370  func parseHTMLElement(elStr string) (el htmlElement, err error) {
   371  
   372  	tagName := parseStartTag(elStr)
   373  
   374  	el.Tag = strings.ToLower(tagName)
   375  	tagNameToParse := el.Tag
   376  
   377  	// The net/html parser does not handle single table elements as input, e.g. tbody.
   378  	// We only care about the element/class/ids, so just store away the original tag name
   379  	// and pretend it's a <div>.
   380  	if exceptionList[el.Tag] {
   381  		elStr = strings.Replace(elStr, tagName, "div", 1)
   382  		tagNameToParse = "div"
   383  	}
   384  
   385  	n, err := html.Parse(strings.NewReader(elStr))
   386  	if err != nil {
   387  		return
   388  	}
   389  
   390  	var walk func(*html.Node)
   391  	walk = func(n *html.Node) {
   392  		if n.Type == html.ElementNode && n.Data == tagNameToParse {
   393  			for _, a := range n.Attr {
   394  				switch {
   395  				case strings.EqualFold(a.Key, "id"):
   396  					// There should be only one, but one never knows...
   397  					el.IDs = append(el.IDs, a.Val)
   398  				default:
   399  					if classAttrRe.MatchString(a.Key) {
   400  						el.Classes = append(el.Classes, strings.Fields(a.Val)...)
   401  					} else {
   402  						key := strings.ToLower(a.Key)
   403  						val := strings.TrimSpace(a.Val)
   404  						if strings.Contains(key, "class") && strings.HasPrefix(val, "{") {
   405  							// This looks like a Vue or AlpineJS class binding.
   406  							val = htmlJsonFixer.Replace(strings.Trim(val, "{}"))
   407  							lines := strings.Split(val, "\n")
   408  							for i, l := range lines {
   409  								lines[i] = strings.TrimSpace(l)
   410  							}
   411  							val = strings.Join(lines, "\n")
   412  							val = jsonAttrRe.ReplaceAllString(val, "$1")
   413  							el.Classes = append(el.Classes, strings.Fields(val)...)
   414  						}
   415  					}
   416  				}
   417  			}
   418  		}
   419  
   420  		for c := n.FirstChild; c != nil; c = c.NextSibling {
   421  			walk(c)
   422  		}
   423  	}
   424  
   425  	walk(n)
   426  
   427  	return
   428  }
   429  
   430  // Variants of s
   431  //    <body class="b a">
   432  //    <div>
   433  func parseStartTag(s string) string {
   434  	spaceIndex := strings.IndexFunc(s, func(r rune) bool {
   435  		return unicode.IsSpace(r)
   436  	})
   437  
   438  	if spaceIndex == -1 {
   439  		return s[1 : len(s)-1]
   440  	}
   441  
   442  	return s[1:spaceIndex]
   443  }