github.com/linchen2chris/hugo@v0.0.0-20230307053224-cec209389705/publisher/htmlElementsCollector.go (about) 1 // Copyright 2020 The Hugo Authors. All rights reserved. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // http://www.apache.org/licenses/LICENSE-2.0 7 // 8 // Unless required by applicable law or agreed to in writing, software 9 // distributed under the License is distributed on an "AS IS" BASIS, 10 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 // See the License for the specific language governing permissions and 12 // limitations under the License. 13 14 package publisher 15 16 import ( 17 "bytes" 18 "regexp" 19 "sort" 20 "strings" 21 "sync" 22 "unicode" 23 "unicode/utf8" 24 25 "golang.org/x/net/html" 26 27 "github.com/gohugoio/hugo/helpers" 28 ) 29 30 const eof = -1 31 32 var ( 33 htmlJsonFixer = strings.NewReplacer(", ", "\n") 34 jsonAttrRe = regexp.MustCompile(`'?(.*?)'?:.*`) 35 classAttrRe = regexp.MustCompile(`(?i)^class$|transition`) 36 37 skipInnerElementRe = regexp.MustCompile(`(?i)^(pre|textarea|script|style)`) 38 skipAllElementRe = regexp.MustCompile(`(?i)^!DOCTYPE`) 39 40 exceptionList = map[string]bool{ 41 "thead": true, 42 "tbody": true, 43 "tfoot": true, 44 "td": true, 45 "tr": true, 46 } 47 ) 48 49 func newHTMLElementsCollector() *htmlElementsCollector { 50 return &htmlElementsCollector{ 51 elementSet: make(map[string]bool), 52 } 53 } 54 55 func newHTMLElementsCollectorWriter(collector *htmlElementsCollector) *htmlElementsCollectorWriter { 56 w := &htmlElementsCollectorWriter{ 57 collector: collector, 58 state: htmlLexStart, 59 } 60 61 w.defaultLexElementInside = w.lexElementInside(htmlLexStart) 62 63 return w 64 } 65 66 // HTMLElements holds lists of tags and attribute values for classes and id. 67 type HTMLElements struct { 68 Tags []string `json:"tags"` 69 Classes []string `json:"classes"` 70 IDs []string `json:"ids"` 71 } 72 73 func (h *HTMLElements) Merge(other HTMLElements) { 74 h.Tags = append(h.Tags, other.Tags...) 75 h.Classes = append(h.Classes, other.Classes...) 76 h.IDs = append(h.IDs, other.IDs...) 77 78 h.Tags = helpers.UniqueStringsReuse(h.Tags) 79 h.Classes = helpers.UniqueStringsReuse(h.Classes) 80 h.IDs = helpers.UniqueStringsReuse(h.IDs) 81 } 82 83 func (h *HTMLElements) Sort() { 84 sort.Strings(h.Tags) 85 sort.Strings(h.Classes) 86 sort.Strings(h.IDs) 87 } 88 89 type htmlElement struct { 90 Tag string 91 Classes []string 92 IDs []string 93 } 94 95 type htmlElementsCollector struct { 96 // Contains the raw HTML string. We will get the same element 97 // several times, and want to avoid costly reparsing when this 98 // is used for aggregated data only. 99 elementSet map[string]bool 100 101 elements []htmlElement 102 103 mu sync.RWMutex 104 } 105 106 func (c *htmlElementsCollector) getHTMLElements() HTMLElements { 107 var ( 108 classes []string 109 ids []string 110 tags []string 111 ) 112 113 for _, el := range c.elements { 114 classes = append(classes, el.Classes...) 115 ids = append(ids, el.IDs...) 116 tags = append(tags, el.Tag) 117 } 118 119 classes = helpers.UniqueStringsSorted(classes) 120 ids = helpers.UniqueStringsSorted(ids) 121 tags = helpers.UniqueStringsSorted(tags) 122 123 els := HTMLElements{ 124 Classes: classes, 125 IDs: ids, 126 Tags: tags, 127 } 128 129 return els 130 } 131 132 type htmlElementsCollectorWriter struct { 133 collector *htmlElementsCollector 134 135 r rune // Current rune 136 width int // The width in bytes of r 137 input []byte // The current slice written to Write 138 pos int // The current position in input 139 140 err error 141 142 inQuote rune 143 144 buff bytes.Buffer 145 146 // Current state 147 state htmlCollectorStateFunc 148 149 // Precompiled state funcs 150 defaultLexElementInside htmlCollectorStateFunc 151 } 152 153 // Write collects HTML elements from p, which must contain complete runes. 154 func (w *htmlElementsCollectorWriter) Write(p []byte) (int, error) { 155 if p == nil { 156 return 0, nil 157 } 158 159 w.input = p 160 161 for { 162 w.r = w.next() 163 if w.r == eof || w.r == utf8.RuneError { 164 break 165 } 166 w.state = w.state(w) 167 } 168 169 w.pos = 0 170 w.input = nil 171 172 return len(p), nil 173 } 174 175 func (l *htmlElementsCollectorWriter) backup() { 176 l.pos -= l.width 177 l.r, _ = utf8.DecodeRune(l.input[l.pos:]) 178 } 179 180 func (w *htmlElementsCollectorWriter) consumeBuffUntil(condition func() bool, resolve htmlCollectorStateFunc) htmlCollectorStateFunc { 181 var s htmlCollectorStateFunc 182 s = func(*htmlElementsCollectorWriter) htmlCollectorStateFunc { 183 w.buff.WriteRune(w.r) 184 if condition() { 185 w.buff.Reset() 186 return resolve 187 } 188 return s 189 } 190 return s 191 } 192 193 func (w *htmlElementsCollectorWriter) consumeRuneUntil(condition func(r rune) bool, resolve htmlCollectorStateFunc) htmlCollectorStateFunc { 194 var s htmlCollectorStateFunc 195 s = func(*htmlElementsCollectorWriter) htmlCollectorStateFunc { 196 if condition(w.r) { 197 return resolve 198 } 199 return s 200 } 201 return s 202 } 203 204 // Starts with e.g. "<body " or "<div" 205 func (w *htmlElementsCollectorWriter) lexElementInside(resolve htmlCollectorStateFunc) htmlCollectorStateFunc { 206 var s htmlCollectorStateFunc 207 s = func(w *htmlElementsCollectorWriter) htmlCollectorStateFunc { 208 w.buff.WriteRune(w.r) 209 210 // Skip any text inside a quote. 211 if w.r == '\'' || w.r == '"' { 212 if w.inQuote == w.r { 213 w.inQuote = 0 214 } else if w.inQuote == 0 { 215 w.inQuote = w.r 216 } 217 } 218 219 if w.inQuote != 0 { 220 return s 221 } 222 223 if w.r == '>' { 224 225 // Work with the bytes slice as long as it's practical, 226 // to save memory allocations. 227 b := w.buff.Bytes() 228 229 defer func() { 230 w.buff.Reset() 231 }() 232 233 // First check if we have processed this element before. 234 w.collector.mu.RLock() 235 236 seen := w.collector.elementSet[string(b)] 237 w.collector.mu.RUnlock() 238 if seen { 239 return resolve 240 } 241 242 s := w.buff.String() 243 244 if s == "" { 245 return resolve 246 } 247 248 // Parse each collected element. 249 el, err := parseHTMLElement(s) 250 if err != nil { 251 w.err = err 252 return resolve 253 } 254 255 // Write this tag to the element set. 256 w.collector.mu.Lock() 257 w.collector.elementSet[s] = true 258 w.collector.elements = append(w.collector.elements, el) 259 w.collector.mu.Unlock() 260 261 return resolve 262 263 } 264 265 return s 266 } 267 268 return s 269 } 270 271 func (l *htmlElementsCollectorWriter) next() rune { 272 if l.pos >= len(l.input) { 273 l.width = 0 274 return eof 275 } 276 277 runeValue, runeWidth := utf8.DecodeRune(l.input[l.pos:]) 278 279 l.width = runeWidth 280 l.pos += l.width 281 return runeValue 282 } 283 284 // returns the next state in HTML element scanner. 285 type htmlCollectorStateFunc func(*htmlElementsCollectorWriter) htmlCollectorStateFunc 286 287 // At "<", buffer empty. 288 // Potentially starting a HTML element. 289 func htmlLexElementStart(w *htmlElementsCollectorWriter) htmlCollectorStateFunc { 290 if w.r == '>' || unicode.IsSpace(w.r) { 291 if w.buff.Len() < 2 || bytes.HasPrefix(w.buff.Bytes(), []byte("</")) { 292 w.buff.Reset() 293 return htmlLexStart 294 } 295 296 tagName := w.buff.Bytes()[1:] 297 isSelfClosing := tagName[len(tagName)-1] == '/' 298 299 switch { 300 case !isSelfClosing && skipInnerElementRe.Match(tagName): 301 // pre, script etc. We collect classes etc. on the surrounding 302 // element, but skip the inner content. 303 w.backup() 304 305 // tagName will be overwritten, so make a copy. 306 tagNameCopy := make([]byte, len(tagName)) 307 copy(tagNameCopy, tagName) 308 309 return w.lexElementInside( 310 w.consumeBuffUntil( 311 func() bool { 312 if w.r != '>' { 313 return false 314 } 315 return isClosedByTag(w.buff.Bytes(), tagNameCopy) 316 }, 317 htmlLexStart, 318 )) 319 case skipAllElementRe.Match(tagName): 320 // E.g. "<!DOCTYPE ..." 321 w.buff.Reset() 322 return w.consumeRuneUntil(func(r rune) bool { 323 return r == '>' 324 }, htmlLexStart) 325 default: 326 w.backup() 327 return w.defaultLexElementInside 328 } 329 } 330 331 w.buff.WriteRune(w.r) 332 333 // If it's a comment, skip to its end. 334 if w.r == '-' && bytes.Equal(w.buff.Bytes(), []byte("<!--")) { 335 w.buff.Reset() 336 return htmlLexToEndOfComment 337 } 338 339 return htmlLexElementStart 340 } 341 342 // Entry state func. 343 // Looks for a opening bracket, '<'. 344 func htmlLexStart(w *htmlElementsCollectorWriter) htmlCollectorStateFunc { 345 if w.r == '<' { 346 w.backup() 347 w.buff.Reset() 348 return htmlLexElementStart 349 } 350 351 return htmlLexStart 352 } 353 354 // After "<!--", buff empty. 355 func htmlLexToEndOfComment(w *htmlElementsCollectorWriter) htmlCollectorStateFunc { 356 w.buff.WriteRune(w.r) 357 358 if w.r == '>' && bytes.HasSuffix(w.buff.Bytes(), []byte("-->")) { 359 // Done, start looking for HTML elements again. 360 return htmlLexStart 361 } 362 363 return htmlLexToEndOfComment 364 } 365 366 func parseHTMLElement(elStr string) (el htmlElement, err error) { 367 368 tagName := parseStartTag(elStr) 369 370 el.Tag = strings.ToLower(tagName) 371 tagNameToParse := el.Tag 372 373 // The net/html parser does not handle single table elements as input, e.g. tbody. 374 // We only care about the element/class/ids, so just store away the original tag name 375 // and pretend it's a <div>. 376 if exceptionList[el.Tag] { 377 elStr = strings.Replace(elStr, tagName, "div", 1) 378 tagNameToParse = "div" 379 } 380 381 n, err := html.Parse(strings.NewReader(elStr)) 382 if err != nil { 383 return 384 } 385 386 var walk func(*html.Node) 387 walk = func(n *html.Node) { 388 if n.Type == html.ElementNode && n.Data == tagNameToParse { 389 for _, a := range n.Attr { 390 switch { 391 case strings.EqualFold(a.Key, "id"): 392 // There should be only one, but one never knows... 393 el.IDs = append(el.IDs, a.Val) 394 default: 395 if classAttrRe.MatchString(a.Key) { 396 el.Classes = append(el.Classes, strings.Fields(a.Val)...) 397 } else { 398 key := strings.ToLower(a.Key) 399 val := strings.TrimSpace(a.Val) 400 if strings.Contains(key, "class") && strings.HasPrefix(val, "{") { 401 // This looks like a Vue or AlpineJS class binding. 402 val = htmlJsonFixer.Replace(strings.Trim(val, "{}")) 403 lines := strings.Split(val, "\n") 404 for i, l := range lines { 405 lines[i] = strings.TrimSpace(l) 406 } 407 val = strings.Join(lines, "\n") 408 val = jsonAttrRe.ReplaceAllString(val, "$1") 409 el.Classes = append(el.Classes, strings.Fields(val)...) 410 } 411 } 412 } 413 } 414 } 415 416 for c := n.FirstChild; c != nil; c = c.NextSibling { 417 walk(c) 418 } 419 } 420 421 walk(n) 422 423 return 424 } 425 426 // Variants of s 427 // 428 // <body class="b a"> 429 // <div> 430 func parseStartTag(s string) string { 431 spaceIndex := strings.IndexFunc(s, func(r rune) bool { 432 return unicode.IsSpace(r) 433 }) 434 435 if spaceIndex == -1 { 436 s = s[1 : len(s)-1] 437 } else { 438 s = s[1:spaceIndex] 439 } 440 441 if s[len(s)-1] == '/' { 442 // Self closing. 443 s = s[:len(s)-1] 444 } 445 446 return s 447 448 } 449 450 // isClosedByTag reports whether b ends with a closing tag for tagName. 451 func isClosedByTag(b, tagName []byte) bool { 452 if len(b) == 0 { 453 return false 454 } 455 456 if b[len(b)-1] != '>' { 457 return false 458 } 459 460 var ( 461 lo int 462 hi int 463 464 state int 465 inWord bool 466 ) 467 468 LOOP: 469 for i := len(b) - 2; i >= 0; i-- { 470 switch { 471 case b[i] == '<': 472 if state != 1 { 473 return false 474 } 475 state = 2 476 break LOOP 477 case b[i] == '/': 478 if state != 0 { 479 return false 480 } 481 state++ 482 if inWord { 483 lo = i + 1 484 inWord = false 485 } 486 case isSpace(b[i]): 487 if inWord { 488 lo = i + 1 489 inWord = false 490 } 491 default: 492 if !inWord { 493 hi = i + 1 494 inWord = true 495 } 496 } 497 } 498 499 if state != 2 || lo >= hi { 500 return false 501 } 502 503 return bytes.EqualFold(tagName, b[lo:hi]) 504 505 } 506 507 func isSpace(b byte) bool { 508 return b == ' ' || b == '\t' || b == '\n' 509 }