github.com/kovansky/hugo@v0.92.3-0.20220224232819-63076e4ff19f/helpers/content.go (about) 1 // Copyright 2019 The Hugo Authors. All rights reserved. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // http://www.apache.org/licenses/LICENSE-2.0 7 // 8 // Unless required by applicable law or agreed to in writing, software 9 // distributed under the License is distributed on an "AS IS" BASIS, 10 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 // See the License for the specific language governing permissions and 12 // limitations under the License. 13 14 // Package helpers implements general utility functions that work with 15 // and on content. The helper functions defined here lay down the 16 // foundation of how Hugo works with files and filepaths, and perform 17 // string operations on content. 18 package helpers 19 20 import ( 21 "bytes" 22 "html/template" 23 "strings" 24 "unicode" 25 "unicode/utf8" 26 27 "github.com/gohugoio/hugo/common/hexec" 28 "github.com/gohugoio/hugo/common/loggers" 29 30 "github.com/spf13/afero" 31 32 "github.com/gohugoio/hugo/markup/converter" 33 "github.com/gohugoio/hugo/markup/converter/hooks" 34 35 "github.com/gohugoio/hugo/markup" 36 37 bp "github.com/gohugoio/hugo/bufferpool" 38 "github.com/gohugoio/hugo/config" 39 ) 40 41 var ( 42 openingPTag = []byte("<p>") 43 closingPTag = []byte("</p>") 44 paragraphIndicator = []byte("<p") 45 closingIndicator = []byte("</") 46 ) 47 48 // ContentSpec provides functionality to render markdown content. 49 type ContentSpec struct { 50 Converters markup.ConverterProvider 51 anchorNameSanitizer converter.AnchorNameSanitizer 52 getRenderer func(t hooks.RendererType, id interface{}) interface{} 53 54 // SummaryLength is the length of the summary that Hugo extracts from a content. 55 summaryLength int 56 57 BuildFuture bool 58 BuildExpired bool 59 BuildDrafts bool 60 61 Cfg config.Provider 62 } 63 64 // NewContentSpec returns a ContentSpec initialized 65 // with the appropriate fields from the given config.Provider. 66 func NewContentSpec(cfg config.Provider, logger loggers.Logger, contentFs afero.Fs, ex *hexec.Exec) (*ContentSpec, error) { 67 spec := &ContentSpec{ 68 summaryLength: cfg.GetInt("summaryLength"), 69 BuildFuture: cfg.GetBool("buildFuture"), 70 BuildExpired: cfg.GetBool("buildExpired"), 71 BuildDrafts: cfg.GetBool("buildDrafts"), 72 73 Cfg: cfg, 74 } 75 76 converterProvider, err := markup.NewConverterProvider(converter.ProviderConfig{ 77 Cfg: cfg, 78 ContentFs: contentFs, 79 Logger: logger, 80 Exec: ex, 81 }) 82 if err != nil { 83 return nil, err 84 } 85 86 spec.Converters = converterProvider 87 p := converterProvider.Get("markdown") 88 conv, err := p.New(converter.DocumentContext{}) 89 if err != nil { 90 return nil, err 91 } 92 if as, ok := conv.(converter.AnchorNameSanitizer); ok { 93 spec.anchorNameSanitizer = as 94 } else { 95 // Use Goldmark's sanitizer 96 p := converterProvider.Get("goldmark") 97 conv, err := p.New(converter.DocumentContext{}) 98 if err != nil { 99 return nil, err 100 } 101 spec.anchorNameSanitizer = conv.(converter.AnchorNameSanitizer) 102 } 103 104 return spec, nil 105 } 106 107 var stripHTMLReplacer = strings.NewReplacer("\n", " ", "</p>", "\n", "<br>", "\n", "<br />", "\n") 108 109 // StripHTML accepts a string, strips out all HTML tags and returns it. 110 func StripHTML(s string) string { 111 // Shortcut strings with no tags in them 112 if !strings.ContainsAny(s, "<>") { 113 return s 114 } 115 s = stripHTMLReplacer.Replace(s) 116 117 // Walk through the string removing all tags 118 b := bp.GetBuffer() 119 defer bp.PutBuffer(b) 120 var inTag, isSpace, wasSpace bool 121 for _, r := range s { 122 if !inTag { 123 isSpace = false 124 } 125 126 switch { 127 case r == '<': 128 inTag = true 129 case r == '>': 130 inTag = false 131 case unicode.IsSpace(r): 132 isSpace = true 133 fallthrough 134 default: 135 if !inTag && (!isSpace || (isSpace && !wasSpace)) { 136 b.WriteRune(r) 137 } 138 } 139 140 wasSpace = isSpace 141 142 } 143 return b.String() 144 } 145 146 // stripEmptyNav strips out empty <nav> tags from content. 147 func stripEmptyNav(in []byte) []byte { 148 return bytes.Replace(in, []byte("<nav>\n</nav>\n\n"), []byte(``), -1) 149 } 150 151 // BytesToHTML converts bytes to type template.HTML. 152 func BytesToHTML(b []byte) template.HTML { 153 return template.HTML(string(b)) 154 } 155 156 // ExtractTOC extracts Table of Contents from content. 157 func ExtractTOC(content []byte) (newcontent []byte, toc []byte) { 158 if !bytes.Contains(content, []byte("<nav>")) { 159 return content, nil 160 } 161 origContent := make([]byte, len(content)) 162 copy(origContent, content) 163 first := []byte(`<nav> 164 <ul>`) 165 166 last := []byte(`</ul> 167 </nav>`) 168 169 replacement := []byte(`<nav id="TableOfContents"> 170 <ul>`) 171 172 startOfTOC := bytes.Index(content, first) 173 174 peekEnd := len(content) 175 if peekEnd > 70+startOfTOC { 176 peekEnd = 70 + startOfTOC 177 } 178 179 if startOfTOC < 0 { 180 return stripEmptyNav(content), toc 181 } 182 // Need to peek ahead to see if this nav element is actually the right one. 183 correctNav := bytes.Index(content[startOfTOC:peekEnd], []byte(`<li><a href="#`)) 184 if correctNav < 0 { // no match found 185 return content, toc 186 } 187 lengthOfTOC := bytes.Index(content[startOfTOC:], last) + len(last) 188 endOfTOC := startOfTOC + lengthOfTOC 189 190 newcontent = append(content[:startOfTOC], content[endOfTOC:]...) 191 toc = append(replacement, origContent[startOfTOC+len(first):endOfTOC]...) 192 return 193 } 194 195 func (c *ContentSpec) SanitizeAnchorName(s string) string { 196 return c.anchorNameSanitizer.SanitizeAnchorName(s) 197 } 198 199 func (c *ContentSpec) ResolveMarkup(in string) string { 200 in = strings.ToLower(in) 201 switch in { 202 case "md", "markdown", "mdown": 203 return "markdown" 204 case "html", "htm": 205 return "html" 206 default: 207 if conv := c.Converters.Get(in); conv != nil { 208 return conv.Name() 209 } 210 } 211 return "" 212 } 213 214 // TotalWords counts instance of one or more consecutive white space 215 // characters, as defined by unicode.IsSpace, in s. 216 // This is a cheaper way of word counting than the obvious len(strings.Fields(s)). 217 func TotalWords(s string) int { 218 n := 0 219 inWord := false 220 for _, r := range s { 221 wasInWord := inWord 222 inWord = !unicode.IsSpace(r) 223 if inWord && !wasInWord { 224 n++ 225 } 226 } 227 return n 228 } 229 230 // TruncateWordsByRune truncates words by runes. 231 func (c *ContentSpec) TruncateWordsByRune(in []string) (string, bool) { 232 words := make([]string, len(in)) 233 copy(words, in) 234 235 count := 0 236 for index, word := range words { 237 if count >= c.summaryLength { 238 return strings.Join(words[:index], " "), true 239 } 240 runeCount := utf8.RuneCountInString(word) 241 if len(word) == runeCount { 242 count++ 243 } else if count+runeCount < c.summaryLength { 244 count += runeCount 245 } else { 246 for ri := range word { 247 if count >= c.summaryLength { 248 truncatedWords := append(words[:index], word[:ri]) 249 return strings.Join(truncatedWords, " "), true 250 } 251 count++ 252 } 253 } 254 } 255 256 return strings.Join(words, " "), false 257 } 258 259 // TruncateWordsToWholeSentence takes content and truncates to whole sentence 260 // limited by max number of words. It also returns whether it is truncated. 261 func (c *ContentSpec) TruncateWordsToWholeSentence(s string) (string, bool) { 262 var ( 263 wordCount = 0 264 lastWordIndex = -1 265 ) 266 267 for i, r := range s { 268 if unicode.IsSpace(r) { 269 wordCount++ 270 lastWordIndex = i 271 272 if wordCount >= c.summaryLength { 273 break 274 } 275 276 } 277 } 278 279 if lastWordIndex == -1 { 280 return s, false 281 } 282 283 endIndex := -1 284 285 for j, r := range s[lastWordIndex:] { 286 if isEndOfSentence(r) { 287 endIndex = j + lastWordIndex + utf8.RuneLen(r) 288 break 289 } 290 } 291 292 if endIndex == -1 { 293 return s, false 294 } 295 296 return strings.TrimSpace(s[:endIndex]), endIndex < len(s) 297 } 298 299 // TrimShortHTML removes the <p>/</p> tags from HTML input in the situation 300 // where said tags are the only <p> tags in the input and enclose the content 301 // of the input (whitespace excluded). 302 func (c *ContentSpec) TrimShortHTML(input []byte) []byte { 303 firstOpeningP := bytes.Index(input, paragraphIndicator) 304 lastOpeningP := bytes.LastIndex(input, paragraphIndicator) 305 306 lastClosingP := bytes.LastIndex(input, closingPTag) 307 lastClosing := bytes.LastIndex(input, closingIndicator) 308 309 if firstOpeningP == lastOpeningP && lastClosingP == lastClosing { 310 input = bytes.TrimSpace(input) 311 input = bytes.TrimPrefix(input, openingPTag) 312 input = bytes.TrimSuffix(input, closingPTag) 313 input = bytes.TrimSpace(input) 314 } 315 return input 316 } 317 318 func isEndOfSentence(r rune) bool { 319 return r == '.' || r == '?' || r == '!' || r == '"' || r == '\n' 320 } 321 322 // Kept only for benchmark. 323 func (c *ContentSpec) truncateWordsToWholeSentenceOld(content string) (string, bool) { 324 words := strings.Fields(content) 325 326 if c.summaryLength >= len(words) { 327 return strings.Join(words, " "), false 328 } 329 330 for counter, word := range words[c.summaryLength:] { 331 if strings.HasSuffix(word, ".") || 332 strings.HasSuffix(word, "?") || 333 strings.HasSuffix(word, ".\"") || 334 strings.HasSuffix(word, "!") { 335 upper := c.summaryLength + counter + 1 336 return strings.Join(words[:upper], " "), (upper < len(words)) 337 } 338 } 339 340 return strings.Join(words[:c.summaryLength], " "), true 341 }