// Copyright (c) 2014, David Kitchen <david@buro9.com>
//
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// * Redistributions of source code must retain the above copyright notice, this
//   list of conditions and the following disclaimer.
//
// * Redistributions in binary form must reproduce the above copyright notice,
//   this list of conditions and the following disclaimer in the documentation
//   and/or other materials provided with the distribution.
//
// * Neither the name of the organisation (Microcosm) nor the names of its
//   contributors may be used to endorse or promote products derived from
//   this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 30 package bluemonday 31 32 import ( 33 "bytes" 34 "io" 35 "net/url" 36 "strings" 37 38 "github.com/insionng/yougam/libraries/x/net/html" 39 ) 40 41 // Sanitize takes a string that contains a HTML fragment or document and applies 42 // the given policy whitelist. 43 // 44 // It returns a HTML string that has been sanitized by the policy or an empty 45 // string if an error has occurred (most likely as a consequence of extremely 46 // malformed input) 47 func (p *Policy) Sanitize(s string) string { 48 if strings.TrimSpace(s) == "" { 49 return s 50 } 51 52 return p.sanitize(strings.NewReader(s)).String() 53 } 54 55 // SanitizeBytes takes a []byte that contains a HTML fragment or document and applies 56 // the given policy whitelist. 57 // 58 // It returns a []byte containing the HTML that has been sanitized by the policy 59 // or an empty []byte if an error has occurred (most likely as a consequence of 60 // extremely malformed input) 61 func (p *Policy) SanitizeBytes(b []byte) []byte { 62 if len(bytes.TrimSpace(b)) == 0 { 63 return b 64 } 65 66 return p.sanitize(bytes.NewReader(b)).Bytes() 67 } 68 69 // SanitizeReader takes an io.Reader that contains a HTML fragment or document 70 // and applies the given policy whitelist. 71 // 72 // It returns a bytes.Buffer containing the HTML that has been sanitized by the 73 // policy. Errors during sanitization will merely return an empty result. 74 func (p *Policy) SanitizeReader(r io.Reader) *bytes.Buffer { 75 return p.sanitize(r) 76 } 77 78 // Performs the actual sanitization process. 79 func (p *Policy) sanitize(r io.Reader) *bytes.Buffer { 80 81 // It is possible that the developer has created the policy via: 82 // p := bluemonday.Policy{} 83 // rather than: 84 // p := bluemonday.NewPolicy() 85 // If this is the case, and if they haven't yet triggered an action that 86 // would initiliaze the maps, then we need to do that. 
87 p.init() 88 89 var buff bytes.Buffer 90 tokenizer := html.NewTokenizer(r) 91 92 skipElementContent := false 93 skippingElementsCount := 0 94 95 skipClosingTag := false 96 closingTagToSkipStack := []string{} 97 98 for { 99 if tokenizer.Next() == html.ErrorToken { 100 err := tokenizer.Err() 101 if err == io.EOF { 102 // End of input means end of processing 103 return &buff 104 } 105 106 // Raw tokenizer error 107 return &bytes.Buffer{} 108 } 109 110 token := tokenizer.Token() 111 switch token.Type { 112 case html.DoctypeToken: 113 114 if p.allowDocType { 115 buff.WriteString(token.String()) 116 } 117 118 case html.CommentToken: 119 120 // Comments are ignored by default 121 122 case html.StartTagToken: 123 124 aps, ok := p.elsAndAttrs[token.Data] 125 if !ok { 126 if _, ok := p.setOfElementsToSkipContent[token.Data]; ok { 127 skipElementContent = true 128 skippingElementsCount++ 129 } 130 break 131 } 132 133 if len(token.Attr) != 0 { 134 token.Attr = p.sanitizeAttrs(token.Data, token.Attr, aps) 135 } 136 137 if len(token.Attr) == 0 { 138 if !p.allowNoAttrs(token.Data) { 139 skipClosingTag = true 140 closingTagToSkipStack = append(closingTagToSkipStack, token.Data) 141 break 142 } 143 } 144 145 if !skipElementContent { 146 buff.WriteString(token.String()) 147 } 148 149 case html.EndTagToken: 150 151 if skipClosingTag && closingTagToSkipStack[len(closingTagToSkipStack)-1] == token.Data { 152 closingTagToSkipStack = closingTagToSkipStack[:len(closingTagToSkipStack)-1] 153 if len(closingTagToSkipStack) == 0 { 154 skipClosingTag = false 155 } 156 break 157 } 158 159 if _, ok := p.elsAndAttrs[token.Data]; !ok { 160 if _, ok := p.setOfElementsToSkipContent[token.Data]; ok { 161 skippingElementsCount-- 162 if skippingElementsCount == 0 { 163 skipElementContent = false 164 } 165 } 166 break 167 } 168 169 if !skipElementContent { 170 buff.WriteString(token.String()) 171 } 172 173 case html.SelfClosingTagToken: 174 175 aps, ok := p.elsAndAttrs[token.Data] 176 if !ok { 177 
break 178 } 179 180 if len(token.Attr) != 0 { 181 token.Attr = p.sanitizeAttrs(token.Data, token.Attr, aps) 182 } 183 184 if len(token.Attr) == 0 && !p.allowNoAttrs(token.Data) { 185 break 186 } 187 188 if !skipElementContent { 189 buff.WriteString(token.String()) 190 } 191 192 case html.TextToken: 193 194 if !skipElementContent { 195 buff.WriteString(token.String()) 196 } 197 198 default: 199 // A token that didn't exist in the html package when we wrote this 200 return &bytes.Buffer{} 201 } 202 } 203 } 204 205 // sanitizeAttrs takes a set of element attribute policies and the global 206 // attribute policies and applies them to the []html.Attribute returning a set 207 // of html.Attributes that match the policies 208 func (p *Policy) sanitizeAttrs( 209 elementName string, 210 attrs []html.Attribute, 211 aps map[string]attrPolicy, 212 ) []html.Attribute { 213 214 if len(attrs) == 0 { 215 return attrs 216 } 217 218 // Builds a new attribute slice based on the whether the attribute has been 219 // whitelisted explicitly or globally. 220 cleanAttrs := []html.Attribute{} 221 for _, htmlAttr := range attrs { 222 // Is there an element specific attribute policy that applies? 223 if ap, ok := aps[htmlAttr.Key]; ok { 224 if ap.regexp != nil { 225 if ap.regexp.MatchString(htmlAttr.Val) { 226 cleanAttrs = append(cleanAttrs, htmlAttr) 227 continue 228 } 229 } else { 230 cleanAttrs = append(cleanAttrs, htmlAttr) 231 continue 232 } 233 } 234 235 // Is there a global attribute policy that applies? 
236 if ap, ok := p.globalAttrs[htmlAttr.Key]; ok { 237 if ap.regexp != nil { 238 if ap.regexp.MatchString(htmlAttr.Val) { 239 cleanAttrs = append(cleanAttrs, htmlAttr) 240 } 241 } else { 242 cleanAttrs = append(cleanAttrs, htmlAttr) 243 } 244 } 245 } 246 247 if len(cleanAttrs) == 0 { 248 // If nothing was allowed, let's get out of here 249 return cleanAttrs 250 } 251 // cleanAttrs now contains the attributes that are permitted 252 253 if linkable(elementName) { 254 if p.requireParseableURLs { 255 // Ensure URLs are parseable: 256 // - a.href 257 // - area.href 258 // - link.href 259 // - blockquote.cite 260 // - q.cite 261 // - img.src 262 // - script.src 263 tmpAttrs := []html.Attribute{} 264 for _, htmlAttr := range cleanAttrs { 265 switch elementName { 266 case "a", "area", "link": 267 if htmlAttr.Key == "href" { 268 if u, ok := p.validURL(htmlAttr.Val); ok { 269 htmlAttr.Val = u 270 tmpAttrs = append(tmpAttrs, htmlAttr) 271 } 272 break 273 } 274 tmpAttrs = append(tmpAttrs, htmlAttr) 275 case "blockquote", "q": 276 if htmlAttr.Key == "cite" { 277 if u, ok := p.validURL(htmlAttr.Val); ok { 278 htmlAttr.Val = u 279 tmpAttrs = append(tmpAttrs, htmlAttr) 280 } 281 break 282 } 283 tmpAttrs = append(tmpAttrs, htmlAttr) 284 case "img", "script": 285 if htmlAttr.Key == "src" { 286 if u, ok := p.validURL(htmlAttr.Val); ok { 287 htmlAttr.Val = u 288 tmpAttrs = append(tmpAttrs, htmlAttr) 289 } 290 break 291 } 292 tmpAttrs = append(tmpAttrs, htmlAttr) 293 default: 294 tmpAttrs = append(tmpAttrs, htmlAttr) 295 } 296 } 297 cleanAttrs = tmpAttrs 298 } 299 300 if (p.requireNoFollow || 301 p.requireNoFollowFullyQualifiedLinks || 302 p.addTargetBlankToFullyQualifiedLinks) && 303 len(cleanAttrs) > 0 { 304 305 // Add rel="nofollow" if a "href" exists 306 switch elementName { 307 case "a", "area", "link": 308 var hrefFound bool 309 var externalLink bool 310 for _, htmlAttr := range cleanAttrs { 311 if htmlAttr.Key == "href" { 312 hrefFound = true 313 314 u, err := 
url.Parse(htmlAttr.Val) 315 if err != nil { 316 continue 317 } 318 if u.Host != "" { 319 externalLink = true 320 } 321 322 continue 323 } 324 } 325 326 if hrefFound { 327 var noFollowFound bool 328 var targetFound bool 329 330 addNoFollow := (p.requireNoFollow || 331 externalLink && p.requireNoFollowFullyQualifiedLinks) 332 333 addTargetBlank := (externalLink && 334 p.addTargetBlankToFullyQualifiedLinks) 335 336 tmpAttrs := []html.Attribute{} 337 for _, htmlAttr := range cleanAttrs { 338 339 var appended bool 340 if htmlAttr.Key == "rel" && addNoFollow { 341 342 if strings.Contains(htmlAttr.Val, "nofollow") { 343 noFollowFound = true 344 tmpAttrs = append(tmpAttrs, htmlAttr) 345 } else { 346 htmlAttr.Val += " nofollow" 347 noFollowFound = true 348 tmpAttrs = append(tmpAttrs, htmlAttr) 349 } 350 351 appended = true 352 } 353 354 if elementName == "a" && 355 htmlAttr.Key == "target" && 356 addTargetBlank { 357 358 if strings.Contains(htmlAttr.Val, "_blank") { 359 targetFound = true 360 tmpAttrs = append(tmpAttrs, htmlAttr) 361 } else { 362 htmlAttr.Val = "_blank" 363 targetFound = true 364 tmpAttrs = append(tmpAttrs, htmlAttr) 365 } 366 367 appended = true 368 } 369 370 if !appended { 371 tmpAttrs = append(tmpAttrs, htmlAttr) 372 } 373 } 374 if noFollowFound || targetFound { 375 cleanAttrs = tmpAttrs 376 } 377 378 if addNoFollow && !noFollowFound { 379 rel := html.Attribute{} 380 rel.Key = "rel" 381 rel.Val = "nofollow" 382 cleanAttrs = append(cleanAttrs, rel) 383 } 384 385 if elementName == "a" && addTargetBlank && !targetFound { 386 rel := html.Attribute{} 387 rel.Key = "target" 388 rel.Val = "_blank" 389 cleanAttrs = append(cleanAttrs, rel) 390 } 391 } 392 default: 393 } 394 } 395 } 396 397 return cleanAttrs 398 } 399 400 func (p *Policy) allowNoAttrs(elementName string) bool { 401 _, ok := p.setOfElementsAllowedWithoutAttrs[elementName] 402 return ok 403 } 404 405 func (p *Policy) validURL(rawurl string) (string, bool) { 406 if p.requireParseableURLs { 407 // 
URLs do not contain whitespace 408 if strings.Contains(rawurl, " ") || 409 strings.Contains(rawurl, "\t") || 410 strings.Contains(rawurl, "\n") { 411 return "", false 412 } 413 414 u, err := url.Parse(rawurl) 415 if err != nil { 416 return "", false 417 } 418 419 if u.Scheme != "" { 420 421 urlPolicy, ok := p.allowURLSchemes[u.Scheme] 422 if !ok { 423 return "", false 424 425 } 426 427 if urlPolicy == nil || urlPolicy(u) == true { 428 return u.String(), true 429 } 430 431 return "", false 432 } 433 434 if p.allowRelativeURLs { 435 if u.String() != "" { 436 return u.String(), true 437 } 438 } 439 440 return "", false 441 } 442 443 return rawurl, true 444 } 445 446 func linkable(elementName string) bool { 447 switch elementName { 448 case "a", "area", "blockquote", "img", "link", "script": 449 return true 450 default: 451 return false 452 } 453 }