github.com/insionng/yougam@v0.0.0-20170714101924-2bc18d833463/libraries/bluemonday/sanitize.go

// Copyright (c) 2014, David Kitchen <david@buro9.com>
//
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// * Redistributions of source code must retain the above copyright notice, this
//   list of conditions and the following disclaimer.
//
// * Redistributions in binary form must reproduce the above copyright notice,
//   this list of conditions and the following disclaimer in the documentation
//   and/or other materials provided with the distribution.
//
// * Neither the name of the organisation (Microcosm) nor the names of its
//   contributors may be used to endorse or promote products derived from
//   this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

package bluemonday

import (
	"bytes"
	"io"
	"net/url"
	"strings"

	"github.com/insionng/yougam/libraries/x/net/html"
)

// Sanitize takes a string that contains an HTML fragment or document and
// applies the given policy whitelist.
//
// It returns an HTML string that has been sanitized by the policy, or an empty
// string if an error has occurred (most likely as a consequence of extremely
// malformed input).
func (p *Policy) Sanitize(s string) string {
	if strings.TrimSpace(s) == "" {
		return s
	}

	return p.sanitize(strings.NewReader(s)).String()
}
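
// A minimal usage sketch (it assumes the UGCPolicy constructor that the
// bluemonday package defines elsewhere; the input and output shown are
// illustrative, not taken from this file):
//
//	p := bluemonday.UGCPolicy()
//	out := p.Sanitize(`<a onblur="alert(1)" href="http://example.com">link</a>`)
//	// out: `<a href="http://example.com" rel="nofollow">link</a>`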

// SanitizeBytes takes a []byte that contains an HTML fragment or document and
// applies the given policy whitelist.
//
// It returns a []byte containing the HTML that has been sanitized by the
// policy, or an empty []byte if an error has occurred (most likely as a
// consequence of extremely malformed input).
func (p *Policy) SanitizeBytes(b []byte) []byte {
	if len(bytes.TrimSpace(b)) == 0 {
		return b
	}

	return p.sanitize(bytes.NewReader(b)).Bytes()
}

// SanitizeReader takes an io.Reader that contains an HTML fragment or document
// and applies the given policy whitelist.
//
// It returns a bytes.Buffer containing the HTML that has been sanitized by the
// policy. Errors during sanitization will merely return an empty result.
func (p *Policy) SanitizeReader(r io.Reader) *bytes.Buffer {
	return p.sanitize(r)
}
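
// A short streaming sketch, e.g. sanitizing a file without reading it into a
// string first (the file name, the os.Open call and the error handling are
// assumptions for illustration only):
//
//	f, err := os.Open("comment.html")
//	if err != nil {
//		// handle the error
//	}
//	defer f.Close()
//
//	buf := p.SanitizeReader(f) // *bytes.Buffer holding the sanitized HTML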

// sanitize performs the actual sanitization process.
func (p *Policy) sanitize(r io.Reader) *bytes.Buffer {

	// It is possible that the developer has created the policy via:
	//   p := bluemonday.Policy{}
	// rather than:
	//   p := bluemonday.NewPolicy()
	// If this is the case, and if they haven't yet triggered an action that
	// would initialize the maps, then we need to do that.
	p.init()

	var buff bytes.Buffer
	tokenizer := html.NewTokenizer(r)

	skipElementContent := false
	skippingElementsCount := 0

	skipClosingTag := false
	closingTagToSkipStack := []string{}

	for {
		if tokenizer.Next() == html.ErrorToken {
			err := tokenizer.Err()
			if err == io.EOF {
				// End of input means end of processing
				return &buff
			}

			// Raw tokenizer error
			return &bytes.Buffer{}
		}

		token := tokenizer.Token()
		switch token.Type {
		case html.DoctypeToken:

			if p.allowDocType {
				buff.WriteString(token.String())
			}

		case html.CommentToken:

			// Comments are ignored by default

		case html.StartTagToken:

			aps, ok := p.elsAndAttrs[token.Data]
			if !ok {
				if _, ok := p.setOfElementsToSkipContent[token.Data]; ok {
					skipElementContent = true
					skippingElementsCount++
				}
				break
			}

			if len(token.Attr) != 0 {
				token.Attr = p.sanitizeAttrs(token.Data, token.Attr, aps)
			}

			if len(token.Attr) == 0 {
				if !p.allowNoAttrs(token.Data) {
					skipClosingTag = true
					closingTagToSkipStack = append(closingTagToSkipStack, token.Data)
					break
				}
			}

			if !skipElementContent {
				buff.WriteString(token.String())
			}

		case html.EndTagToken:

			if skipClosingTag && closingTagToSkipStack[len(closingTagToSkipStack)-1] == token.Data {
				closingTagToSkipStack = closingTagToSkipStack[:len(closingTagToSkipStack)-1]
				if len(closingTagToSkipStack) == 0 {
					skipClosingTag = false
				}
				break
			}

			if _, ok := p.elsAndAttrs[token.Data]; !ok {
				if _, ok := p.setOfElementsToSkipContent[token.Data]; ok {
					skippingElementsCount--
					if skippingElementsCount == 0 {
						skipElementContent = false
					}
				}
				break
			}

			if !skipElementContent {
				buff.WriteString(token.String())
			}

		case html.SelfClosingTagToken:

			aps, ok := p.elsAndAttrs[token.Data]
			if !ok {
				break
			}

			if len(token.Attr) != 0 {
				token.Attr = p.sanitizeAttrs(token.Data, token.Attr, aps)
			}

			if len(token.Attr) == 0 && !p.allowNoAttrs(token.Data) {
				break
			}

			if !skipElementContent {
				buff.WriteString(token.String())
			}

		case html.TextToken:

			if !skipElementContent {
				buff.WriteString(token.String())
			}

		default:
			// A token that didn't exist in the html package when we wrote this
			return &bytes.Buffer{}
		}
	}
}
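
// To illustrate the loop above: assuming a policy that allows <p> but not
// <script>, and assuming <script> is registered in setOfElementsToSkipContent,
// an input such as (values made up for illustration)
//
//	<p>Hello</p><script>alert(1)</script>
//
// comes out as
//
//	<p>Hello</p>
//
// because the disallowed start tag is dropped and skipElementContent then
// suppresses the text and end tag inside the <script> element.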

// sanitizeAttrs takes a set of element attribute policies and the global
// attribute policies and applies them to the []html.Attribute, returning the
// set of html.Attributes that match the policies.
func (p *Policy) sanitizeAttrs(
	elementName string,
	attrs []html.Attribute,
	aps map[string]attrPolicy,
) []html.Attribute {

	if len(attrs) == 0 {
		return attrs
	}

	// Builds a new attribute slice based on whether the attribute has been
	// whitelisted explicitly or globally.
	cleanAttrs := []html.Attribute{}
	for _, htmlAttr := range attrs {
		// Is there an element specific attribute policy that applies?
		if ap, ok := aps[htmlAttr.Key]; ok {
			if ap.regexp != nil {
				if ap.regexp.MatchString(htmlAttr.Val) {
					cleanAttrs = append(cleanAttrs, htmlAttr)
					continue
				}
			} else {
				cleanAttrs = append(cleanAttrs, htmlAttr)
				continue
			}
		}

		// Is there a global attribute policy that applies?
		if ap, ok := p.globalAttrs[htmlAttr.Key]; ok {
			if ap.regexp != nil {
				if ap.regexp.MatchString(htmlAttr.Val) {
					cleanAttrs = append(cleanAttrs, htmlAttr)
				}
			} else {
				cleanAttrs = append(cleanAttrs, htmlAttr)
			}
		}
	}

	if len(cleanAttrs) == 0 {
		// If nothing was allowed, let's get out of here
		return cleanAttrs
	}
	// cleanAttrs now contains the attributes that are permitted

	if linkable(elementName) {
		if p.requireParseableURLs {
			// Ensure URLs are parseable:
			// - a.href
			// - area.href
			// - link.href
			// - blockquote.cite
			// - q.cite
			// - img.src
			// - script.src
			tmpAttrs := []html.Attribute{}
			for _, htmlAttr := range cleanAttrs {
				switch elementName {
				case "a", "area", "link":
					if htmlAttr.Key == "href" {
						if u, ok := p.validURL(htmlAttr.Val); ok {
							htmlAttr.Val = u
							tmpAttrs = append(tmpAttrs, htmlAttr)
						}
						break
					}
					tmpAttrs = append(tmpAttrs, htmlAttr)
				case "blockquote", "q":
					if htmlAttr.Key == "cite" {
						if u, ok := p.validURL(htmlAttr.Val); ok {
							htmlAttr.Val = u
							tmpAttrs = append(tmpAttrs, htmlAttr)
						}
						break
					}
					tmpAttrs = append(tmpAttrs, htmlAttr)
				case "img", "script":
					if htmlAttr.Key == "src" {
						if u, ok := p.validURL(htmlAttr.Val); ok {
							htmlAttr.Val = u
							tmpAttrs = append(tmpAttrs, htmlAttr)
						}
						break
					}
					tmpAttrs = append(tmpAttrs, htmlAttr)
				default:
					tmpAttrs = append(tmpAttrs, htmlAttr)
				}
			}
			cleanAttrs = tmpAttrs
		}

		if (p.requireNoFollow ||
			p.requireNoFollowFullyQualifiedLinks ||
			p.addTargetBlankToFullyQualifiedLinks) &&
			len(cleanAttrs) > 0 {

			// Add rel="nofollow" and/or target="_blank" if an "href" exists
			switch elementName {
			case "a", "area", "link":
				var hrefFound bool
				var externalLink bool
				for _, htmlAttr := range cleanAttrs {
					if htmlAttr.Key == "href" {
						hrefFound = true

						u, err := url.Parse(htmlAttr.Val)
						if err != nil {
							continue
						}
						if u.Host != "" {
							externalLink = true
						}

						continue
					}
				}

				if hrefFound {
					var noFollowFound bool
					var targetFound bool

					addNoFollow := (p.requireNoFollow ||
						externalLink && p.requireNoFollowFullyQualifiedLinks)

					addTargetBlank := (externalLink &&
						p.addTargetBlankToFullyQualifiedLinks)

					tmpAttrs := []html.Attribute{}
					for _, htmlAttr := range cleanAttrs {

						var appended bool
						if htmlAttr.Key == "rel" && addNoFollow {

							if strings.Contains(htmlAttr.Val, "nofollow") {
								noFollowFound = true
								tmpAttrs = append(tmpAttrs, htmlAttr)
							} else {
								htmlAttr.Val += " nofollow"
								noFollowFound = true
								tmpAttrs = append(tmpAttrs, htmlAttr)
							}

							appended = true
						}

						if elementName == "a" &&
							htmlAttr.Key == "target" &&
							addTargetBlank {

							if strings.Contains(htmlAttr.Val, "_blank") {
								targetFound = true
								tmpAttrs = append(tmpAttrs, htmlAttr)
							} else {
								htmlAttr.Val = "_blank"
								targetFound = true
								tmpAttrs = append(tmpAttrs, htmlAttr)
							}

							appended = true
						}

						if !appended {
							tmpAttrs = append(tmpAttrs, htmlAttr)
						}
					}
					if noFollowFound || targetFound {
						cleanAttrs = tmpAttrs
					}

					if addNoFollow && !noFollowFound {
						rel := html.Attribute{}
						rel.Key = "rel"
						rel.Val = "nofollow"
						cleanAttrs = append(cleanAttrs, rel)
					}

					if elementName == "a" && addTargetBlank && !targetFound {
						target := html.Attribute{}
						target.Key = "target"
						target.Val = "_blank"
						cleanAttrs = append(cleanAttrs, target)
					}
				}
			default:
			}
		}
	}

	return cleanAttrs
}
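
// A sketch of how the per-element and global policies consumed above are
// typically built (AllowAttrs, Matching, OnElements and Globally are builder
// methods defined elsewhere in this package; the attribute names and regexp
// below are illustrative):
//
//	p := bluemonday.NewPolicy()
//	p.AllowAttrs("href").OnElements("a")
//	p.AllowAttrs("class").Matching(regexp.MustCompile(`^[a-zA-Z0-9\s,-]*$`)).Globally()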

// allowNoAttrs reports whether the element is allowed to appear even when it
// carries no attributes.
func (p *Policy) allowNoAttrs(elementName string) bool {
	_, ok := p.setOfElementsAllowedWithoutAttrs[elementName]
	return ok
}

// validURL checks rawurl against the policy's URL requirements and returns the
// (possibly normalised) URL along with whether it is acceptable.
func (p *Policy) validURL(rawurl string) (string, bool) {
	if p.requireParseableURLs {
		// URLs do not contain whitespace
		if strings.Contains(rawurl, " ") ||
			strings.Contains(rawurl, "\t") ||
			strings.Contains(rawurl, "\n") {
			return "", false
		}

		u, err := url.Parse(rawurl)
		if err != nil {
			return "", false
		}

		if u.Scheme != "" {

			urlPolicy, ok := p.allowURLSchemes[u.Scheme]
			if !ok {
				return "", false
			}

			if urlPolicy == nil || urlPolicy(u) {
				return u.String(), true
			}

			return "", false
		}

		if p.allowRelativeURLs {
			if u.String() != "" {
				return u.String(), true
			}
		}

		return "", false
	}

	return rawurl, true
}
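
// A sketch of the policy settings this function responds to (these setter
// names are assumed to exist elsewhere in the package, since allowURLSchemes
// stores an optional per-scheme func; the schemes and the custom "tel" policy
// below are illustrative assumptions):
//
//	p.RequireParseableURLs(true)
//	p.AllowRelativeURLs(false)
//	p.AllowURLSchemes("http", "https", "mailto")
//	p.AllowURLSchemeWithCustomPolicy("tel", func(u *url.URL) bool {
//		return strings.HasPrefix(u.Opaque, "+")
//	})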

// linkable reports whether the element carries a URL-bearing attribute
// (href, cite or src) that should be subject to the URL policies above.
func linkable(elementName string) bool {
	switch elementName {
	case "a", "area", "blockquote", "img", "link", "q", "script":
		return true
	default:
		return false
	}
}