github.com/insionng/yougam@v0.0.0-20170714101924-2bc18d833463/libraries/bluemonday/policy.go (about)

     1  // Copyright (c) 2014, David Kitchen <david@buro9.com>
     2  //
     3  // All rights reserved.
     4  //
     5  // Redistribution and use in source and binary forms, with or without
     6  // modification, are permitted provided that the following conditions are met:
     7  //
     8  // * Redistributions of source code must retain the above copyright notice, this
     9  //   list of conditions and the following disclaimer.
    10  //
    11  // * Redistributions in binary form must reproduce the above copyright notice,
    12  //   this list of conditions and the following disclaimer in the documentation
    13  //   and/or other materials provided with the distribution.
    14  //
    15  // * Neither the name of the organisation (Microcosm) nor the names of its
    16  //   contributors may be used to endorse or promote products derived from
    17  //   this software without specific prior written permission.
    18  //
    19  // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
    20  // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
    21  // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
    22  // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
    23  // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
    24  // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
    25  // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
    26  // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
    27  // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
    28  // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
    29  
    30  package bluemonday
    31  
    32  import (
    33  	"net/url"
    34  	"regexp"
    35  	"strings"
    36  )
    37  
    38  // Policy encapsulates the whitelist of HTML elements and attributes that will
    39  // be applied to the sanitised HTML.
    40  //
    41  // You should use bluemonday.NewPolicy() to create a blank policy as the
    42  // unexported fields contain maps that need to be initialized.
    43  type Policy struct {
    44  
    45  	// Declares whether the maps have been initialized, used as a cheap check to
    46  	// ensure that those using Policy{} directly won't cause nil pointer
    47  	// exceptions
    48  	initialized bool
    49  
    50  	// Allows the <!DOCTYPE > tag to exist in the sanitized document
    51  	allowDocType bool
    52  
    53  	// When true, add rel="nofollow" to HTML anchors
    54  	requireNoFollow bool
    55  
    56  	// When true, add rel="nofollow" to HTML anchors
    57  	// Will add for href="http://foo"
    58  	// Will skip for href="/foo" or href="foo"
    59  	requireNoFollowFullyQualifiedLinks bool
    60  
    61  	// When true add target="_blank" to fully qualified links
    62  	// Will add for href="http://foo"
    63  	// Will skip for href="/foo" or href="foo"
    64  	addTargetBlankToFullyQualifiedLinks bool
    65  
    66  	// When true, URLs must be parseable by "net/url" url.Parse()
    67  	requireParseableURLs bool
    68  
    69  	// When true, u, _ := url.Parse("url"); !u.IsAbs() is permitted
    70  	allowRelativeURLs bool
    71  
    72  	// map[htmlElementName]map[htmlAttributeName]attrPolicy
    73  	elsAndAttrs map[string]map[string]attrPolicy
    74  
    75  	// map[htmlAttributeName]attrPolicy
    76  	globalAttrs map[string]attrPolicy
    77  
    78  	// If urlPolicy is nil, all URLs with matching schema are allowed.
    79  	// Otherwise, only the URLs with matching schema and urlPolicy(url)
    80  	// returning true are allowed.
    81  	allowURLSchemes map[string]urlPolicy
    82  
    83  	// If an element has had all attributes removed as a result of a policy
    84  	// being applied, then the element would be removed from the output.
    85  	//
    86  	// However some elements are valid and have strong layout meaning without
    87  	// any attributes, i.e. <table>. To prevent those being removed we maintain
    88  	// a list of elements that are allowed to have no attributes and that will
    89  	// be maintained in the output HTML.
    90  	setOfElementsAllowedWithoutAttrs map[string]struct{}
    91  
    92  	setOfElementsToSkipContent map[string]struct{}
    93  }
    94  
    95  type attrPolicy struct {
    96  
    97  	// optional pattern to match, when not nil the regexp needs to match
    98  	// otherwise the attribute is removed
    99  	regexp *regexp.Regexp
   100  }
   101  
   102  type attrPolicyBuilder struct {
   103  	p *Policy
   104  
   105  	attrNames  []string
   106  	regexp     *regexp.Regexp
   107  	allowEmpty bool
   108  }
   109  
   110  type urlPolicy func(url *url.URL) (allowUrl bool)
   111  
   112  // init initializes the maps if this has not been done already
   113  func (p *Policy) init() {
   114  	if !p.initialized {
   115  		p.elsAndAttrs = make(map[string]map[string]attrPolicy)
   116  		p.globalAttrs = make(map[string]attrPolicy)
   117  		p.allowURLSchemes = make(map[string]urlPolicy)
   118  		p.setOfElementsAllowedWithoutAttrs = make(map[string]struct{})
   119  		p.setOfElementsToSkipContent = make(map[string]struct{})
   120  		p.initialized = true
   121  	}
   122  }
   123  
   124  // NewPolicy returns a blank policy with nothing whitelisted or permitted. This
   125  // is the recommended way to start building a policy and you should now use
   126  // AllowAttrs() and/or AllowElements() to construct the whitelist of HTML
   127  // elements and attributes.
   128  func NewPolicy() *Policy {
   129  
   130  	p := Policy{}
   131  
   132  	p.addDefaultElementsWithoutAttrs()
   133  	p.addDefaultSkipElementContent()
   134  
   135  	return &p
   136  }
   137  
   138  // AllowAttrs takes a range of HTML attribute names and returns an
   139  // attribute policy builder that allows you to specify the pattern and scope of
   140  // the whitelisted attribute.
   141  //
   142  // The attribute policy is only added to the core policy when either Globally()
   143  // or OnElements(...) are called.
   144  func (p *Policy) AllowAttrs(attrNames ...string) *attrPolicyBuilder {
   145  
   146  	p.init()
   147  
   148  	abp := attrPolicyBuilder{
   149  		p:          p,
   150  		allowEmpty: false,
   151  	}
   152  
   153  	for _, attrName := range attrNames {
   154  		abp.attrNames = append(abp.attrNames, strings.ToLower(attrName))
   155  	}
   156  
   157  	return &abp
   158  }
   159  
   160  // AllowNoAttrs says that attributes on element are optional.
   161  //
   162  // The attribute policy is only added to the core policy when OnElements(...)
   163  // are called.
   164  func (p *Policy) AllowNoAttrs() *attrPolicyBuilder {
   165  
   166  	p.init()
   167  
   168  	abp := attrPolicyBuilder{
   169  		p:          p,
   170  		allowEmpty: true,
   171  	}
   172  	return &abp
   173  }
   174  
   175  // AllowNoAttrs says that attributes on element are optional.
   176  //
   177  // The attribute policy is only added to the core policy when OnElements(...)
   178  // are called.
   179  func (abp *attrPolicyBuilder) AllowNoAttrs() *attrPolicyBuilder {
   180  
   181  	abp.allowEmpty = true
   182  
   183  	return abp
   184  }
   185  
   186  // Matching allows a regular expression to be applied to a nascent attribute
   187  // policy, and returns the attribute policy. Calling this more than once will
   188  // replace the existing regexp.
   189  func (abp *attrPolicyBuilder) Matching(regex *regexp.Regexp) *attrPolicyBuilder {
   190  
   191  	abp.regexp = regex
   192  
   193  	return abp
   194  }
   195  
   196  // OnElements will bind an attribute policy to a given range of HTML elements
   197  // and return the updated policy
   198  func (abp *attrPolicyBuilder) OnElements(elements ...string) *Policy {
   199  
   200  	for _, element := range elements {
   201  		element = strings.ToLower(element)
   202  
   203  		for _, attr := range abp.attrNames {
   204  
   205  			if _, ok := abp.p.elsAndAttrs[element]; !ok {
   206  				abp.p.elsAndAttrs[element] = make(map[string]attrPolicy)
   207  			}
   208  
   209  			ap := attrPolicy{}
   210  			if abp.regexp != nil {
   211  				ap.regexp = abp.regexp
   212  			}
   213  
   214  			abp.p.elsAndAttrs[element][attr] = ap
   215  		}
   216  
   217  		if abp.allowEmpty {
   218  			abp.p.setOfElementsAllowedWithoutAttrs[element] = struct{}{}
   219  
   220  			if _, ok := abp.p.elsAndAttrs[element]; !ok {
   221  				abp.p.elsAndAttrs[element] = make(map[string]attrPolicy)
   222  			}
   223  		}
   224  	}
   225  
   226  	return abp.p
   227  }
   228  
   229  // Globally will bind an attribute policy to all HTML elements and return the
   230  // updated policy
   231  func (abp *attrPolicyBuilder) Globally() *Policy {
   232  
   233  	for _, attr := range abp.attrNames {
   234  		if _, ok := abp.p.globalAttrs[attr]; !ok {
   235  			abp.p.globalAttrs[attr] = attrPolicy{}
   236  		}
   237  
   238  		ap := attrPolicy{}
   239  		if abp.regexp != nil {
   240  			ap.regexp = abp.regexp
   241  		}
   242  
   243  		abp.p.globalAttrs[attr] = ap
   244  	}
   245  
   246  	return abp.p
   247  }
   248  
   249  // AllowElements will append HTML elements to the whitelist without applying an
   250  // attribute policy to those elements (the elements are permitted
   251  // sans-attributes)
   252  func (p *Policy) AllowElements(names ...string) *Policy {
   253  	p.init()
   254  
   255  	for _, element := range names {
   256  		element = strings.ToLower(element)
   257  
   258  		if _, ok := p.elsAndAttrs[element]; !ok {
   259  			p.elsAndAttrs[element] = make(map[string]attrPolicy)
   260  		}
   261  	}
   262  
   263  	return p
   264  }
   265  
   266  // RequireNoFollowOnLinks will result in all <a> tags having a rel="nofollow"
   267  // added to them if one does not already exist
   268  //
   269  // Note: This requires p.RequireParseableURLs(true) and will enable it.
   270  func (p *Policy) RequireNoFollowOnLinks(require bool) *Policy {
   271  
   272  	p.requireNoFollow = require
   273  	p.requireParseableURLs = true
   274  
   275  	return p
   276  }
   277  
   278  // RequireNoFollowOnFullyQualifiedLinks will result in all <a> tags that point
   279  // to a non-local destination (i.e. starts with a protocol and has a host)
   280  // having a rel="nofollow" added to them if one does not already exist
   281  //
   282  // Note: This requires p.RequireParseableURLs(true) and will enable it.
   283  func (p *Policy) RequireNoFollowOnFullyQualifiedLinks(require bool) *Policy {
   284  
   285  	p.requireNoFollowFullyQualifiedLinks = require
   286  	p.requireParseableURLs = true
   287  
   288  	return p
   289  }
   290  
   291  // AddTargetBlankToFullyQualifiedLinks will result in all <a> tags that point
   292  // to a non-local destination (i.e. starts with a protocol and has a host)
   293  // having a target="_blank" added to them if one does not already exist
   294  //
   295  // Note: This requires p.RequireParseableURLs(true) and will enable it.
   296  func (p *Policy) AddTargetBlankToFullyQualifiedLinks(require bool) *Policy {
   297  
   298  	p.addTargetBlankToFullyQualifiedLinks = require
   299  	p.requireParseableURLs = true
   300  
   301  	return p
   302  }
   303  
   304  // RequireParseableURLs will result in all URLs requiring that they be parseable
   305  // by "net/url" url.Parse()
   306  // This applies to:
   307  // - a.href
   308  // - area.href
   309  // - blockquote.cite
   310  // - img.src
   311  // - link.href
   312  // - script.src
   313  func (p *Policy) RequireParseableURLs(require bool) *Policy {
   314  
   315  	p.requireParseableURLs = require
   316  
   317  	return p
   318  }
   319  
   320  // AllowRelativeURLs enables RequireParseableURLs and then permits URLs that
   321  // are parseable, have no schema information and url.IsAbs() returns false
   322  // This permits local URLs
   323  func (p *Policy) AllowRelativeURLs(require bool) *Policy {
   324  
   325  	p.RequireParseableURLs(true)
   326  	p.allowRelativeURLs = require
   327  
   328  	return p
   329  }
   330  
   331  // AllowURLSchemes will append URL schemes to the whitelist
   332  // Example: p.AllowURLSchemes("mailto", "http", "https")
   333  func (p *Policy) AllowURLSchemes(schemes ...string) *Policy {
   334  	p.init()
   335  
   336  	p.RequireParseableURLs(true)
   337  
   338  	for _, scheme := range schemes {
   339  		scheme = strings.ToLower(scheme)
   340  
   341  		// Allow all URLs with matching scheme.
   342  		p.allowURLSchemes[scheme] = nil
   343  	}
   344  
   345  	return p
   346  }
   347  
   348  // AllowURLSchemeWithCustomPolicy will append URL schemes with
   349  // a custom URL policy to the whitelist.
   350  // Only the URLs with matching schema and urlPolicy(url)
   351  // returning true will be allowed.
   352  func (p *Policy) AllowURLSchemeWithCustomPolicy(
   353  	scheme string,
   354  	urlPolicy func(url *url.URL) (allowUrl bool),
   355  ) *Policy {
   356  
   357  	p.init()
   358  
   359  	p.RequireParseableURLs(true)
   360  
   361  	scheme = strings.ToLower(scheme)
   362  
   363  	p.allowURLSchemes[scheme] = urlPolicy
   364  
   365  	return p
   366  }
   367  
   368  // AllowDocType states whether the HTML sanitised by the sanitizer is allowed to
   369  // contain the HTML DocType tag: <!DOCTYPE HTML> or one of it's variants.
   370  //
   371  // The HTML spec only permits one doctype per document, and as you know how you
   372  // are using the output of this, you know best as to whether we should ignore it
   373  // (default) or not.
   374  //
   375  // If you are sanitizing a HTML fragment the default (false) is fine.
   376  func (p *Policy) AllowDocType(allow bool) *Policy {
   377  
   378  	p.allowDocType = allow
   379  
   380  	return p
   381  }
   382  
   383  // SkipElementsContent adds the HTML elements whose tags is needed to be removed
   384  // with it's content.
   385  func (p *Policy) SkipElementsContent(names ...string) *Policy {
   386  
   387  	p.init()
   388  
   389  	for _, element := range names {
   390  		element = strings.ToLower(element)
   391  
   392  		if _, ok := p.setOfElementsToSkipContent[element]; !ok {
   393  			p.setOfElementsToSkipContent[element] = struct{}{}
   394  		}
   395  	}
   396  
   397  	return p
   398  }
   399  
   400  // addDefaultElementsWithoutAttrs adds the HTML elements that we know are valid
   401  // without any attributes to an internal map.
   402  // i.e. we know that <table> is valid, but <bdo> isn't valid as the "dir" attr
   403  // is mandatory
   404  func (p *Policy) addDefaultElementsWithoutAttrs() {
   405  	p.init()
   406  
   407  	p.setOfElementsAllowedWithoutAttrs["abbr"] = struct{}{}
   408  	p.setOfElementsAllowedWithoutAttrs["acronym"] = struct{}{}
   409  	p.setOfElementsAllowedWithoutAttrs["article"] = struct{}{}
   410  	p.setOfElementsAllowedWithoutAttrs["aside"] = struct{}{}
   411  	p.setOfElementsAllowedWithoutAttrs["audio"] = struct{}{}
   412  	p.setOfElementsAllowedWithoutAttrs["b"] = struct{}{}
   413  	p.setOfElementsAllowedWithoutAttrs["bdi"] = struct{}{}
   414  	p.setOfElementsAllowedWithoutAttrs["blockquote"] = struct{}{}
   415  	p.setOfElementsAllowedWithoutAttrs["body"] = struct{}{}
   416  	p.setOfElementsAllowedWithoutAttrs["br"] = struct{}{}
   417  	p.setOfElementsAllowedWithoutAttrs["button"] = struct{}{}
   418  	p.setOfElementsAllowedWithoutAttrs["canvas"] = struct{}{}
   419  	p.setOfElementsAllowedWithoutAttrs["caption"] = struct{}{}
   420  	p.setOfElementsAllowedWithoutAttrs["cite"] = struct{}{}
   421  	p.setOfElementsAllowedWithoutAttrs["code"] = struct{}{}
   422  	p.setOfElementsAllowedWithoutAttrs["col"] = struct{}{}
   423  	p.setOfElementsAllowedWithoutAttrs["colgroup"] = struct{}{}
   424  	p.setOfElementsAllowedWithoutAttrs["datalist"] = struct{}{}
   425  	p.setOfElementsAllowedWithoutAttrs["dd"] = struct{}{}
   426  	p.setOfElementsAllowedWithoutAttrs["del"] = struct{}{}
   427  	p.setOfElementsAllowedWithoutAttrs["details"] = struct{}{}
   428  	p.setOfElementsAllowedWithoutAttrs["dfn"] = struct{}{}
   429  	p.setOfElementsAllowedWithoutAttrs["div"] = struct{}{}
   430  	p.setOfElementsAllowedWithoutAttrs["dl"] = struct{}{}
   431  	p.setOfElementsAllowedWithoutAttrs["dt"] = struct{}{}
   432  	p.setOfElementsAllowedWithoutAttrs["em"] = struct{}{}
   433  	p.setOfElementsAllowedWithoutAttrs["fieldset"] = struct{}{}
   434  	p.setOfElementsAllowedWithoutAttrs["figcaption"] = struct{}{}
   435  	p.setOfElementsAllowedWithoutAttrs["figure"] = struct{}{}
   436  	p.setOfElementsAllowedWithoutAttrs["footer"] = struct{}{}
   437  	p.setOfElementsAllowedWithoutAttrs["h1"] = struct{}{}
   438  	p.setOfElementsAllowedWithoutAttrs["h2"] = struct{}{}
   439  	p.setOfElementsAllowedWithoutAttrs["h3"] = struct{}{}
   440  	p.setOfElementsAllowedWithoutAttrs["h4"] = struct{}{}
   441  	p.setOfElementsAllowedWithoutAttrs["h5"] = struct{}{}
   442  	p.setOfElementsAllowedWithoutAttrs["h6"] = struct{}{}
   443  	p.setOfElementsAllowedWithoutAttrs["head"] = struct{}{}
   444  	p.setOfElementsAllowedWithoutAttrs["header"] = struct{}{}
   445  	p.setOfElementsAllowedWithoutAttrs["hgroup"] = struct{}{}
   446  	p.setOfElementsAllowedWithoutAttrs["hr"] = struct{}{}
   447  	p.setOfElementsAllowedWithoutAttrs["html"] = struct{}{}
   448  	p.setOfElementsAllowedWithoutAttrs["i"] = struct{}{}
   449  	p.setOfElementsAllowedWithoutAttrs["ins"] = struct{}{}
   450  	p.setOfElementsAllowedWithoutAttrs["kbd"] = struct{}{}
   451  	p.setOfElementsAllowedWithoutAttrs["li"] = struct{}{}
   452  	p.setOfElementsAllowedWithoutAttrs["mark"] = struct{}{}
   453  	p.setOfElementsAllowedWithoutAttrs["nav"] = struct{}{}
   454  	p.setOfElementsAllowedWithoutAttrs["ol"] = struct{}{}
   455  	p.setOfElementsAllowedWithoutAttrs["optgroup"] = struct{}{}
   456  	p.setOfElementsAllowedWithoutAttrs["option"] = struct{}{}
   457  	p.setOfElementsAllowedWithoutAttrs["p"] = struct{}{}
   458  	p.setOfElementsAllowedWithoutAttrs["pre"] = struct{}{}
   459  	p.setOfElementsAllowedWithoutAttrs["q"] = struct{}{}
   460  	p.setOfElementsAllowedWithoutAttrs["rp"] = struct{}{}
   461  	p.setOfElementsAllowedWithoutAttrs["rt"] = struct{}{}
   462  	p.setOfElementsAllowedWithoutAttrs["ruby"] = struct{}{}
   463  	p.setOfElementsAllowedWithoutAttrs["s"] = struct{}{}
   464  	p.setOfElementsAllowedWithoutAttrs["samp"] = struct{}{}
   465  	p.setOfElementsAllowedWithoutAttrs["section"] = struct{}{}
   466  	p.setOfElementsAllowedWithoutAttrs["select"] = struct{}{}
   467  	p.setOfElementsAllowedWithoutAttrs["small"] = struct{}{}
   468  	p.setOfElementsAllowedWithoutAttrs["span"] = struct{}{}
   469  	p.setOfElementsAllowedWithoutAttrs["strike"] = struct{}{}
   470  	p.setOfElementsAllowedWithoutAttrs["strong"] = struct{}{}
   471  	p.setOfElementsAllowedWithoutAttrs["style"] = struct{}{}
   472  	p.setOfElementsAllowedWithoutAttrs["sub"] = struct{}{}
   473  	p.setOfElementsAllowedWithoutAttrs["summary"] = struct{}{}
   474  	p.setOfElementsAllowedWithoutAttrs["sup"] = struct{}{}
   475  	p.setOfElementsAllowedWithoutAttrs["svg"] = struct{}{}
   476  	p.setOfElementsAllowedWithoutAttrs["table"] = struct{}{}
   477  	p.setOfElementsAllowedWithoutAttrs["tbody"] = struct{}{}
   478  	p.setOfElementsAllowedWithoutAttrs["td"] = struct{}{}
   479  	p.setOfElementsAllowedWithoutAttrs["textarea"] = struct{}{}
   480  	p.setOfElementsAllowedWithoutAttrs["tfoot"] = struct{}{}
   481  	p.setOfElementsAllowedWithoutAttrs["th"] = struct{}{}
   482  	p.setOfElementsAllowedWithoutAttrs["thead"] = struct{}{}
   483  	p.setOfElementsAllowedWithoutAttrs["time"] = struct{}{}
   484  	p.setOfElementsAllowedWithoutAttrs["tr"] = struct{}{}
   485  	p.setOfElementsAllowedWithoutAttrs["tt"] = struct{}{}
   486  	p.setOfElementsAllowedWithoutAttrs["u"] = struct{}{}
   487  	p.setOfElementsAllowedWithoutAttrs["ul"] = struct{}{}
   488  	p.setOfElementsAllowedWithoutAttrs["var"] = struct{}{}
   489  	p.setOfElementsAllowedWithoutAttrs["video"] = struct{}{}
   490  	p.setOfElementsAllowedWithoutAttrs["wbr"] = struct{}{}
   491  
   492  }
   493  
   494  // addDefaultSkipElementContent adds the HTML elements that we should skip
   495  // rendering the character content of, if the element itself is not allowed.
   496  // This is all character data that the end user would not normally see.
   497  // i.e. if we exclude a <script> tag then we shouldn't render the JavaScript or
   498  // anything else until we encounter the closing </script> tag.
   499  func (p *Policy) addDefaultSkipElementContent() {
   500  	p.init()
   501  
   502  	p.setOfElementsToSkipContent["frame"] = struct{}{}
   503  	p.setOfElementsToSkipContent["frameset"] = struct{}{}
   504  	p.setOfElementsToSkipContent["iframe"] = struct{}{}
   505  	p.setOfElementsToSkipContent["noembed"] = struct{}{}
   506  	p.setOfElementsToSkipContent["noframes"] = struct{}{}
   507  	p.setOfElementsToSkipContent["noscript"] = struct{}{}
   508  	p.setOfElementsToSkipContent["nostyle"] = struct{}{}
   509  	p.setOfElementsToSkipContent["object"] = struct{}{}
   510  	p.setOfElementsToSkipContent["script"] = struct{}{}
   511  	p.setOfElementsToSkipContent["style"] = struct{}{}
   512  	p.setOfElementsToSkipContent["title"] = struct{}{}
   513  }