github.com/huandu/go@v0.0.0-20151114150818-04e615e41150/src/html/escape.go (about)

     1  // Copyright 2010 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // Package html provides functions for escaping and unescaping HTML text.
     6  package html
     7  
     8  import (
     9  	"strings"
    10  	"unicode/utf8"
    11  )
    12  
    13  type writer interface {
    14  	WriteString(string) (int, error)
    15  }
    16  
    17  // These replacements permit compatibility with old numeric entities that
    18  // assumed Windows-1252 encoding.
    19  // http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#consume-a-character-reference
    20  var replacementTable = [...]rune{
    21  	'\u20AC', // First entry is what 0x80 should be replaced with.
    22  	'\u0081',
    23  	'\u201A',
    24  	'\u0192',
    25  	'\u201E',
    26  	'\u2026',
    27  	'\u2020',
    28  	'\u2021',
    29  	'\u02C6',
    30  	'\u2030',
    31  	'\u0160',
    32  	'\u2039',
    33  	'\u0152',
    34  	'\u008D',
    35  	'\u017D',
    36  	'\u008F',
    37  	'\u0090',
    38  	'\u2018',
    39  	'\u2019',
    40  	'\u201C',
    41  	'\u201D',
    42  	'\u2022',
    43  	'\u2013',
    44  	'\u2014',
    45  	'\u02DC',
    46  	'\u2122',
    47  	'\u0161',
    48  	'\u203A',
    49  	'\u0153',
    50  	'\u009D',
    51  	'\u017E',
    52  	'\u0178', // Last entry is 0x9F.
    53  	// 0x00->'\uFFFD' is handled programmatically.
    54  	// 0x0D->'\u000D' is a no-op.
    55  }
    56  
    57  // unescapeEntity reads an entity like "<" from b[src:] and writes the
    58  // corresponding "<" to b[dst:], returning the incremented dst and src cursors.
    59  // Precondition: b[src] == '&' && dst <= src.
    60  // attribute should be true if parsing an attribute value.
    61  func unescapeEntity(b []byte, dst, src int, attribute bool) (dst1, src1 int) {
    62  	// http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#consume-a-character-reference
    63  
    64  	// i starts at 1 because we already know that s[0] == '&'.
    65  	i, s := 1, b[src:]
    66  
    67  	if len(s) <= 1 {
    68  		b[dst] = b[src]
    69  		return dst + 1, src + 1
    70  	}
    71  
    72  	if s[i] == '#' {
    73  		if len(s) <= 3 { // We need to have at least "&#.".
    74  			b[dst] = b[src]
    75  			return dst + 1, src + 1
    76  		}
    77  		i++
    78  		c := s[i]
    79  		hex := false
    80  		if c == 'x' || c == 'X' {
    81  			hex = true
    82  			i++
    83  		}
    84  
    85  		x := '\x00'
    86  		for i < len(s) {
    87  			c = s[i]
    88  			i++
    89  			if hex {
    90  				if '0' <= c && c <= '9' {
    91  					x = 16*x + rune(c) - '0'
    92  					continue
    93  				} else if 'a' <= c && c <= 'f' {
    94  					x = 16*x + rune(c) - 'a' + 10
    95  					continue
    96  				} else if 'A' <= c && c <= 'F' {
    97  					x = 16*x + rune(c) - 'A' + 10
    98  					continue
    99  				}
   100  			} else if '0' <= c && c <= '9' {
   101  				x = 10*x + rune(c) - '0'
   102  				continue
   103  			}
   104  			if c != ';' {
   105  				i--
   106  			}
   107  			break
   108  		}
   109  
   110  		if i <= 3 { // No characters matched.
   111  			b[dst] = b[src]
   112  			return dst + 1, src + 1
   113  		}
   114  
   115  		if 0x80 <= x && x <= 0x9F {
   116  			// Replace characters from Windows-1252 with UTF-8 equivalents.
   117  			x = replacementTable[x-0x80]
   118  		} else if x == 0 || (0xD800 <= x && x <= 0xDFFF) || x > 0x10FFFF {
   119  			// Replace invalid characters with the replacement character.
   120  			x = '\uFFFD'
   121  		}
   122  
   123  		return dst + utf8.EncodeRune(b[dst:], x), src + i
   124  	}
   125  
   126  	// Consume the maximum number of characters possible, with the
   127  	// consumed characters matching one of the named references.
   128  
   129  	for i < len(s) {
   130  		c := s[i]
   131  		i++
   132  		// Lower-cased characters are more common in entities, so we check for them first.
   133  		if 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' || '0' <= c && c <= '9' {
   134  			continue
   135  		}
   136  		if c != ';' {
   137  			i--
   138  		}
   139  		break
   140  	}
   141  
   142  	entityName := string(s[1:i])
   143  	if entityName == "" {
   144  		// No-op.
   145  	} else if attribute && entityName[len(entityName)-1] != ';' && len(s) > i && s[i] == '=' {
   146  		// No-op.
   147  	} else if x := entity[entityName]; x != 0 {
   148  		return dst + utf8.EncodeRune(b[dst:], x), src + i
   149  	} else if x := entity2[entityName]; x[0] != 0 {
   150  		dst1 := dst + utf8.EncodeRune(b[dst:], x[0])
   151  		return dst1 + utf8.EncodeRune(b[dst1:], x[1]), src + i
   152  	} else if !attribute {
   153  		maxLen := len(entityName) - 1
   154  		if maxLen > longestEntityWithoutSemicolon {
   155  			maxLen = longestEntityWithoutSemicolon
   156  		}
   157  		for j := maxLen; j > 1; j-- {
   158  			if x := entity[entityName[:j]]; x != 0 {
   159  				return dst + utf8.EncodeRune(b[dst:], x), src + j + 1
   160  			}
   161  		}
   162  	}
   163  
   164  	dst1, src1 = dst+i, src+i
   165  	copy(b[dst:dst1], b[src:src1])
   166  	return dst1, src1
   167  }
   168  
   169  // unescape unescapes b's entities in-place, so that "a&lt;b" becomes "a<b".
   170  func unescape(b []byte) []byte {
   171  	for i, c := range b {
   172  		if c == '&' {
   173  			dst, src := unescapeEntity(b, i, i, false)
   174  			for src < len(b) {
   175  				c := b[src]
   176  				if c == '&' {
   177  					dst, src = unescapeEntity(b, dst, src, false)
   178  				} else {
   179  					b[dst] = c
   180  					dst, src = dst+1, src+1
   181  				}
   182  			}
   183  			return b[0:dst]
   184  		}
   185  	}
   186  	return b
   187  }
   188  
   189  var htmlEscaper = strings.NewReplacer(
   190  	`&`, "&amp;",
   191  	`'`, "&#39;", // "&#39;" is shorter than "&apos;" and apos was not in HTML until HTML5.
   192  	`<`, "&lt;",
   193  	`>`, "&gt;",
   194  	`"`, "&#34;", // "&#34;" is shorter than "&quot;".
   195  )
   196  
   197  // EscapeString escapes special characters like "<" to become "&lt;". It
   198  // escapes only five such characters: <, >, &, ' and ".
   199  // UnescapeString(EscapeString(s)) == s always holds, but the converse isn't
   200  // always true.
   201  func EscapeString(s string) string {
   202  	return htmlEscaper.Replace(s)
   203  }
   204  
   205  // UnescapeString unescapes entities like "&lt;" to become "<". It unescapes a
   206  // larger range of entities than EscapeString escapes. For example, "&aacute;"
   207  // unescapes to "รก", as does "&#225;" and "&xE1;".
   208  // UnescapeString(EscapeString(s)) == s always holds, but the converse isn't
   209  // always true.
   210  func UnescapeString(s string) string {
   211  	if !strings.Contains(s, "&") {
   212  		return s
   213  	}
   214  	return string(unescape([]byte(s)))
   215  }