github.com/sean-/go@v0.0.0-20151219100004-97f854cd7bb6/src/html/escape.go (about)

     1  // Copyright 2010 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // Package html provides functions for escaping and unescaping HTML text.
     6  package html
     7  
     8  import (
     9  	"strings"
    10  	"unicode/utf8"
    11  )
    12  
    13  type writer interface {
    14  	WriteString(string) (int, error)
    15  }
    16  
    17  // These replacements permit compatibility with old numeric entities that
    18  // assumed Windows-1252 encoding.
    19  // http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#consume-a-character-reference
    20  var replacementTable = [...]rune{
    21  	'\u20AC', // First entry is what 0x80 should be replaced with.
    22  	'\u0081',
    23  	'\u201A',
    24  	'\u0192',
    25  	'\u201E',
    26  	'\u2026',
    27  	'\u2020',
    28  	'\u2021',
    29  	'\u02C6',
    30  	'\u2030',
    31  	'\u0160',
    32  	'\u2039',
    33  	'\u0152',
    34  	'\u008D',
    35  	'\u017D',
    36  	'\u008F',
    37  	'\u0090',
    38  	'\u2018',
    39  	'\u2019',
    40  	'\u201C',
    41  	'\u201D',
    42  	'\u2022',
    43  	'\u2013',
    44  	'\u2014',
    45  	'\u02DC',
    46  	'\u2122',
    47  	'\u0161',
    48  	'\u203A',
    49  	'\u0153',
    50  	'\u009D',
    51  	'\u017E',
    52  	'\u0178', // Last entry is 0x9F.
    53  	// 0x00->'\uFFFD' is handled programmatically.
    54  	// 0x0D->'\u000D' is a no-op.
    55  }
    56  
    57  // unescapeEntity reads an entity like "<" from b[src:] and writes the
    58  // corresponding "<" to b[dst:], returning the incremented dst and src cursors.
    59  // Precondition: b[src] == '&' && dst <= src.
    60  func unescapeEntity(b []byte, dst, src int) (dst1, src1 int) {
    61  	const attribute = false
    62  
    63  	// http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#consume-a-character-reference
    64  
    65  	// i starts at 1 because we already know that s[0] == '&'.
    66  	i, s := 1, b[src:]
    67  
    68  	if len(s) <= 1 {
    69  		b[dst] = b[src]
    70  		return dst + 1, src + 1
    71  	}
    72  
    73  	if s[i] == '#' {
    74  		if len(s) <= 3 { // We need to have at least "&#.".
    75  			b[dst] = b[src]
    76  			return dst + 1, src + 1
    77  		}
    78  		i++
    79  		c := s[i]
    80  		hex := false
    81  		if c == 'x' || c == 'X' {
    82  			hex = true
    83  			i++
    84  		}
    85  
    86  		x := '\x00'
    87  		for i < len(s) {
    88  			c = s[i]
    89  			i++
    90  			if hex {
    91  				if '0' <= c && c <= '9' {
    92  					x = 16*x + rune(c) - '0'
    93  					continue
    94  				} else if 'a' <= c && c <= 'f' {
    95  					x = 16*x + rune(c) - 'a' + 10
    96  					continue
    97  				} else if 'A' <= c && c <= 'F' {
    98  					x = 16*x + rune(c) - 'A' + 10
    99  					continue
   100  				}
   101  			} else if '0' <= c && c <= '9' {
   102  				x = 10*x + rune(c) - '0'
   103  				continue
   104  			}
   105  			if c != ';' {
   106  				i--
   107  			}
   108  			break
   109  		}
   110  
   111  		if i <= 3 { // No characters matched.
   112  			b[dst] = b[src]
   113  			return dst + 1, src + 1
   114  		}
   115  
   116  		if 0x80 <= x && x <= 0x9F {
   117  			// Replace characters from Windows-1252 with UTF-8 equivalents.
   118  			x = replacementTable[x-0x80]
   119  		} else if x == 0 || (0xD800 <= x && x <= 0xDFFF) || x > 0x10FFFF {
   120  			// Replace invalid characters with the replacement character.
   121  			x = '\uFFFD'
   122  		}
   123  
   124  		return dst + utf8.EncodeRune(b[dst:], x), src + i
   125  	}
   126  
   127  	// Consume the maximum number of characters possible, with the
   128  	// consumed characters matching one of the named references.
   129  
   130  	for i < len(s) {
   131  		c := s[i]
   132  		i++
   133  		// Lower-cased characters are more common in entities, so we check for them first.
   134  		if 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' || '0' <= c && c <= '9' {
   135  			continue
   136  		}
   137  		if c != ';' {
   138  			i--
   139  		}
   140  		break
   141  	}
   142  
   143  	entityName := s[1:i]
   144  	if len(entityName) == 0 {
   145  		// No-op.
   146  	} else if attribute && entityName[len(entityName)-1] != ';' && len(s) > i && s[i] == '=' {
   147  		// No-op.
   148  	} else if x := entity[string(entityName)]; x != 0 {
   149  		return dst + utf8.EncodeRune(b[dst:], x), src + i
   150  	} else if x := entity2[string(entityName)]; x[0] != 0 {
   151  		dst1 := dst + utf8.EncodeRune(b[dst:], x[0])
   152  		return dst1 + utf8.EncodeRune(b[dst1:], x[1]), src + i
   153  	} else if !attribute {
   154  		maxLen := len(entityName) - 1
   155  		if maxLen > longestEntityWithoutSemicolon {
   156  			maxLen = longestEntityWithoutSemicolon
   157  		}
   158  		for j := maxLen; j > 1; j-- {
   159  			if x := entity[string(entityName[:j])]; x != 0 {
   160  				return dst + utf8.EncodeRune(b[dst:], x), src + j + 1
   161  			}
   162  		}
   163  	}
   164  
   165  	dst1, src1 = dst+i, src+i
   166  	copy(b[dst:dst1], b[src:src1])
   167  	return dst1, src1
   168  }
   169  
   170  var htmlEscaper = strings.NewReplacer(
   171  	`&`, "&amp;",
   172  	`'`, "&#39;", // "&#39;" is shorter than "&apos;" and apos was not in HTML until HTML5.
   173  	`<`, "&lt;",
   174  	`>`, "&gt;",
   175  	`"`, "&#34;", // "&#34;" is shorter than "&quot;".
   176  )
   177  
   178  // EscapeString escapes special characters like "<" to become "&lt;". It
   179  // escapes only five such characters: <, >, &, ' and ".
   180  // UnescapeString(EscapeString(s)) == s always holds, but the converse isn't
   181  // always true.
   182  func EscapeString(s string) string {
   183  	return htmlEscaper.Replace(s)
   184  }
   185  
   186  // UnescapeString unescapes entities like "&lt;" to become "<". It unescapes a
   187  // larger range of entities than EscapeString escapes. For example, "&aacute;"
   188  // unescapes to "รก", as does "&#225;" and "&xE1;".
   189  // UnescapeString(EscapeString(s)) == s always holds, but the converse isn't
   190  // always true.
   191  func UnescapeString(s string) string {
   192  	i := strings.IndexByte(s, '&')
   193  
   194  	if i < 0 {
   195  		return s
   196  	}
   197  
   198  	b := []byte(s)
   199  	dst, src := unescapeEntity(b, i, i)
   200  	for len(s[src:]) > 0 {
   201  		if s[src] == '&' {
   202  			i = 0
   203  		} else {
   204  			i = strings.IndexByte(s[src:], '&')
   205  		}
   206  		if i < 0 {
   207  			dst += copy(b[dst:], s[src:])
   208  			break
   209  		}
   210  
   211  		if i > 0 {
   212  			copy(b[dst:], s[src:src+i])
   213  		}
   214  		dst, src = unescapeEntity(b, dst+i, src+i)
   215  	}
   216  	return string(b[:dst])
   217  }