github.com/hongwozai/go-src-1.4.3@v0.0.0-20191127132709-dc3fce3dbccb/src/html/escape.go (about)

     1  // Copyright 2010 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // Package html provides functions for escaping and unescaping HTML text.
     6  package html
     7  
     8  import (
     9  	"bytes"
    10  	"strings"
    11  	"unicode/utf8"
    12  )
    13  
    14  type writer interface {
    15  	WriteString(string) (int, error)
    16  }
    17  
    18  // These replacements permit compatibility with old numeric entities that
    19  // assumed Windows-1252 encoding.
    20  // http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#consume-a-character-reference
    21  var replacementTable = [...]rune{
    22  	'\u20AC', // First entry is what 0x80 should be replaced with.
    23  	'\u0081',
    24  	'\u201A',
    25  	'\u0192',
    26  	'\u201E',
    27  	'\u2026',
    28  	'\u2020',
    29  	'\u2021',
    30  	'\u02C6',
    31  	'\u2030',
    32  	'\u0160',
    33  	'\u2039',
    34  	'\u0152',
    35  	'\u008D',
    36  	'\u017D',
    37  	'\u008F',
    38  	'\u0090',
    39  	'\u2018',
    40  	'\u2019',
    41  	'\u201C',
    42  	'\u201D',
    43  	'\u2022',
    44  	'\u2013',
    45  	'\u2014',
    46  	'\u02DC',
    47  	'\u2122',
    48  	'\u0161',
    49  	'\u203A',
    50  	'\u0153',
    51  	'\u009D',
    52  	'\u017E',
    53  	'\u0178', // Last entry is 0x9F.
    54  	// 0x00->'\uFFFD' is handled programmatically.
    55  	// 0x0D->'\u000D' is a no-op.
    56  }
    57  
    58  // unescapeEntity reads an entity like "<" from b[src:] and writes the
    59  // corresponding "<" to b[dst:], returning the incremented dst and src cursors.
    60  // Precondition: b[src] == '&' && dst <= src.
    61  // attribute should be true if parsing an attribute value.
    62  func unescapeEntity(b []byte, dst, src int, attribute bool) (dst1, src1 int) {
    63  	// http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#consume-a-character-reference
    64  
    65  	// i starts at 1 because we already know that s[0] == '&'.
    66  	i, s := 1, b[src:]
    67  
    68  	if len(s) <= 1 {
    69  		b[dst] = b[src]
    70  		return dst + 1, src + 1
    71  	}
    72  
    73  	if s[i] == '#' {
    74  		if len(s) <= 3 { // We need to have at least "&#.".
    75  			b[dst] = b[src]
    76  			return dst + 1, src + 1
    77  		}
    78  		i++
    79  		c := s[i]
    80  		hex := false
    81  		if c == 'x' || c == 'X' {
    82  			hex = true
    83  			i++
    84  		}
    85  
    86  		x := '\x00'
    87  		for i < len(s) {
    88  			c = s[i]
    89  			i++
    90  			if hex {
    91  				if '0' <= c && c <= '9' {
    92  					x = 16*x + rune(c) - '0'
    93  					continue
    94  				} else if 'a' <= c && c <= 'f' {
    95  					x = 16*x + rune(c) - 'a' + 10
    96  					continue
    97  				} else if 'A' <= c && c <= 'F' {
    98  					x = 16*x + rune(c) - 'A' + 10
    99  					continue
   100  				}
   101  			} else if '0' <= c && c <= '9' {
   102  				x = 10*x + rune(c) - '0'
   103  				continue
   104  			}
   105  			if c != ';' {
   106  				i--
   107  			}
   108  			break
   109  		}
   110  
   111  		if i <= 3 { // No characters matched.
   112  			b[dst] = b[src]
   113  			return dst + 1, src + 1
   114  		}
   115  
   116  		if 0x80 <= x && x <= 0x9F {
   117  			// Replace characters from Windows-1252 with UTF-8 equivalents.
   118  			x = replacementTable[x-0x80]
   119  		} else if x == 0 || (0xD800 <= x && x <= 0xDFFF) || x > 0x10FFFF {
   120  			// Replace invalid characters with the replacement character.
   121  			x = '\uFFFD'
   122  		}
   123  
   124  		return dst + utf8.EncodeRune(b[dst:], x), src + i
   125  	}
   126  
   127  	// Consume the maximum number of characters possible, with the
   128  	// consumed characters matching one of the named references.
   129  
   130  	for i < len(s) {
   131  		c := s[i]
   132  		i++
   133  		// Lower-cased characters are more common in entities, so we check for them first.
   134  		if 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' || '0' <= c && c <= '9' {
   135  			continue
   136  		}
   137  		if c != ';' {
   138  			i--
   139  		}
   140  		break
   141  	}
   142  
   143  	entityName := string(s[1:i])
   144  	if entityName == "" {
   145  		// No-op.
   146  	} else if attribute && entityName[len(entityName)-1] != ';' && len(s) > i && s[i] == '=' {
   147  		// No-op.
   148  	} else if x := entity[entityName]; x != 0 {
   149  		return dst + utf8.EncodeRune(b[dst:], x), src + i
   150  	} else if x := entity2[entityName]; x[0] != 0 {
   151  		dst1 := dst + utf8.EncodeRune(b[dst:], x[0])
   152  		return dst1 + utf8.EncodeRune(b[dst1:], x[1]), src + i
   153  	} else if !attribute {
   154  		maxLen := len(entityName) - 1
   155  		if maxLen > longestEntityWithoutSemicolon {
   156  			maxLen = longestEntityWithoutSemicolon
   157  		}
   158  		for j := maxLen; j > 1; j-- {
   159  			if x := entity[entityName[:j]]; x != 0 {
   160  				return dst + utf8.EncodeRune(b[dst:], x), src + j + 1
   161  			}
   162  		}
   163  	}
   164  
   165  	dst1, src1 = dst+i, src+i
   166  	copy(b[dst:dst1], b[src:src1])
   167  	return dst1, src1
   168  }
   169  
   170  // unescape unescapes b's entities in-place, so that "a&lt;b" becomes "a<b".
   171  func unescape(b []byte) []byte {
   172  	for i, c := range b {
   173  		if c == '&' {
   174  			dst, src := unescapeEntity(b, i, i, false)
   175  			for src < len(b) {
   176  				c := b[src]
   177  				if c == '&' {
   178  					dst, src = unescapeEntity(b, dst, src, false)
   179  				} else {
   180  					b[dst] = c
   181  					dst, src = dst+1, src+1
   182  				}
   183  			}
   184  			return b[0:dst]
   185  		}
   186  	}
   187  	return b
   188  }
   189  
   190  const escapedChars = `&'<>"`
   191  
   192  func escape(w writer, s string) error {
   193  	i := strings.IndexAny(s, escapedChars)
   194  	for i != -1 {
   195  		if _, err := w.WriteString(s[:i]); err != nil {
   196  			return err
   197  		}
   198  		var esc string
   199  		switch s[i] {
   200  		case '&':
   201  			esc = "&amp;"
   202  		case '\'':
   203  			// "&#39;" is shorter than "&apos;" and apos was not in HTML until HTML5.
   204  			esc = "&#39;"
   205  		case '<':
   206  			esc = "&lt;"
   207  		case '>':
   208  			esc = "&gt;"
   209  		case '"':
   210  			// "&#34;" is shorter than "&quot;".
   211  			esc = "&#34;"
   212  		default:
   213  			panic("unrecognized escape character")
   214  		}
   215  		s = s[i+1:]
   216  		if _, err := w.WriteString(esc); err != nil {
   217  			return err
   218  		}
   219  		i = strings.IndexAny(s, escapedChars)
   220  	}
   221  	_, err := w.WriteString(s)
   222  	return err
   223  }
   224  
   225  // EscapeString escapes special characters like "<" to become "&lt;". It
   226  // escapes only five such characters: <, >, &, ' and ".
   227  // UnescapeString(EscapeString(s)) == s always holds, but the converse isn't
   228  // always true.
   229  func EscapeString(s string) string {
   230  	if strings.IndexAny(s, escapedChars) == -1 {
   231  		return s
   232  	}
   233  	var buf bytes.Buffer
   234  	escape(&buf, s)
   235  	return buf.String()
   236  }
   237  
   238  // UnescapeString unescapes entities like "&lt;" to become "<". It unescapes a
   239  // larger range of entities than EscapeString escapes. For example, "&aacute;"
   240  // unescapes to "รก", as does "&#225;" and "&xE1;".
   241  // UnescapeString(EscapeString(s)) == s always holds, but the converse isn't
   242  // always true.
   243  func UnescapeString(s string) string {
   244  	for _, c := range s {
   245  		if c == '&' {
   246  			return string(unescape([]byte(s)))
   247  		}
   248  	}
   249  	return s
   250  }