golang.org/x/net@v0.25.1-0.20240516223405-c87a5b62e243/html/escape.go (about)

     1  // Copyright 2010 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package html
     6  
     7  import (
     8  	"bytes"
     9  	"strings"
    10  	"unicode/utf8"
    11  )
    12  
    13  // These replacements permit compatibility with old numeric entities that
    14  // assumed Windows-1252 encoding.
    15  // https://html.spec.whatwg.org/multipage/syntax.html#consume-a-character-reference
    16  var replacementTable = [...]rune{
    17  	'\u20AC', // First entry is what 0x80 should be replaced with.
    18  	'\u0081',
    19  	'\u201A',
    20  	'\u0192',
    21  	'\u201E',
    22  	'\u2026',
    23  	'\u2020',
    24  	'\u2021',
    25  	'\u02C6',
    26  	'\u2030',
    27  	'\u0160',
    28  	'\u2039',
    29  	'\u0152',
    30  	'\u008D',
    31  	'\u017D',
    32  	'\u008F',
    33  	'\u0090',
    34  	'\u2018',
    35  	'\u2019',
    36  	'\u201C',
    37  	'\u201D',
    38  	'\u2022',
    39  	'\u2013',
    40  	'\u2014',
    41  	'\u02DC',
    42  	'\u2122',
    43  	'\u0161',
    44  	'\u203A',
    45  	'\u0153',
    46  	'\u009D',
    47  	'\u017E',
    48  	'\u0178', // Last entry is 0x9F.
    49  	// 0x00->'\uFFFD' is handled programmatically.
    50  	// 0x0D->'\u000D' is a no-op.
    51  }
    52  
    53  // unescapeEntity reads an entity like "<" from b[src:] and writes the
    54  // corresponding "<" to b[dst:], returning the incremented dst and src cursors.
    55  // Precondition: b[src] == '&' && dst <= src.
    56  // attribute should be true if parsing an attribute value.
    57  func unescapeEntity(b []byte, dst, src int, attribute bool) (dst1, src1 int) {
    58  	// https://html.spec.whatwg.org/multipage/syntax.html#consume-a-character-reference
    59  
    60  	// i starts at 1 because we already know that s[0] == '&'.
    61  	i, s := 1, b[src:]
    62  
    63  	if len(s) <= 1 {
    64  		b[dst] = b[src]
    65  		return dst + 1, src + 1
    66  	}
    67  
    68  	if s[i] == '#' {
    69  		if len(s) <= 3 { // We need to have at least "&#.".
    70  			b[dst] = b[src]
    71  			return dst + 1, src + 1
    72  		}
    73  		i++
    74  		c := s[i]
    75  		hex := false
    76  		if c == 'x' || c == 'X' {
    77  			hex = true
    78  			i++
    79  		}
    80  
    81  		x := '\x00'
    82  		for i < len(s) {
    83  			c = s[i]
    84  			i++
    85  			if hex {
    86  				if '0' <= c && c <= '9' {
    87  					x = 16*x + rune(c) - '0'
    88  					continue
    89  				} else if 'a' <= c && c <= 'f' {
    90  					x = 16*x + rune(c) - 'a' + 10
    91  					continue
    92  				} else if 'A' <= c && c <= 'F' {
    93  					x = 16*x + rune(c) - 'A' + 10
    94  					continue
    95  				}
    96  			} else if '0' <= c && c <= '9' {
    97  				x = 10*x + rune(c) - '0'
    98  				continue
    99  			}
   100  			if c != ';' {
   101  				i--
   102  			}
   103  			break
   104  		}
   105  
   106  		if i <= 3 { // No characters matched.
   107  			b[dst] = b[src]
   108  			return dst + 1, src + 1
   109  		}
   110  
   111  		if 0x80 <= x && x <= 0x9F {
   112  			// Replace characters from Windows-1252 with UTF-8 equivalents.
   113  			x = replacementTable[x-0x80]
   114  		} else if x == 0 || (0xD800 <= x && x <= 0xDFFF) || x > 0x10FFFF {
   115  			// Replace invalid characters with the replacement character.
   116  			x = '\uFFFD'
   117  		}
   118  
   119  		return dst + utf8.EncodeRune(b[dst:], x), src + i
   120  	}
   121  
   122  	// Consume the maximum number of characters possible, with the
   123  	// consumed characters matching one of the named references.
   124  
   125  	for i < len(s) {
   126  		c := s[i]
   127  		i++
   128  		// Lower-cased characters are more common in entities, so we check for them first.
   129  		if 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' || '0' <= c && c <= '9' {
   130  			continue
   131  		}
   132  		if c != ';' {
   133  			i--
   134  		}
   135  		break
   136  	}
   137  
   138  	entityName := string(s[1:i])
   139  	if entityName == "" {
   140  		// No-op.
   141  	} else if attribute && entityName[len(entityName)-1] != ';' && len(s) > i && s[i] == '=' {
   142  		// No-op.
   143  	} else if x := entity[entityName]; x != 0 {
   144  		return dst + utf8.EncodeRune(b[dst:], x), src + i
   145  	} else if x := entity2[entityName]; x[0] != 0 {
   146  		dst1 := dst + utf8.EncodeRune(b[dst:], x[0])
   147  		return dst1 + utf8.EncodeRune(b[dst1:], x[1]), src + i
   148  	} else if !attribute {
   149  		maxLen := len(entityName) - 1
   150  		if maxLen > longestEntityWithoutSemicolon {
   151  			maxLen = longestEntityWithoutSemicolon
   152  		}
   153  		for j := maxLen; j > 1; j-- {
   154  			if x := entity[entityName[:j]]; x != 0 {
   155  				return dst + utf8.EncodeRune(b[dst:], x), src + j + 1
   156  			}
   157  		}
   158  	}
   159  
   160  	dst1, src1 = dst+i, src+i
   161  	copy(b[dst:dst1], b[src:src1])
   162  	return dst1, src1
   163  }
   164  
   165  // unescape unescapes b's entities in-place, so that "a&lt;b" becomes "a<b".
   166  // attribute should be true if parsing an attribute value.
   167  func unescape(b []byte, attribute bool) []byte {
   168  	for i, c := range b {
   169  		if c == '&' {
   170  			dst, src := unescapeEntity(b, i, i, attribute)
   171  			for src < len(b) {
   172  				c := b[src]
   173  				if c == '&' {
   174  					dst, src = unescapeEntity(b, dst, src, attribute)
   175  				} else {
   176  					b[dst] = c
   177  					dst, src = dst+1, src+1
   178  				}
   179  			}
   180  			return b[0:dst]
   181  		}
   182  	}
   183  	return b
   184  }
   185  
   186  // lower lower-cases the A-Z bytes in b in-place, so that "aBc" becomes "abc".
   187  func lower(b []byte) []byte {
   188  	for i, c := range b {
   189  		if 'A' <= c && c <= 'Z' {
   190  			b[i] = c + 'a' - 'A'
   191  		}
   192  	}
   193  	return b
   194  }
   195  
   196  // escapeComment is like func escape but escapes its input bytes less often.
   197  // Per https://github.com/golang/go/issues/58246 some HTML comments are (1)
   198  // meaningful and (2) contain angle brackets that we'd like to avoid escaping
   199  // unless we have to.
   200  //
   201  // "We have to" includes the '&' byte, since that introduces other escapes.
   202  //
   203  // It also includes those bytes (not including EOF) that would otherwise end
   204  // the comment. Per the summary table at the bottom of comment_test.go, this is
   205  // the '>' byte that, per above, we'd like to avoid escaping unless we have to.
   206  //
   207  // Studying the summary table (and T actions in its '>' column) closely, we
   208  // only need to escape in states 43, 44, 49, 51 and 52. State 43 is at the
   209  // start of the comment data. State 52 is after a '!'. The other three states
   210  // are after a '-'.
   211  //
   212  // Our algorithm is thus to escape every '&' and to escape '>' if and only if:
   213  //   - The '>' is after a '!' or '-' (in the unescaped data) or
   214  //   - The '>' is at the start of the comment data (after the opening "<!--").
   215  func escapeComment(w writer, s string) error {
   216  	// When modifying this function, consider manually increasing the
   217  	// maxSuffixLen constant in func TestComments, from 6 to e.g. 9 or more.
   218  	// That increase should only be temporary, not committed, as it
   219  	// exponentially affects the test running time.
   220  
   221  	if len(s) == 0 {
   222  		return nil
   223  	}
   224  
   225  	// Loop:
   226  	//   - Grow j such that s[i:j] does not need escaping.
   227  	//   - If s[j] does need escaping, output s[i:j] and an escaped s[j],
   228  	//     resetting i and j to point past that s[j] byte.
   229  	i := 0
   230  	for j := 0; j < len(s); j++ {
   231  		escaped := ""
   232  		switch s[j] {
   233  		case '&':
   234  			escaped = "&amp;"
   235  
   236  		case '>':
   237  			if j > 0 {
   238  				if prev := s[j-1]; (prev != '!') && (prev != '-') {
   239  					continue
   240  				}
   241  			}
   242  			escaped = "&gt;"
   243  
   244  		default:
   245  			continue
   246  		}
   247  
   248  		if i < j {
   249  			if _, err := w.WriteString(s[i:j]); err != nil {
   250  				return err
   251  			}
   252  		}
   253  		if _, err := w.WriteString(escaped); err != nil {
   254  			return err
   255  		}
   256  		i = j + 1
   257  	}
   258  
   259  	if i < len(s) {
   260  		if _, err := w.WriteString(s[i:]); err != nil {
   261  			return err
   262  		}
   263  	}
   264  	return nil
   265  }
   266  
   267  // escapeCommentString is to EscapeString as escapeComment is to escape.
   268  func escapeCommentString(s string) string {
   269  	if strings.IndexAny(s, "&>") == -1 {
   270  		return s
   271  	}
   272  	var buf bytes.Buffer
   273  	escapeComment(&buf, s)
   274  	return buf.String()
   275  }
   276  
   277  const escapedChars = "&'<>\"\r"
   278  
   279  func escape(w writer, s string) error {
   280  	i := strings.IndexAny(s, escapedChars)
   281  	for i != -1 {
   282  		if _, err := w.WriteString(s[:i]); err != nil {
   283  			return err
   284  		}
   285  		var esc string
   286  		switch s[i] {
   287  		case '&':
   288  			esc = "&amp;"
   289  		case '\'':
   290  			// "&#39;" is shorter than "&apos;" and apos was not in HTML until HTML5.
   291  			esc = "&#39;"
   292  		case '<':
   293  			esc = "&lt;"
   294  		case '>':
   295  			esc = "&gt;"
   296  		case '"':
   297  			// "&#34;" is shorter than "&quot;".
   298  			esc = "&#34;"
   299  		case '\r':
   300  			esc = "&#13;"
   301  		default:
   302  			panic("unrecognized escape character")
   303  		}
   304  		s = s[i+1:]
   305  		if _, err := w.WriteString(esc); err != nil {
   306  			return err
   307  		}
   308  		i = strings.IndexAny(s, escapedChars)
   309  	}
   310  	_, err := w.WriteString(s)
   311  	return err
   312  }
   313  
   314  // EscapeString escapes special characters like "<" to become "&lt;". It
   315  // escapes only five such characters: <, >, &, ' and ".
   316  // UnescapeString(EscapeString(s)) == s always holds, but the converse isn't
   317  // always true.
   318  func EscapeString(s string) string {
   319  	if strings.IndexAny(s, escapedChars) == -1 {
   320  		return s
   321  	}
   322  	var buf bytes.Buffer
   323  	escape(&buf, s)
   324  	return buf.String()
   325  }
   326  
   327  // UnescapeString unescapes entities like "&lt;" to become "<". It unescapes a
   328  // larger range of entities than EscapeString escapes. For example, "&aacute;"
   329  // unescapes to "รก", as does "&#225;" and "&xE1;".
   330  // UnescapeString(EscapeString(s)) == s always holds, but the converse isn't
   331  // always true.
   332  func UnescapeString(s string) string {
   333  	for _, c := range s {
   334  		if c == '&' {
   335  			return string(unescape([]byte(s), false))
   336  		}
   337  	}
   338  	return s
   339  }