github.com/vugu/vugu@v0.3.6-0.20240430171613-3f6f402e014b/internal/htmlx/escape.go (about)

     1  // Copyright 2010 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package htmlx
     6  
     7  import (
     8  	"bytes"
     9  	"strings"
    10  	"unicode/utf8"
    11  )
    12  
    13  // These replacements permit compatibility with old numeric entities that
    14  // assumed Windows-1252 encoding.
    15  // https://html.spec.whatwg.org/multipage/syntax.html#consume-a-character-reference
    16  var replacementTable = [...]rune{
    17  	'\u20AC', // First entry is what 0x80 should be replaced with.
    18  	'\u0081',
    19  	'\u201A',
    20  	'\u0192',
    21  	'\u201E',
    22  	'\u2026',
    23  	'\u2020',
    24  	'\u2021',
    25  	'\u02C6',
    26  	'\u2030',
    27  	'\u0160',
    28  	'\u2039',
    29  	'\u0152',
    30  	'\u008D',
    31  	'\u017D',
    32  	'\u008F',
    33  	'\u0090',
    34  	'\u2018',
    35  	'\u2019',
    36  	'\u201C',
    37  	'\u201D',
    38  	'\u2022',
    39  	'\u2013',
    40  	'\u2014',
    41  	'\u02DC',
    42  	'\u2122',
    43  	'\u0161',
    44  	'\u203A',
    45  	'\u0153',
    46  	'\u009D',
    47  	'\u017E',
    48  	'\u0178', // Last entry is 0x9F.
    49  	// 0x00->'\uFFFD' is handled programmatically.
    50  	// 0x0D->'\u000D' is a no-op.
    51  }
    52  
    53  // unescapeEntity reads an entity like "<" from b[src:] and writes the
    54  // corresponding "<" to b[dst:], returning the incremented dst and src cursors.
    55  // Precondition: b[src] == '&' && dst <= src.
    56  // attribute should be true if parsing an attribute value.
    57  func unescapeEntity(b []byte, dst, src int, attribute bool) (dst1, src1 int) {
    58  	// https://html.spec.whatwg.org/multipage/syntax.html#consume-a-character-reference
    59  
    60  	// i starts at 1 because we already know that s[0] == '&'.
    61  	i, s := 1, b[src:]
    62  
    63  	if len(s) <= 1 {
    64  		b[dst] = b[src]
    65  		return dst + 1, src + 1
    66  	}
    67  
    68  	if s[i] == '#' {
    69  		if len(s) <= 3 { // We need to have at least "&#.".
    70  			b[dst] = b[src]
    71  			return dst + 1, src + 1
    72  		}
    73  		i++
    74  		c := s[i]
    75  		hex := false
    76  		if c == 'x' || c == 'X' {
    77  			hex = true
    78  			i++
    79  		}
    80  
    81  		x := '\x00'
    82  		for i < len(s) {
    83  			c = s[i]
    84  			i++
    85  			if hex {
    86  				if '0' <= c && c <= '9' {
    87  					x = 16*x + rune(c) - '0'
    88  					continue
    89  				} else if 'a' <= c && c <= 'f' {
    90  					x = 16*x + rune(c) - 'a' + 10
    91  					continue
    92  				} else if 'A' <= c && c <= 'F' {
    93  					x = 16*x + rune(c) - 'A' + 10
    94  					continue
    95  				}
    96  			} else if '0' <= c && c <= '9' {
    97  				x = 10*x + rune(c) - '0'
    98  				continue
    99  			}
   100  			if c != ';' {
   101  				i--
   102  			}
   103  			break
   104  		}
   105  
   106  		if i <= 3 { // No characters matched.
   107  			b[dst] = b[src]
   108  			return dst + 1, src + 1
   109  		}
   110  
   111  		if 0x80 <= x && x <= 0x9F {
   112  			// Replace characters from Windows-1252 with UTF-8 equivalents.
   113  			x = replacementTable[x-0x80]
   114  		} else if x == 0 || (0xD800 <= x && x <= 0xDFFF) || x > 0x10FFFF {
   115  			// Replace invalid characters with the replacement character.
   116  			x = '\uFFFD'
   117  		}
   118  
   119  		return dst + utf8.EncodeRune(b[dst:], x), src + i
   120  	}
   121  
   122  	// Consume the maximum number of characters possible, with the
   123  	// consumed characters matching one of the named references.
   124  
   125  	for i < len(s) {
   126  		c := s[i]
   127  		i++
   128  		// Lower-cased characters are more common in entities, so we check for them first.
   129  		if 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' || '0' <= c && c <= '9' {
   130  			continue
   131  		}
   132  		if c != ';' {
   133  			i--
   134  		}
   135  		break
   136  	}
   137  
   138  	entityName := string(s[1:i])
   139  	if entityName == "" {
   140  		// No-op.
   141  	} else if attribute && entityName[len(entityName)-1] != ';' && len(s) > i && s[i] == '=' {
   142  		// No-op.
   143  	} else if x := entity[entityName]; x != 0 {
   144  		return dst + utf8.EncodeRune(b[dst:], x), src + i
   145  	} else if x := entity2[entityName]; x[0] != 0 {
   146  		dst1 := dst + utf8.EncodeRune(b[dst:], x[0])
   147  		return dst1 + utf8.EncodeRune(b[dst1:], x[1]), src + i
   148  	} else if !attribute {
   149  		maxLen := len(entityName) - 1
   150  		if maxLen > longestEntityWithoutSemicolon {
   151  			maxLen = longestEntityWithoutSemicolon
   152  		}
   153  		for j := maxLen; j > 1; j-- {
   154  			if x := entity[entityName[:j]]; x != 0 {
   155  				return dst + utf8.EncodeRune(b[dst:], x), src + j + 1
   156  			}
   157  		}
   158  	}
   159  
   160  	dst1, src1 = dst+i, src+i
   161  	copy(b[dst:dst1], b[src:src1])
   162  	return dst1, src1
   163  }
   164  
   165  // unescape unescapes b's entities in-place, so that "a&lt;b" becomes "a<b".
   166  // attribute should be true if parsing an attribute value.
   167  func unescape(b []byte, attribute bool) []byte {
   168  	for i, c := range b {
   169  		if c == '&' {
   170  			dst, src := unescapeEntity(b, i, i, attribute)
   171  			for src < len(b) {
   172  				c := b[src]
   173  				if c == '&' {
   174  					dst, src = unescapeEntity(b, dst, src, attribute)
   175  				} else {
   176  					b[dst] = c
   177  					dst, src = dst+1, src+1
   178  				}
   179  			}
   180  			return b[0:dst]
   181  		}
   182  	}
   183  	return b
   184  }
   185  
   186  // lower lower-cases the A-Z bytes in b in-place, so that "aBc" becomes "abc".
   187  func lower(b []byte) []byte {
   188  	for i, c := range b {
   189  		if 'A' <= c && c <= 'Z' {
   190  			b[i] = c + 'a' - 'A'
   191  		}
   192  	}
   193  	return b
   194  }
   195  
   196  const escapedChars = "&'<>\"\r"
   197  
   198  func escape(w writer, s string) error {
   199  	i := strings.IndexAny(s, escapedChars)
   200  	for i != -1 {
   201  		if _, err := w.WriteString(s[:i]); err != nil {
   202  			return err
   203  		}
   204  		var esc string
   205  		switch s[i] {
   206  		case '&':
   207  			esc = "&amp;"
   208  		case '\'':
   209  			// "&#39;" is shorter than "&apos;" and apos was not in HTML until HTML5.
   210  			esc = "&#39;"
   211  		case '<':
   212  			esc = "&lt;"
   213  		case '>':
   214  			esc = "&gt;"
   215  		case '"':
   216  			// "&#34;" is shorter than "&quot;".
   217  			esc = "&#34;"
   218  		case '\r':
   219  			esc = "&#13;"
   220  		default:
   221  			panic("unrecognized escape character")
   222  		}
   223  		s = s[i+1:]
   224  		if _, err := w.WriteString(esc); err != nil {
   225  			return err
   226  		}
   227  		i = strings.IndexAny(s, escapedChars)
   228  	}
   229  	_, err := w.WriteString(s)
   230  	return err
   231  }
   232  
   233  // EscapeString escapes special characters like "<" to become "&lt;". It
   234  // escapes only five such characters: <, >, &, ' and ".
   235  // UnescapeString(EscapeString(s)) == s always holds, but the converse isn't
   236  // always true.
   237  func EscapeString(s string) string {
   238  	if !strings.ContainsAny(s, escapedChars) {
   239  		return s
   240  	}
   241  	var buf bytes.Buffer
   242  	err := escape(&buf, s)
   243  	if err != nil {
   244  		panic(err)
   245  	}
   246  	return buf.String()
   247  }
   248  
   249  // UnescapeString unescapes entities like "&lt;" to become "<". It unescapes a
   250  // larger range of entities than EscapeString escapes. For example, "&aacute;"
   251  // unescapes to "รก", as does "&#225;" and "&xE1;".
   252  // UnescapeString(EscapeString(s)) == s always holds, but the converse isn't
   253  // always true.
   254  func UnescapeString(s string) string {
   255  	for _, c := range s {
   256  		if c == '&' {
   257  			return string(unescape([]byte(s), false))
   258  		}
   259  	}
   260  	return s
   261  }