github.com/insionng/yougam@v0.0.0-20170714101924-2bc18d833463/libraries/mahonia/entity.go (about) 1 package mahonia 2 3 // decoding HTML entities 4 5 import ( 6 "sort" 7 ) 8 9 // EntityDecoder returns a Decoder that decodes HTML character entities. 10 // If there is no valid character entity at the current position, it returns INVALID_CHAR. 11 // So it needs to be combined with another Decoder via FallbackDecoder. 12 func EntityDecoder() Decoder { 13 var leftover rune // leftover rune from two-rune entity 14 return func(p []byte) (r rune, size int, status Status) { 15 if leftover != 0 { 16 r = leftover 17 leftover = 0 18 return r, 0, SUCCESS 19 } 20 21 if len(p) == 0 { 22 return 0, 0, NO_ROOM 23 } 24 25 if p[0] != '&' { 26 return 0xfffd, 1, INVALID_CHAR 27 } 28 29 if len(p) < 3 { 30 return 0, 1, NO_ROOM 31 } 32 33 r, size, status = 0xfffd, 1, INVALID_CHAR 34 n := 1 // number of bytes read so far 35 36 if p[n] == '#' { 37 n++ 38 c := p[n] 39 hex := false 40 if c == 'x' || c == 'X' { 41 hex = true 42 n++ 43 } 44 45 var x rune 46 for n < len(p) { 47 c = p[n] 48 n++ 49 if hex { 50 if '0' <= c && c <= '9' { 51 x = 16*x + rune(c) - '0' 52 continue 53 } else if 'a' <= c && c <= 'f' { 54 x = 16*x + rune(c) - 'a' + 10 55 continue 56 } else if 'A' <= c && c <= 'F' { 57 x = 16*x + rune(c) - 'A' + 10 58 continue 59 } 60 } else if '0' <= c && c <= '9' { 61 x = 10*x + rune(c) - '0' 62 continue 63 } 64 if c != ';' { 65 n-- 66 } 67 break 68 } 69 70 if n == len(p) && p[n-1] != ';' { 71 return 0, 0, NO_ROOM 72 } 73 74 size = n 75 if p[n-1] == ';' { 76 n-- 77 } 78 if hex { 79 n-- 80 } 81 n-- 82 // Now n is the number of actual digits read. 83 if n == 0 { 84 return 0xfffd, 1, INVALID_CHAR 85 } 86 87 if 0x80 <= x && x <= 0x9F { 88 // Replace characters from Windows-1252 with UTF-8 equivalents. 89 x = replacementTable[x-0x80] 90 } else if x == 0 || (0xD800 <= x && x <= 0xDFFF) || x > 0x10FFFF { 91 // Replace invalid characters with the replacement character. 92 return 0xfffd, size, INVALID_CHAR 93 } 94 95 r = x 96 status = SUCCESS 97 return 98 } 99 100 // Look for a named entity in EntityList. 101 102 possible := entityList 103 for len(possible) > 0 { 104 if len(p) <= n { 105 leftover = 0 106 return 0, 0, NO_ROOM 107 } 108 109 c := p[n] 110 111 // Narrow down the selection in possible to those items that have c in the 112 // appropriate byte. 113 first := sort.Search(len(possible), func(i int) bool { 114 e := possible[i].name 115 if len(e) < n { 116 return false 117 } 118 return e[n-1] >= c 119 }) 120 possible = possible[first:] 121 last := sort.Search(len(possible), func(i int) bool { 122 return possible[i].name[n-1] > c 123 }) 124 possible = possible[:last] 125 126 n++ 127 if len(possible) > 0 && len(possible[0].name) == n-1 { 128 r, leftover = possible[0].r1, possible[0].r2 129 size = n 130 status = SUCCESS 131 // but don't return yet, since we need the longest match 132 } 133 } 134 135 return 136 } 137 } 138 139 // This table is copied from /src/pkg/html/escape.go in the Go source 140 // 141 // These replacements permit compatibility with old numeric entities that 142 // assumed Windows-1252 encoding. 143 // http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#consume-a-character-reference 144 var replacementTable = [...]rune{ 145 '\u20AC', // First entry is what 0x80 should be replaced with. 146 '\u0081', 147 '\u201A', 148 '\u0192', 149 '\u201E', 150 '\u2026', 151 '\u2020', 152 '\u2021', 153 '\u02C6', 154 '\u2030', 155 '\u0160', 156 '\u2039', 157 '\u0152', 158 '\u008D', 159 '\u017D', 160 '\u008F', 161 '\u0090', 162 '\u2018', 163 '\u2019', 164 '\u201C', 165 '\u201D', 166 '\u2022', 167 '\u2013', 168 '\u2014', 169 '\u02DC', 170 '\u2122', 171 '\u0161', 172 '\u203A', 173 '\u0153', 174 '\u009D', 175 '\u017E', 176 '\u0178', // Last entry is 0x9F. 177 // 0x00->'\uFFFD' is handled programmatically. 178 // 0x0D->'\u000D' is a no-op. 179 }