gitee.com/quant1x/gox@v1.7.6/text/encoding/entity.go (about) 1 package encoding 2 3 // decoding HTML entities 4 5 import ( 6 "sync" 7 ) 8 9 var entityOnce sync.Once 10 11 // entityTrie is similar to mbcsTrie, but not identical. 12 type htmlEntityTrie struct { 13 runes [2]rune // Some HTML entities decode to two characters. 14 children []htmlEntityTrie 15 } 16 17 var entityTrie htmlEntityTrie 18 19 func buildEntityTrie() { 20 for e, c := range entity { 21 current := &entityTrie 22 for i := 0; i < len(e); i++ { 23 if current.children == nil { 24 current.children = make([]htmlEntityTrie, 256) 25 } 26 current = ¤t.children[e[i]] 27 } 28 current.runes[0] = c 29 } 30 31 for e, runes := range entity2 { 32 current := &entityTrie 33 for i := 0; i < len(e); i++ { 34 if current.children == nil { 35 current.children = make([]htmlEntityTrie, 256) 36 } 37 current = ¤t.children[e[i]] 38 } 39 current.runes = runes 40 } 41 } 42 43 // EntityDecoder returns a Decoder that decodes HTML character entities. 44 // If there is no valid character entity at the current position, it returns INVALID_CHAR. 45 // So it needs to be combined with another Decoder via FallbackDecoder. 46 func EntityDecoder() Decoder { 47 entityOnce.Do(buildEntityTrie) 48 var leftover rune // leftover rune from two-rune entity 49 return func(p []byte) (r rune, size int, status Status) { 50 if leftover != 0 { 51 r = leftover 52 leftover = 0 53 return r, 0, SUCCESS 54 } 55 56 if len(p) == 0 { 57 return 0, 0, NO_ROOM 58 } 59 60 if p[0] != '&' { 61 return 0xfffd, 1, INVALID_CHAR 62 } 63 64 if len(p) < 3 { 65 return 0, 1, NO_ROOM 66 } 67 68 r, size, status = 0xfffd, 1, INVALID_CHAR 69 n := 1 // number of bytes read so far 70 71 if p[n] == '#' { 72 n++ 73 c := p[n] 74 hex := false 75 if c == 'x' || c == 'X' { 76 hex = true 77 n++ 78 } 79 80 var x rune 81 for n < len(p) { 82 c = p[n] 83 n++ 84 if hex { 85 if '0' <= c && c <= '9' { 86 x = 16*x + rune(c) - '0' 87 continue 88 } else if 'a' <= c && c <= 'f' { 89 x = 16*x + rune(c) - 'a' + 10 90 continue 91 } else if 'A' <= c && c <= 'F' { 92 x = 16*x + rune(c) - 'A' + 10 93 continue 94 } 95 } else if '0' <= c && c <= '9' { 96 x = 10*x + rune(c) - '0' 97 continue 98 } 99 if c != ';' { 100 n-- 101 } 102 break 103 } 104 105 if n == len(p) && p[n-1] != ';' { 106 return 0, 0, NO_ROOM 107 } 108 109 size = n 110 if p[n-1] == ';' { 111 n-- 112 } 113 if hex { 114 n-- 115 } 116 n-- 117 // Now n is the number of actual digits read. 118 if n == 0 { 119 return 0xfffd, 1, INVALID_CHAR 120 } 121 122 if 0x80 <= x && x <= 0x9F { 123 // Replace characters from Windows-1252 with UTF-8 equivalents. 124 x = replacementTable[x-0x80] 125 } else if x == 0 || (0xD800 <= x && x <= 0xDFFF) || x > 0x10FFFF { 126 // Replace invalid characters with the replacement character. 127 return 0xfffd, size, INVALID_CHAR 128 } 129 130 r = x 131 status = SUCCESS 132 return 133 } 134 135 current := &entityTrie 136 for current.children != nil { 137 if len(p) <= n { 138 leftover = 0 139 return 0, 0, NO_ROOM 140 } 141 142 current = ¤t.children[p[n]] 143 n++ 144 if current.runes[0] != 0 { 145 r, leftover = current.runes[0], current.runes[1] 146 size = n 147 status = SUCCESS 148 // but don't return yet, since we need the longest match 149 } 150 } 151 152 return 153 } 154 } 155 156 // This table is copied from /src/pkg/html/escape.go in the Go source 157 // 158 // These replacements permit compatibility with old numeric entities that 159 // assumed Windows-1252 encoding. 160 // http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#consume-a-character-reference 161 var replacementTable = [...]rune{ 162 '\u20AC', // First entry is what 0x80 should be replaced with. 163 '\u0081', 164 '\u201A', 165 '\u0192', 166 '\u201E', 167 '\u2026', 168 '\u2020', 169 '\u2021', 170 '\u02C6', 171 '\u2030', 172 '\u0160', 173 '\u2039', 174 '\u0152', 175 '\u008D', 176 '\u017D', 177 '\u008F', 178 '\u0090', 179 '\u2018', 180 '\u2019', 181 '\u201C', 182 '\u201D', 183 '\u2022', 184 '\u2013', 185 '\u2014', 186 '\u02DC', 187 '\u2122', 188 '\u0161', 189 '\u203A', 190 '\u0153', 191 '\u009D', 192 '\u017E', 193 '\u0178', // Last entry is 0x9F. 194 // 0x00->'\uFFFD' is handled programmatically. 195 // 0x0D->'\u000D' is a no-op. 196 }