gitee.com/quant1x/gox@v1.7.6/text/encoding/entity.go

gitee.com/quant1x/gox@v1.7.6/text/encoding/entity.go (about)

     1  package encoding
     2  
     3  // decoding HTML entities
     4  
     5  import (
     6  	"sync"
     7  )
     8  
// entityOnce guards the one-time construction of entityTrie; EntityDecoder
// runs buildEntityTrie through it.
var entityOnce sync.Once
    10  
// htmlEntityTrie is one node of a trie keyed by the bytes of an HTML entity
// name. It is similar to mbcsTrie, but not identical.
//
// runes holds the decoded value of the entity whose name ends at this node;
// the decoder treats runes[0] == 0 as "no entity ends here", and runes[1] is
// non-zero only for entities that decode to two characters.
//
// children is nil until a longer name passes through this node, at which
// point buildEntityTrie allocates 256 entries indexed by the next name byte.
type htmlEntityTrie struct {
	runes    [2]rune // Some HTML entities decode to two characters.
	children []htmlEntityTrie
}
    16  
// entityTrie is the root node of the entity-name trie. It is filled in by
// buildEntityTrie and walked by the decoder that EntityDecoder returns.
var entityTrie htmlEntityTrie
    18  
    19  func buildEntityTrie() {
    20  	for e, c := range entity {
    21  		current := &entityTrie
    22  		for i := 0; i < len(e); i++ {
    23  			if current.children == nil {
    24  				current.children = make([]htmlEntityTrie, 256)
    25  			}
    26  			current = &current.children[e[i]]
    27  		}
    28  		current.runes[0] = c
    29  	}
    30  
    31  	for e, runes := range entity2 {
    32  		current := &entityTrie
    33  		for i := 0; i < len(e); i++ {
    34  			if current.children == nil {
    35  				current.children = make([]htmlEntityTrie, 256)
    36  			}
    37  			current = &current.children[e[i]]
    38  		}
    39  		current.runes = runes
    40  	}
    41  }
    42  
    43  // EntityDecoder returns a Decoder that decodes HTML character entities.
    44  // If there is no valid character entity at the current position, it returns INVALID_CHAR.
    45  // So it needs to be combined with another Decoder via FallbackDecoder.
    46  func EntityDecoder() Decoder {
    47  	entityOnce.Do(buildEntityTrie)
    48  	var leftover rune // leftover rune from two-rune entity
    49  	return func(p []byte) (r rune, size int, status Status) {
    50  		if leftover != 0 {
    51  			r = leftover
    52  			leftover = 0
    53  			return r, 0, SUCCESS
    54  		}
    55  
    56  		if len(p) == 0 {
    57  			return 0, 0, NO_ROOM
    58  		}
    59  
    60  		if p[0] != '&' {
    61  			return 0xfffd, 1, INVALID_CHAR
    62  		}
    63  
    64  		if len(p) < 3 {
    65  			return 0, 1, NO_ROOM
    66  		}
    67  
    68  		r, size, status = 0xfffd, 1, INVALID_CHAR
    69  		n := 1 // number of bytes read so far
    70  
    71  		if p[n] == '#' {
    72  			n++
    73  			c := p[n]
    74  			hex := false
    75  			if c == 'x' || c == 'X' {
    76  				hex = true
    77  				n++
    78  			}
    79  
    80  			var x rune
    81  			for n < len(p) {
    82  				c = p[n]
    83  				n++
    84  				if hex {
    85  					if '0' <= c && c <= '9' {
    86  						x = 16*x + rune(c) - '0'
    87  						continue
    88  					} else if 'a' <= c && c <= 'f' {
    89  						x = 16*x + rune(c) - 'a' + 10
    90  						continue
    91  					} else if 'A' <= c && c <= 'F' {
    92  						x = 16*x + rune(c) - 'A' + 10
    93  						continue
    94  					}
    95  				} else if '0' <= c && c <= '9' {
    96  					x = 10*x + rune(c) - '0'
    97  					continue
    98  				}
    99  				if c != ';' {
   100  					n--
   101  				}
   102  				break
   103  			}
   104  
   105  			if n == len(p) && p[n-1] != ';' {
   106  				return 0, 0, NO_ROOM
   107  			}
   108  
   109  			size = n
   110  			if p[n-1] == ';' {
   111  				n--
   112  			}
   113  			if hex {
   114  				n--
   115  			}
   116  			n--
   117  			// Now n is the number of actual digits read.
   118  			if n == 0 {
   119  				return 0xfffd, 1, INVALID_CHAR
   120  			}
   121  
   122  			if 0x80 <= x && x <= 0x9F {
   123  				// Replace characters from Windows-1252 with UTF-8 equivalents.
   124  				x = replacementTable[x-0x80]
   125  			} else if x == 0 || (0xD800 <= x && x <= 0xDFFF) || x > 0x10FFFF {
   126  				// Replace invalid characters with the replacement character.
   127  				return 0xfffd, size, INVALID_CHAR
   128  			}
   129  
   130  			r = x
   131  			status = SUCCESS
   132  			return
   133  		}
   134  
   135  		current := &entityTrie
   136  		for current.children != nil {
   137  			if len(p) <= n {
   138  				leftover = 0
   139  				return 0, 0, NO_ROOM
   140  			}
   141  
   142  			current = &current.children[p[n]]
   143  			n++
   144  			if current.runes[0] != 0 {
   145  				r, leftover = current.runes[0], current.runes[1]
   146  				size = n
   147  				status = SUCCESS
   148  				// but don't return yet, since we need the longest match
   149  			}
   150  		}
   151  
   152  		return
   153  	}
   154  }
   155  
   156  // This table is copied from /src/pkg/html/escape.go in the Go source
   157  //
   158  // These replacements permit compatibility with old numeric entities that 
   159  // assumed Windows-1252 encoding.
   160  // http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#consume-a-character-reference
   161  var replacementTable = [...]rune{
   162  	'\u20AC', // First entry is what 0x80 should be replaced with.
   163  	'\u0081',
   164  	'\u201A',
   165  	'\u0192',
   166  	'\u201E',
   167  	'\u2026',
   168  	'\u2020',
   169  	'\u2021',
   170  	'\u02C6',
   171  	'\u2030',
   172  	'\u0160',
   173  	'\u2039',
   174  	'\u0152',
   175  	'\u008D',
   176  	'\u017D',
   177  	'\u008F',
   178  	'\u0090',
   179  	'\u2018',
   180  	'\u2019',
   181  	'\u201C',
   182  	'\u201D',
   183  	'\u2022',
   184  	'\u2013',
   185  	'\u2014',
   186  	'\u02DC',
   187  	'\u2122',
   188  	'\u0161',
   189  	'\u203A',
   190  	'\u0153',
   191  	'\u009D',
   192  	'\u017E',
   193  	'\u0178', // Last entry is 0x9F.
   194  	// 0x00->'\uFFFD' is handled programmatically. 
   195  	// 0x0D->'\u000D' is a no-op.
   196  }