github.com/insionng/yougam@v0.0.0-20170714101924-2bc18d833463/libraries/mahonia/entity.go (about)

     1  package mahonia
     2  
     3  // decoding HTML entities
     4  
     5  import (
     6  	"sort"
     7  )
     8  
     9  // EntityDecoder returns a Decoder that decodes HTML character entities.
    10  // If there is no valid character entity at the current position, it returns INVALID_CHAR.
    11  // So it needs to be combined with another Decoder via FallbackDecoder.
    12  func EntityDecoder() Decoder {
    13  	var leftover rune // leftover rune from two-rune entity
    14  	return func(p []byte) (r rune, size int, status Status) {
    15  		if leftover != 0 {
    16  			r = leftover
    17  			leftover = 0
    18  			return r, 0, SUCCESS
    19  		}
    20  
    21  		if len(p) == 0 {
    22  			return 0, 0, NO_ROOM
    23  		}
    24  
    25  		if p[0] != '&' {
    26  			return 0xfffd, 1, INVALID_CHAR
    27  		}
    28  
    29  		if len(p) < 3 {
    30  			return 0, 1, NO_ROOM
    31  		}
    32  
    33  		r, size, status = 0xfffd, 1, INVALID_CHAR
    34  		n := 1 // number of bytes read so far
    35  
    36  		if p[n] == '#' {
    37  			n++
    38  			c := p[n]
    39  			hex := false
    40  			if c == 'x' || c == 'X' {
    41  				hex = true
    42  				n++
    43  			}
    44  
    45  			var x rune
    46  			for n < len(p) {
    47  				c = p[n]
    48  				n++
    49  				if hex {
    50  					if '0' <= c && c <= '9' {
    51  						x = 16*x + rune(c) - '0'
    52  						continue
    53  					} else if 'a' <= c && c <= 'f' {
    54  						x = 16*x + rune(c) - 'a' + 10
    55  						continue
    56  					} else if 'A' <= c && c <= 'F' {
    57  						x = 16*x + rune(c) - 'A' + 10
    58  						continue
    59  					}
    60  				} else if '0' <= c && c <= '9' {
    61  					x = 10*x + rune(c) - '0'
    62  					continue
    63  				}
    64  				if c != ';' {
    65  					n--
    66  				}
    67  				break
    68  			}
    69  
    70  			if n == len(p) && p[n-1] != ';' {
    71  				return 0, 0, NO_ROOM
    72  			}
    73  
    74  			size = n
    75  			if p[n-1] == ';' {
    76  				n--
    77  			}
    78  			if hex {
    79  				n--
    80  			}
    81  			n--
    82  			// Now n is the number of actual digits read.
    83  			if n == 0 {
    84  				return 0xfffd, 1, INVALID_CHAR
    85  			}
    86  
    87  			if 0x80 <= x && x <= 0x9F {
    88  				// Replace characters from Windows-1252 with UTF-8 equivalents.
    89  				x = replacementTable[x-0x80]
    90  			} else if x == 0 || (0xD800 <= x && x <= 0xDFFF) || x > 0x10FFFF {
    91  				// Replace invalid characters with the replacement character.
    92  				return 0xfffd, size, INVALID_CHAR
    93  			}
    94  
    95  			r = x
    96  			status = SUCCESS
    97  			return
    98  		}
    99  
   100  		// Look for a named entity in EntityList.
   101  
   102  		possible := entityList
   103  		for len(possible) > 0 {
   104  			if len(p) <= n {
   105  				leftover = 0
   106  				return 0, 0, NO_ROOM
   107  			}
   108  
   109  			c := p[n]
   110  
   111  			// Narrow down the selection in possible to those items that have c in the
   112  			// appropriate byte.
   113  			first := sort.Search(len(possible), func(i int) bool {
   114  				e := possible[i].name
   115  				if len(e) < n {
   116  					return false
   117  				}
   118  				return e[n-1] >= c
   119  			})
   120  			possible = possible[first:]
   121  			last := sort.Search(len(possible), func(i int) bool {
   122  				return possible[i].name[n-1] > c
   123  			})
   124  			possible = possible[:last]
   125  
   126  			n++
   127  			if len(possible) > 0 && len(possible[0].name) == n-1 {
   128  				r, leftover = possible[0].r1, possible[0].r2
   129  				size = n
   130  				status = SUCCESS
   131  				// but don't return yet, since we need the longest match
   132  			}
   133  		}
   134  
   135  		return
   136  	}
   137  }
   138  
   139  // This table is copied from /src/pkg/html/escape.go in the Go source
   140  //
   141  // These replacements permit compatibility with old numeric entities that
   142  // assumed Windows-1252 encoding.
   143  // http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#consume-a-character-reference
   144  var replacementTable = [...]rune{
   145  	'\u20AC', // First entry is what 0x80 should be replaced with.
   146  	'\u0081',
   147  	'\u201A',
   148  	'\u0192',
   149  	'\u201E',
   150  	'\u2026',
   151  	'\u2020',
   152  	'\u2021',
   153  	'\u02C6',
   154  	'\u2030',
   155  	'\u0160',
   156  	'\u2039',
   157  	'\u0152',
   158  	'\u008D',
   159  	'\u017D',
   160  	'\u008F',
   161  	'\u0090',
   162  	'\u2018',
   163  	'\u2019',
   164  	'\u201C',
   165  	'\u201D',
   166  	'\u2022',
   167  	'\u2013',
   168  	'\u2014',
   169  	'\u02DC',
   170  	'\u2122',
   171  	'\u0161',
   172  	'\u203A',
   173  	'\u0153',
   174  	'\u009D',
   175  	'\u017E',
   176  	'\u0178', // Last entry is 0x9F.
   177  	// 0x00->'\uFFFD' is handled programmatically.
   178  	// 0x0D->'\u000D' is a no-op.
   179  }