github.com/gnolang/gno@v0.0.0-20240520182011-228e9d0192ce/examples/gno.land/p/demo/json/escape.gno

github.com/gnolang/gno@v0.0.0-20240520182011-228e9d0192ce/examples/gno.land/p/demo/json/escape.gno (about)

     1  package json
     2  
     3  import (
     4  	"bytes"
     5  	"errors"
     6  	"unicode/utf8"
     7  )
     8  
     9  const (
    10  	supplementalPlanesOffset     = 0x10000
    11  	highSurrogateOffset          = 0xD800
    12  	lowSurrogateOffset           = 0xDC00
    13  	surrogateEnd                 = 0xDFFF
    14  	basicMultilingualPlaneOffset = 0xFFFF
    15  	badHex                       = -1
    16  )
    17  
    18  var hexLookupTable = [256]int{
    19  	'0': 0x0, '1': 0x1, '2': 0x2, '3': 0x3, '4': 0x4,
    20  	'5': 0x5, '6': 0x6, '7': 0x7, '8': 0x8, '9': 0x9,
    21  	'A': 0xA, 'B': 0xB, 'C': 0xC, 'D': 0xD, 'E': 0xE, 'F': 0xF,
    22  	'a': 0xA, 'b': 0xB, 'c': 0xC, 'd': 0xD, 'e': 0xE, 'f': 0xF,
    23  	// Fill unspecified index-value pairs with key and value of -1
    24  	'G': -1, 'H': -1, 'I': -1, 'J': -1,
    25  	'K': -1, 'L': -1, 'M': -1, 'N': -1,
    26  	'O': -1, 'P': -1, 'Q': -1, 'R': -1,
    27  	'S': -1, 'T': -1, 'U': -1, 'V': -1,
    28  	'W': -1, 'X': -1, 'Y': -1, 'Z': -1,
    29  	'g': -1, 'h': -1, 'i': -1, 'j': -1,
    30  	'k': -1, 'l': -1, 'm': -1, 'n': -1,
    31  	'o': -1, 'p': -1, 'q': -1, 'r': -1,
    32  	's': -1, 't': -1, 'u': -1, 'v': -1,
    33  	'w': -1, 'x': -1, 'y': -1, 'z': -1,
    34  }
    35  
    36  func h2i(c byte) int {
    37  	return hexLookupTable[c]
    38  }
    39  
    40  // Unescape takes an input byte slice, processes it to Unescape certain characters,
    41  // and writes the result into an output byte slice.
    42  //
    43  // it returns the processed slice and any error encountered during the Unescape operation.
    44  func Unescape(input, output []byte) ([]byte, error) {
    45  	// find the index of the first backslash in the input slice.
    46  	firstBackslash := bytes.IndexByte(input, backSlash)
    47  	if firstBackslash == -1 {
    48  		return input, nil
    49  	}
    50  
    51  	// ensure the output slice has enough capacity to hold the result.
    52  	inputLen := len(input)
    53  	if cap(output) < inputLen {
    54  		output = make([]byte, inputLen)
    55  	}
    56  
    57  	output = output[:inputLen]
    58  	copy(output, input[:firstBackslash])
    59  
    60  	input = input[firstBackslash:]
    61  	buf := output[firstBackslash:]
    62  
    63  	for len(input) > 0 {
    64  		inLen, bufLen, err := processEscapedUTF8(input, buf)
    65  		if err != nil {
    66  			return nil, err
    67  		}
    68  
    69  		input = input[inLen:] // the number of bytes consumed in the input
    70  		buf = buf[bufLen:]    // the number of bytes written to buf
    71  
    72  		// find the next backslash in the remaining input
    73  		nextBackslash := bytes.IndexByte(input, backSlash)
    74  		if nextBackslash == -1 {
    75  			copy(buf, input)
    76  			buf = buf[len(input):]
    77  			break
    78  		}
    79  
    80  		copy(buf, input[:nextBackslash])
    81  
    82  		input = input[nextBackslash:]
    83  		buf = buf[nextBackslash:]
    84  	}
    85  
    86  	return output[:len(output)-len(buf)], nil
    87  }
    88  
    89  // isSurrogatePair returns true if the rune is a surrogate pair.
    90  //
    91  // A surrogate pairs are used in UTF-16 encoding to encode characters
    92  // outside the Basic Multilingual Plane (BMP).
    93  func isSurrogatePair(r rune) bool {
    94  	return highSurrogateOffset <= r && r <= surrogateEnd
    95  }
    96  
    97  // combineSurrogates reconstruct the original unicode code points in the
    98  // supplemental plane by combinin the high and low surrogate.
    99  //
   100  // The hight surrogate in the range from U+D800 to U+DBFF,
   101  // and the low surrogate in the range from U+DC00 to U+DFFF.
   102  //
   103  // The formula to combine the surrogates is:
   104  // (high - 0xD800) * 0x400 + (low - 0xDC00) + 0x10000
   105  func combineSurrogates(high, low rune) rune {
   106  	return ((high - highSurrogateOffset) << 10) + (low - lowSurrogateOffset) + supplementalPlanesOffset
   107  }
   108  
   109  // deocdeSingleUnicodeEscape decodes a unicode escape sequence (e.g., \uXXXX) into a rune.
   110  func decodeSingleUnicodeEscape(b []byte) (rune, bool) {
   111  	if len(b) < 6 {
   112  		return utf8.RuneError, false
   113  	}
   114  
   115  	// convert hex to decimal
   116  	h1, h2, h3, h4 := h2i(b[2]), h2i(b[3]), h2i(b[4]), h2i(b[5])
   117  	if h1 == badHex || h2 == badHex || h3 == badHex || h4 == badHex {
   118  		return utf8.RuneError, false
   119  	}
   120  
   121  	return rune(h1<<12 + h2<<8 + h3<<4 + h4), true
   122  }
   123  
   124  // decodeUnicodeEscape decodes a Unicode escape sequence from a byte slice.
   125  func decodeUnicodeEscape(b []byte) (rune, int) {
   126  	r, ok := decodeSingleUnicodeEscape(b)
   127  	if !ok {
   128  		return utf8.RuneError, -1
   129  	}
   130  
   131  	// determine valid unicode escapes within the BMP
   132  	if r <= basicMultilingualPlaneOffset && !isSurrogatePair(r) {
   133  		return r, 6
   134  	}
   135  
   136  	// Decode the following escape sequence to verify a UTF-16 susergate pair.
   137  	r2, ok := decodeSingleUnicodeEscape(b[6:])
   138  	if !ok {
   139  		return utf8.RuneError, -1
   140  	}
   141  
   142  	if r2 < lowSurrogateOffset {
   143  		return utf8.RuneError, -1
   144  	}
   145  
   146  	return combineSurrogates(r, r2), 12
   147  }
   148  
   149  var escapeByteSet = [256]byte{
   150  	'"':  doubleQuote,
   151  	'\\': backSlash,
   152  	'/':  slash,
   153  	'b':  backSpace,
   154  	'f':  formFeed,
   155  	'n':  newLine,
   156  	'r':  carriageReturn,
   157  	't':  tab,
   158  }
   159  
   160  // Unquote takes a byte slice and unquotes it by removing
   161  // the surrounding quotes and unescaping the contents.
   162  func Unquote(s []byte, border byte) (string, bool) {
   163  	s, ok := unquoteBytes(s, border)
   164  	return string(s), ok
   165  }
   166  
   167  // unquoteBytes takes a byte slice and unquotes it by removing
   168  // TODO: consider to move this function to the strconv package.
   169  func unquoteBytes(s []byte, border byte) ([]byte, bool) {
   170  	if len(s) < 2 || s[0] != border || s[len(s)-1] != border {
   171  		return nil, false
   172  	}
   173  
   174  	s = s[1 : len(s)-1]
   175  
   176  	r := 0
   177  	for r < len(s) {
   178  		c := s[r]
   179  
   180  		if c == backSlash || c == border || c < 0x20 {
   181  			break
   182  		}
   183  
   184  		if c < utf8.RuneSelf {
   185  			r++
   186  			continue
   187  		}
   188  
   189  		rr, size := utf8.DecodeRune(s[r:])
   190  		if rr == utf8.RuneError && size == 1 {
   191  			break
   192  		}
   193  
   194  		r += size
   195  	}
   196  
   197  	if r == len(s) {
   198  		return s, true
   199  	}
   200  
   201  	utfDoubleMax := utf8.UTFMax * 2
   202  	b := make([]byte, len(s)+utfDoubleMax)
   203  	w := copy(b, s[0:r])
   204  
   205  	for r < len(s) {
   206  		if w >= len(b)-utf8.UTFMax {
   207  			nb := make([]byte, utfDoubleMax+(2*len(b)))
   208  			copy(nb, b)
   209  			b = nb
   210  		}
   211  
   212  		c := s[r]
   213  		if c == backSlash {
   214  			r++
   215  			if r >= len(s) {
   216  				return nil, false
   217  			}
   218  
   219  			if s[r] == 'u' {
   220  				rr, res := decodeUnicodeEscape(s[r-1:])
   221  				if res < 0 {
   222  					return nil, false
   223  				}
   224  
   225  				w += utf8.EncodeRune(b[w:], rr)
   226  				r += 5
   227  			} else {
   228  				decode := escapeByteSet[s[r]]
   229  				if decode == 0 {
   230  					return nil, false
   231  				}
   232  
   233  				if decode == doubleQuote || decode == backSlash || decode == slash {
   234  					decode = s[r]
   235  				}
   236  
   237  				b[w] = decode
   238  				r++
   239  				w++
   240  			}
   241  		} else if c == border || c < 0x20 {
   242  			return nil, false
   243  		} else if c < utf8.RuneSelf {
   244  			b[w] = c
   245  			r++
   246  			w++
   247  		} else {
   248  			rr, size := utf8.DecodeRune(s[r:])
   249  
   250  			if rr == utf8.RuneError && size == 1 {
   251  				return nil, false
   252  			}
   253  
   254  			r += size
   255  			w += utf8.EncodeRune(b[w:], rr)
   256  		}
   257  	}
   258  
   259  	return b[:w], true
   260  }
   261  
   262  // processEscapedUTF8 processes the escape sequence in the given byte slice and
   263  // and converts them to UTF-8 characters. The function returns the length of the processed input and output.
   264  //
   265  // The input 'in' must contain the escape sequence to be processed,
   266  // and 'out' provides a space to store the converted characters.
   267  //
   268  // The function returns (input length, output length) if the escape sequence is correct.
   269  // Unicode escape sequences (e.g. \uXXXX) are decoded to UTF-8, other default escape sequences are
   270  // converted to their corresponding special characters (e.g. \n -> newline).
   271  //
   272  // If the escape sequence is invalid, or if 'in' does not completely enclose the escape sequence,
   273  // function returns (-1, -1) to indicate an error.
   274  func processEscapedUTF8(in, out []byte) (int, int, error) {
   275  	if len(in) < 2 || in[0] != backSlash {
   276  		return -1, -1, errors.New("invalid escape sequence")
   277  	}
   278  
   279  	escapeSeqLen := 2
   280  	escapeChar := in[1]
   281  
   282  	if escapeChar != 'u' {
   283  		val := escapeByteSet[escapeChar]
   284  		if val == 0 {
   285  			return -1, -1, errors.New("invalid escape sequence")
   286  		}
   287  
   288  		out[0] = val
   289  		return escapeSeqLen, 1, nil
   290  	}
   291  
   292  	r, size := decodeUnicodeEscape(in)
   293  	if size == -1 {
   294  		return -1, -1, errors.New("invalid escape sequence")
   295  	}
   296  
   297  	outLen := utf8.EncodeRune(out, r)
   298  
   299  	return size, outLen, nil
   300  }