git.lukeshu.com/go/lowmemjson@v0.3.9-0.20230723050957-72f6d13f6fb2/encode_escape.go (about) 1 // Copyright (C) 2022-2023 Luke Shumaker <lukeshu@lukeshu.com> 2 // 3 // SPDX-License-Identifier: GPL-2.0-or-later 4 5 package lowmemjson 6 7 import ( 8 "fmt" 9 10 "git.lukeshu.com/go/lowmemjson/internal/jsonstring" 11 ) 12 13 // InvalidUTF8Mode identifies one of the 3 ways that an Encoder or 14 // ReEncoder can behave when encountering invalid UTF-8 in a string 15 // value: 16 // 17 // - Replace the byte with the Unicode replacement character U+FFFD. 18 // 19 // - Allow the byte through to the string-encoder, with an 20 // escape-mode of BackslashEscapeRawByte. 21 // 22 // - Emit a syntax error. 23 type InvalidUTF8Mode = jsonstring.InvalidUTF8Mode 24 25 const ( 26 InvalidUTF8Replace = jsonstring.InvalidUTF8Replace 27 InvalidUTF8Preserve = jsonstring.InvalidUTF8Preserve 28 InvalidUTF8Error = jsonstring.InvalidUTF8Error 29 ) 30 31 // BackslashEscapeMode identifies one of the four ways that a 32 // character may be represented in a JSON string: 33 // 34 // - literally (no backslash escaping) 35 // 36 // - as a short "well-known" `\X` backslash sequence (where `X` is a 37 // single-character) 38 // 39 // - as a long Unicode `\uXXXX` backslash sequence (with 16 40 // permutations of capitalization) 41 // 42 // - as a raw byte; this allows you to emit invalid JSON; JSON must 43 // be valid UTF-8, but this allows you to emit arbitrary binary 44 // data. If the character does not satisfy `utf8.RuneSelf <= char 45 // <= 0xFF`, then the encoder will panic. 46 type BackslashEscapeMode = jsonstring.BackslashEscapeMode 47 48 const ( 49 BackslashEscapeNone = jsonstring.BackslashEscapeNone 50 BackslashEscapeShort = jsonstring.BackslashEscapeShort 51 BackslashEscapeRawByte = jsonstring.BackslashEscapeRawByte 52 53 BackslashEscapeUnicodeXXXX = jsonstring.BackslashEscapeUnicodeXXXX 54 BackslashEscapeUnicodeXXXx = jsonstring.BackslashEscapeUnicodeXXXx 55 BackslashEscapeUnicodeXXxX = jsonstring.BackslashEscapeUnicodeXXxX 56 BackslashEscapeUnicodeXXxx = jsonstring.BackslashEscapeUnicodeXXxx 57 BackslashEscapeUnicodeXxXX = jsonstring.BackslashEscapeUnicodeXxXX 58 BackslashEscapeUnicodeXxXx = jsonstring.BackslashEscapeUnicodeXxXx 59 BackslashEscapeUnicodeXxxX = jsonstring.BackslashEscapeUnicodeXxxX 60 BackslashEscapeUnicodeXxxx = jsonstring.BackslashEscapeUnicodeXxxx 61 BackslashEscapeUnicodexXXX = jsonstring.BackslashEscapeUnicodexXXX 62 BackslashEscapeUnicodexXXx = jsonstring.BackslashEscapeUnicodexXXx 63 BackslashEscapeUnicodexXxX = jsonstring.BackslashEscapeUnicodexXxX 64 BackslashEscapeUnicodexXxx = jsonstring.BackslashEscapeUnicodexXxx 65 BackslashEscapeUnicodexxXX = jsonstring.BackslashEscapeUnicodexxXX 66 BackslashEscapeUnicodexxXx = jsonstring.BackslashEscapeUnicodexxXx 67 BackslashEscapeUnicodexxxX = jsonstring.BackslashEscapeUnicodexxxX 68 BackslashEscapeUnicodexxxx = jsonstring.BackslashEscapeUnicodexxxx 69 70 BackslashEscapeUnicodeMin = jsonstring.BackslashEscapeUnicodeMin 71 BackslashEscapeUnicodeMax = jsonstring.BackslashEscapeUnicodeMax 72 73 BackslashEscapeUnicode = jsonstring.BackslashEscapeUnicode // back-compat 74 ) 75 76 func hexToInt(c byte) rune { 77 switch { 78 case '0' <= c && c <= '9': 79 return rune(c) - '0' 80 case 'a' <= c && c <= 'f': 81 return rune(c) - 'a' + 10 82 case 'A' <= c && c <= 'F': 83 return rune(c) - 'A' + 10 84 default: 85 panic(fmt.Errorf("should not happen: invalid hex char: %q", c)) 86 } 87 } 88 89 func hexToRune(a, b, c, d byte) rune { 90 return 0 | 91 hexToInt(a)<<12 | 92 hexToInt(b)<<8 | 93 hexToInt(c)<<4 | 94 hexToInt(d)<<0 95 } 96 97 func hexToMode(a, b, c, d byte) BackslashEscapeMode { 98 // The 0b0010_0000 bit is the ASCII "lowercase bit". 99 return BackslashEscapeUnicodeMin + BackslashEscapeMode(0| 100 ((a&0b0010_0000)>>2)| 101 ((b&0b0010_0000)>>3)| 102 ((c&0b0010_0000)>>4)| 103 ((d&0b0010_0000)>>5)) 104 } 105 106 // A BackslashEscaper controls how a ReEncoder emits a character in a 107 // JSON string. The `rune` argument is the character being 108 // considered, and the `BackslashEscapeMode` argument is how it was 109 // originally encoded in the input. 110 // 111 // The ReEncoder will panic if a BackslashEscaper returns an unknown 112 // BackslashEscapeMode. However, a BackslashEscaper should be 113 // permissive of BackslashEscapeModes it doesn't recognize; it is safe 114 // to just return them unmodified. 115 type BackslashEscaper = func(rune, BackslashEscapeMode) BackslashEscapeMode 116 117 // EscapePreserve is a BackslashEscaper that preserves the original 118 // input escaping. 119 func EscapePreserve(_ rune, wasEscaped BackslashEscapeMode) BackslashEscapeMode { 120 return wasEscaped 121 } 122 123 // EscapeJSSafe is a BackslashEscaper that escapes strings such that 124 // the JSON safe to embed in JS; it otherwise preserves the original 125 // input escaping. 126 // 127 // JSON is notionally a JS subset, but that's not actually true; so 128 // more conservative backslash-escaping is necessary to safely embed 129 // it in JS. http://timelessrepo.com/json-isnt-a-javascript-subset 130 func EscapeJSSafe(c rune, wasEscaped BackslashEscapeMode) BackslashEscapeMode { 131 switch c { 132 case '\u2028', '\u2029': 133 return BackslashEscapeUnicode 134 default: 135 return wasEscaped 136 } 137 } 138 139 // EscapeHTMLSafe is a BackslashEscaper that escapes strings such that 140 // the JSON is safe to embed in HTML; it otherwise preserves the 141 // original input escaping. 142 func EscapeHTMLSafe(c rune, wasEscaped BackslashEscapeMode) BackslashEscapeMode { 143 switch c { 144 case '&', '<', '>': 145 return BackslashEscapeUnicode 146 default: 147 return EscapeJSSafe(c, wasEscaped) 148 } 149 } 150 151 // EscapeDefault is a BackslashEscaper that mimics the default 152 // behavior of encoding/json. 153 // 154 // It is like EscapeHTMLSafe, but also uses long Unicode `\uXXXX` 155 // sequences for `\b` and `\f` 156 // 157 // A ReEncoder uses EscapeDefault if a BackslashEscaper is not 158 // specified. 159 func EscapeDefault(c rune, wasEscaped BackslashEscapeMode) BackslashEscapeMode { 160 switch c { 161 case '\b', '\f': 162 return BackslashEscapeUnicode 163 default: 164 return EscapeHTMLSafe(c, wasEscaped) 165 } 166 } 167 168 // EscapeDefaultNonHTMLSafe is a BackslashEscaper that mimics the 169 // default behavior of an encoding/json.Encoder that has had 170 // SetEscapeHTML(false) called on it. 171 // 172 // It is like EscapeJSSafe, but also uses long Unicode `\uXXXX` 173 // sequences for `\b` and `\f`. 174 func EscapeDefaultNonHTMLSafe(c rune, wasEscaped BackslashEscapeMode) BackslashEscapeMode { 175 switch c { 176 case '\b', '\f': 177 return BackslashEscapeUnicode 178 default: 179 return EscapeJSSafe(c, wasEscaped) 180 } 181 }