git.lukeshu.com/go/lowmemjson@v0.3.9-0.20230723050957-72f6d13f6fb2/encode_escape.go (about)

     1  // Copyright (C) 2022-2023  Luke Shumaker <lukeshu@lukeshu.com>
     2  //
     3  // SPDX-License-Identifier: GPL-2.0-or-later
     4  
     5  package lowmemjson
     6  
     7  import (
     8  	"fmt"
     9  
    10  	"git.lukeshu.com/go/lowmemjson/internal/jsonstring"
    11  )
    12  
// InvalidUTF8Mode identifies one of the 3 ways that an Encoder or
// ReEncoder can behave when encountering invalid UTF-8 in a string
// value:
//
//   - Replace the byte with the Unicode replacement character U+FFFD.
//
//   - Allow the byte through to the string-encoder, with an
//     escape-mode of BackslashEscapeRawByte.
//
//   - Emit a syntax error.
//
// It is an alias for the InvalidUTF8Mode type of the internal
// jsonstring package.
type InvalidUTF8Mode = jsonstring.InvalidUTF8Mode
    24  
// The InvalidUTF8Mode values, re-exported from the internal
// jsonstring package.
//
// NOTE(review): presumably these correspond, in order, to the three
// behaviors listed on InvalidUTF8Mode (replace with U+FFFD, pass the
// byte through, emit a syntax error); confirm against the jsonstring
// definitions.
const (
	InvalidUTF8Replace  = jsonstring.InvalidUTF8Replace
	InvalidUTF8Preserve = jsonstring.InvalidUTF8Preserve
	InvalidUTF8Error    = jsonstring.InvalidUTF8Error
)
    30  
// BackslashEscapeMode identifies one of the four ways that a
// character may be represented in a JSON string:
//
//   - literally (no backslash escaping)
//
//   - as a short "well-known" `\X` backslash sequence (where `X` is a
//     single-character)
//
//   - as a long Unicode `\uXXXX` backslash sequence (with 16
//     permutations of capitalization)
//
//   - as a raw byte; this allows you to emit invalid JSON; JSON must
//     be valid UTF-8, but this allows you to emit arbitrary binary
//     data.  If the character does not satisfy `utf8.RuneSelf <= char
//     <= 0xFF`, then the encoder will panic.
//
// It is an alias for the BackslashEscapeMode type of the internal
// jsonstring package.
type BackslashEscapeMode = jsonstring.BackslashEscapeMode
    47  
// The BackslashEscapeMode values, re-exported from the internal
// jsonstring package.
const (
	BackslashEscapeNone    = jsonstring.BackslashEscapeNone
	BackslashEscapeShort   = jsonstring.BackslashEscapeShort
	BackslashEscapeRawByte = jsonstring.BackslashEscapeRawByte

	// The long `\uXXXX` modes, one per permutation of capitalization.
	//
	// NOTE(review): presumably each X/x in a name gives the case
	// (upper/lower) of the corresponding hex digit; confirm against
	// the jsonstring definitions.
	BackslashEscapeUnicodeXXXX = jsonstring.BackslashEscapeUnicodeXXXX
	BackslashEscapeUnicodeXXXx = jsonstring.BackslashEscapeUnicodeXXXx
	BackslashEscapeUnicodeXXxX = jsonstring.BackslashEscapeUnicodeXXxX
	BackslashEscapeUnicodeXXxx = jsonstring.BackslashEscapeUnicodeXXxx
	BackslashEscapeUnicodeXxXX = jsonstring.BackslashEscapeUnicodeXxXX
	BackslashEscapeUnicodeXxXx = jsonstring.BackslashEscapeUnicodeXxXx
	BackslashEscapeUnicodeXxxX = jsonstring.BackslashEscapeUnicodeXxxX
	BackslashEscapeUnicodeXxxx = jsonstring.BackslashEscapeUnicodeXxxx
	BackslashEscapeUnicodexXXX = jsonstring.BackslashEscapeUnicodexXXX
	BackslashEscapeUnicodexXXx = jsonstring.BackslashEscapeUnicodexXXx
	BackslashEscapeUnicodexXxX = jsonstring.BackslashEscapeUnicodexXxX
	BackslashEscapeUnicodexXxx = jsonstring.BackslashEscapeUnicodexXxx
	BackslashEscapeUnicodexxXX = jsonstring.BackslashEscapeUnicodexxXX
	BackslashEscapeUnicodexxXx = jsonstring.BackslashEscapeUnicodexxXx
	BackslashEscapeUnicodexxxX = jsonstring.BackslashEscapeUnicodexxxX
	BackslashEscapeUnicodexxxx = jsonstring.BackslashEscapeUnicodexxxx

	// Bounds of the long `\uXXXX` modes; hexToMode computes a mode as
	// an offset from BackslashEscapeUnicodeMin.
	BackslashEscapeUnicodeMin = jsonstring.BackslashEscapeUnicodeMin
	BackslashEscapeUnicodeMax = jsonstring.BackslashEscapeUnicodeMax

	// BackslashEscapeUnicode is retained for backward compatibility.
	BackslashEscapeUnicode = jsonstring.BackslashEscapeUnicode // back-compat
)
    75  
    76  func hexToInt(c byte) rune {
    77  	switch {
    78  	case '0' <= c && c <= '9':
    79  		return rune(c) - '0'
    80  	case 'a' <= c && c <= 'f':
    81  		return rune(c) - 'a' + 10
    82  	case 'A' <= c && c <= 'F':
    83  		return rune(c) - 'A' + 10
    84  	default:
    85  		panic(fmt.Errorf("should not happen: invalid hex char: %q", c))
    86  	}
    87  }
    88  
    89  func hexToRune(a, b, c, d byte) rune {
    90  	return 0 |
    91  		hexToInt(a)<<12 |
    92  		hexToInt(b)<<8 |
    93  		hexToInt(c)<<4 |
    94  		hexToInt(d)<<0
    95  }
    96  
    97  func hexToMode(a, b, c, d byte) BackslashEscapeMode {
    98  	// The 0b0010_0000 bit is the ASCII "lowercase bit".
    99  	return BackslashEscapeUnicodeMin + BackslashEscapeMode(0|
   100  		((a&0b0010_0000)>>2)|
   101  		((b&0b0010_0000)>>3)|
   102  		((c&0b0010_0000)>>4)|
   103  		((d&0b0010_0000)>>5))
   104  }
   105  
// A BackslashEscaper controls how a ReEncoder emits a character in a
// JSON string.  The `rune` argument is the character being
// considered, and the `BackslashEscapeMode` argument is how it was
// originally encoded in the input.  The returned BackslashEscapeMode
// is how the character should be emitted.
//
// The ReEncoder will panic if a BackslashEscaper returns an unknown
// BackslashEscapeMode.  However, a BackslashEscaper should be
// permissive of BackslashEscapeModes it doesn't recognize; it is safe
// to just return them unmodified.
type BackslashEscaper = func(rune, BackslashEscapeMode) BackslashEscapeMode
   116  
// EscapePreserve is a BackslashEscaper that preserves the original
// input escaping: it ignores the character and returns wasEscaped
// unchanged.
func EscapePreserve(_ rune, wasEscaped BackslashEscapeMode) BackslashEscapeMode {
	return wasEscaped
}
   122  
   123  // EscapeJSSafe is a BackslashEscaper that escapes strings such that
   124  // the JSON safe to embed in JS; it otherwise preserves the original
   125  // input escaping.
   126  //
   127  // JSON is notionally a JS subset, but that's not actually true; so
   128  // more conservative backslash-escaping is necessary to safely embed
   129  // it in JS.  http://timelessrepo.com/json-isnt-a-javascript-subset
   130  func EscapeJSSafe(c rune, wasEscaped BackslashEscapeMode) BackslashEscapeMode {
   131  	switch c {
   132  	case '\u2028', '\u2029':
   133  		return BackslashEscapeUnicode
   134  	default:
   135  		return wasEscaped
   136  	}
   137  }
   138  
   139  // EscapeHTMLSafe is a BackslashEscaper that escapes strings such that
   140  // the JSON is safe to embed in HTML; it otherwise preserves the
   141  // original input escaping.
   142  func EscapeHTMLSafe(c rune, wasEscaped BackslashEscapeMode) BackslashEscapeMode {
   143  	switch c {
   144  	case '&', '<', '>':
   145  		return BackslashEscapeUnicode
   146  	default:
   147  		return EscapeJSSafe(c, wasEscaped)
   148  	}
   149  }
   150  
   151  // EscapeDefault is a BackslashEscaper that mimics the default
   152  // behavior of encoding/json.
   153  //
   154  // It is like EscapeHTMLSafe, but also uses long Unicode `\uXXXX`
   155  // sequences for `\b` and `\f`
   156  //
   157  // A ReEncoder uses EscapeDefault if a BackslashEscaper is not
   158  // specified.
   159  func EscapeDefault(c rune, wasEscaped BackslashEscapeMode) BackslashEscapeMode {
   160  	switch c {
   161  	case '\b', '\f':
   162  		return BackslashEscapeUnicode
   163  	default:
   164  		return EscapeHTMLSafe(c, wasEscaped)
   165  	}
   166  }
   167  
   168  // EscapeDefaultNonHTMLSafe is a BackslashEscaper that mimics the
   169  // default behavior of an encoding/json.Encoder that has had
   170  // SetEscapeHTML(false) called on it.
   171  //
   172  // It is like EscapeJSSafe, but also uses long Unicode `\uXXXX`
   173  // sequences for `\b` and `\f`.
   174  func EscapeDefaultNonHTMLSafe(c rune, wasEscaped BackslashEscapeMode) BackslashEscapeMode {
   175  	switch c {
   176  	case '\b', '\f':
   177  		return BackslashEscapeUnicode
   178  	default:
   179  		return EscapeJSSafe(c, wasEscaped)
   180  	}
   181  }