git.lukeshu.com/go/lowmemjson@v0.3.9-0.20230723050957-72f6d13f6fb2/internal/jsonstring/encode_string.go (about)

     1  // Copyright (C) 2022-2023  Luke Shumaker <lukeshu@lukeshu.com>
     2  //
     3  // SPDX-License-Identifier: GPL-2.0-or-later
     4  
     5  package jsonstring
     6  
     7  import (
     8  	"encoding/json"
     9  	"fmt"
    10  	"io"
    11  	"reflect"
    12  	"unicode/utf8"
    13  
    14  	"git.lukeshu.com/go/lowmemjson/internal/fastio"
    15  	"git.lukeshu.com/go/lowmemjson/internal/fastio/noescape"
    16  )
    17  
// InvalidUTF8Mode is described in the main lowmemjson package docs.
//
// Within this file it selects what EncodeStringFromString and
// EncodeStringFromBytes do when they reach a byte that does not
// decode as UTF-8.
type InvalidUTF8Mode uint8

const (
	// InvalidUTF8Replace causes the invalid byte to be presented to
	// the escaper as utf8.RuneError, with a suggested mode of
	// BackslashEscapeUnicode.
	InvalidUTF8Replace InvalidUTF8Mode = iota
	// InvalidUTF8Preserve causes the invalid byte to be presented to
	// the escaper as a rune holding the byte's own value, with a
	// suggested mode of BackslashEscapeRawByte.
	InvalidUTF8Preserve
	// InvalidUTF8Error causes encoding to stop and return a
	// *json.UnsupportedValueError that names the byte offset and
	// value of the invalid byte.
	InvalidUTF8Error
)
    26  
// BackslashEscapeMode is described in the main lowmemjson package
// docs.
//
// In this file it is the value that tells WriteStringChar how a
// single rune of a JSON string should be written.
type BackslashEscapeMode uint8

const (
	// BackslashEscapeNone asks for the rune to be written
	// literally.  WriteStringChar still escapes `"`, `\`, and
	// runes below U+0020 when given this mode.
	BackslashEscapeNone BackslashEscapeMode = iota
	// BackslashEscapeShort asks for a two-byte escape: a backslash
	// followed by one character (for example `\n`).
	// WriteStringChar only honors it for runes that have such an
	// escape.
	BackslashEscapeShort
	// BackslashEscapeRawByte asks for the rune's value to be
	// written as one byte.  WriteStringChar panics unless the rune
	// is in the range 0x80-0xFF.
	BackslashEscapeRawByte

	// It is significant to the implementation that if X=binary-0
	// and x=binary-1, then these "BackslashEscapeUnicode"
	// constants are counting in-order from 0 to 15.
	//
	// WriteStringUnicodeEscape subtracts BackslashEscapeUnicodeMin
	// from the mode and reads the result as 4 bits, one per hex
	// digit of the `\uXXXX` escape (most-significant bit for the
	// first digit); a set bit makes that digit lowercase.

	BackslashEscapeUnicodeXXXX
	BackslashEscapeUnicodeXXXx
	BackslashEscapeUnicodeXXxX
	BackslashEscapeUnicodeXXxx
	BackslashEscapeUnicodeXxXX
	BackslashEscapeUnicodeXxXx
	BackslashEscapeUnicodeXxxX
	BackslashEscapeUnicodeXxxx
	BackslashEscapeUnicodexXXX
	BackslashEscapeUnicodexXXx
	BackslashEscapeUnicodexXxX
	BackslashEscapeUnicodexXxx
	BackslashEscapeUnicodexxXX
	BackslashEscapeUnicodexxXx
	BackslashEscapeUnicodexxxX
	BackslashEscapeUnicodexxxx

	// BackslashEscapeUnicodeMin and BackslashEscapeUnicodeMax are
	// the first and last of the sixteen "BackslashEscapeUnicode"
	// constants above; WriteStringChar uses them as an inclusive
	// range check.
	BackslashEscapeUnicodeMin = BackslashEscapeUnicodeXXXX
	BackslashEscapeUnicodeMax = BackslashEscapeUnicodexxxx

	BackslashEscapeUnicode = BackslashEscapeUnicodexxxx // back-compat name; all four hex digits lowercase
)
    62  
// BackslashEscaper is described in the main lowmemjson package docs.
//
// As used in this file, it is called once for each rune of the string
// being encoded, receiving the rune and a suggested
// BackslashEscapeMode; the mode it returns is what gets passed to
// WriteStringChar for that rune.
type BackslashEscaper = func(rune, BackslashEscapeMode) BackslashEscapeMode
    65  
    66  func WriteStringUnicodeEscape(w io.Writer, c rune, mode BackslashEscapeMode) error {
    67  	const alphabet = "0123456789ABCDEF"
    68  	_mode := byte(mode - BackslashEscapeUnicodeMin)
    69  	buf := [6]byte{
    70  		'\\',
    71  		'u',
    72  		// The 0b0010_0000 bit is the ASCII "lowercase bit".
    73  		alphabet[(c>>12)&0xf] | ((_mode << 2) & 0b0010_0000),
    74  		alphabet[(c>>8)&0xf] | ((_mode << 3) & 0b0010_0000),
    75  		alphabet[(c>>4)&0xf] | ((_mode << 4) & 0b0010_0000),
    76  		alphabet[(c>>0)&0xf] | ((_mode << 5) & 0b0010_0000),
    77  	}
    78  	_, err := noescape.Write(w, buf[:])
    79  	return err
    80  }
    81  
    82  func writeStringShortEscape(w io.Writer, c rune) error {
    83  	var b byte
    84  	switch c {
    85  	case '"', '\\', '/':
    86  		b = byte(c)
    87  	case '\b':
    88  		b = 'b'
    89  	case '\f':
    90  		b = 'f'
    91  	case '\n':
    92  		b = 'n'
    93  	case '\r':
    94  		b = 'r'
    95  	case '\t':
    96  		b = 't'
    97  	default:
    98  		panic(fmt.Errorf("should not happen: writeStringShortEscape called with invalid rune: %q", c))
    99  	}
   100  	buf := [2]byte{'\\', b}
   101  	_, err := noescape.Write(w, buf[:])
   102  	return err
   103  }
   104  
   105  func WriteStringChar(w fastio.AllWriter, c rune, escape BackslashEscapeMode) error {
   106  	switch escape {
   107  	case BackslashEscapeNone:
   108  		switch {
   109  		case c < 0x0020: // override, gotta escape these
   110  			switch c {
   111  			case '\b', '\f', '\n', '\r', '\t': // short-escape if possible
   112  				return writeStringShortEscape(w, c)
   113  			default:
   114  				return WriteStringUnicodeEscape(w, c, BackslashEscapeUnicode)
   115  			}
   116  		case c == '"' || c == '\\': // override, gotta escape these
   117  			return writeStringShortEscape(w, c)
   118  		default: // obey
   119  			_, err := w.WriteRune(c)
   120  			return err
   121  		}
   122  	case BackslashEscapeShort:
   123  		switch c {
   124  		case '"', '\\', '/', '\b', '\f', '\n', '\r', '\t': // obey
   125  			return writeStringShortEscape(w, c)
   126  		default: // override, can't short-escape these
   127  			_, err := w.WriteRune(c)
   128  			return err
   129  		}
   130  	case BackslashEscapeRawByte:
   131  		switch {
   132  		case c < utf8.RuneSelf:
   133  			panic(fmt.Errorf("escaper returned BackslashEscapeRawByte for a character=%q < utf8.RuneSelf", c))
   134  		case c > 0xFF:
   135  			panic(fmt.Errorf("escaper returned BackslashEscapeRawByte for a character=%q > 0xFF", c))
   136  		default:
   137  			return w.WriteByte(byte(c))
   138  		}
   139  	default:
   140  		if BackslashEscapeUnicodeMin <= escape && escape <= BackslashEscapeUnicodeMax {
   141  			switch {
   142  			case c > 0xFFFF: // override, can't escape these (TODO: unless we use UTF-16 surrogates?)
   143  				_, err := w.WriteRune(c)
   144  				return err
   145  			default: // obey
   146  				return WriteStringUnicodeEscape(w, c, escape)
   147  			}
   148  		}
   149  		panic(fmt.Errorf("escaper returned an invalid escape mode=%d", escape))
   150  	}
   151  }
   152  
   153  func EncodeStringFromString(w fastio.AllWriter, escaper BackslashEscaper, utf InvalidUTF8Mode, val reflect.Value, str string) error {
   154  	if err := w.WriteByte('"'); err != nil {
   155  		return err
   156  	}
   157  	for i := 0; i < len(str); {
   158  		escaped := BackslashEscapeNone
   159  		c, size := utf8.DecodeRuneInString(str[i:])
   160  		if c == utf8.RuneError && size == 1 {
   161  			switch utf {
   162  			case InvalidUTF8Replace:
   163  				escaped = BackslashEscapeUnicode
   164  			case InvalidUTF8Preserve:
   165  				escaped = BackslashEscapeRawByte
   166  				c = rune(str[i])
   167  			case InvalidUTF8Error:
   168  				return &json.UnsupportedValueError{
   169  					Value: val,
   170  					Str:   fmt.Sprintf("invalid UTF-8 at byte offset %d: %#02x", i, str[i]),
   171  				}
   172  			}
   173  		}
   174  		if err := WriteStringChar(w, c, escaper(c, escaped)); err != nil {
   175  			return err
   176  		}
   177  		i += size
   178  	}
   179  	if err := w.WriteByte('"'); err != nil {
   180  		return err
   181  	}
   182  	return nil
   183  }
   184  
   185  func EncodeStringFromBytes(w fastio.AllWriter, escaper BackslashEscaper, utf InvalidUTF8Mode, val reflect.Value, str []byte) error {
   186  	if err := w.WriteByte('"'); err != nil {
   187  		return err
   188  	}
   189  	for i := 0; i < len(str); {
   190  		escaped := BackslashEscapeNone
   191  		c, size := utf8.DecodeRune(str[i:])
   192  		if c == utf8.RuneError && size == 1 {
   193  			switch utf {
   194  			case InvalidUTF8Replace:
   195  				escaped = BackslashEscapeUnicode
   196  			case InvalidUTF8Preserve:
   197  				escaped = BackslashEscapeRawByte
   198  				c = rune(str[i])
   199  			case InvalidUTF8Error:
   200  				return &json.UnsupportedValueError{
   201  					Value: val,
   202  					Str:   fmt.Sprintf("invalid UTF-8 at byte offset %d: %#02x", i, str[i]),
   203  				}
   204  			}
   205  		}
   206  		if err := WriteStringChar(w, c, escaper(c, escaped)); err != nil {
   207  			return err
   208  		}
   209  		i += size
   210  	}
   211  	if err := w.WriteByte('"'); err != nil {
   212  		return err
   213  	}
   214  	return nil
   215  }