git.lukeshu.com/go/lowmemjson@v0.3.9-0.20230723050957-72f6d13f6fb2/reencode.go (about)

     1  // Copyright (C) 2022-2023  Luke Shumaker <lukeshu@lukeshu.com>
     2  //
     3  // SPDX-License-Identifier: GPL-2.0-or-later
     4  
     5  package lowmemjson
     6  
     7  import (
     8  	"fmt"
     9  	"io"
    10  	"unicode/utf8"
    11  
    12  	"git.lukeshu.com/go/lowmemjson/internal/fastio"
    13  	"git.lukeshu.com/go/lowmemjson/internal/jsonparse"
    14  )
    15  
// A ReEncoderConfig controls how a ReEncoder should behave.
type ReEncoderConfig struct {
	// A JSON document is specified to be a single JSON element;
	// but it is often desirable to handle streams of multiple
	// JSON elements.
	AllowMultipleValues bool

	// Whether to minify the JSON.
	//
	// Trims all whitespace, except that it emits a newline
	// between two *number* top-level values (or puts a newline
	// after all top-level values if ForceTrailingNewlines).
	//
	// Trims superfluous 0s from numbers.
	//
	// NOTE(review): NewReEncoder only installs the
	// number-compacting module when CompactFloats is set; confirm
	// whether Compact on its own still trims 0s from numbers.
	Compact bool

	// CompactIfUnder causes the *ReEncoder to behave as if
	// Compact=true for individual elements if doing so would
	// cause that element to be under this number of bytes.
	//
	// Has no effect if Compact is true or Indent is empty.
	//
	// This has O(2^min(CompactIfUnder, depth)) time overhead, so
	// set with caution.
	CompactIfUnder int

	// String to use to indent; ignored if Compact is true.
	//
	// Newlines are emitted *between* top-level values; a newline is
	// not emitted after the *last* top-level value (unless
	// ForceTrailingNewlines is on).
	Indent string

	// String to put before indents.
	Prefix string

	// Whether to emit a newline after each top-level value.  See
	// the comments on Compact and Indent for discussion of how
	// this is different than the usual behavior.
	ForceTrailingNewlines bool

	// CompactFloats causes the *ReEncoder to trim unnecessary '0'
	// digits from floating-point number values.
	CompactFloats bool

	// A JSON document is specified to be a sequence of Unicode
	// codepoints; InvalidUTF8 controls how the *ReEncoder behaves
	// when it encounters invalid UTF-8 bytes in a JSON string
	// (i.e. the string is not representable as a sequence of
	// Unicode codepoints, and thus the document is invalid JSON).
	InvalidUTF8 InvalidUTF8Mode

	// Returns whether a given character in a string should be
	// backslash-escaped.  The bool argument is whether it was
	// \u-escaped in the input.  This does not affect characters
	// that must or must-not be escaped to be valid JSON.
	//
	// If not set, then EscapeDefault is used.
	BackslashEscape BackslashEscaper
}
    76  
    77  // NewReEncoder returns a new ReEncoder instance.
    78  //
    79  // A ReEncoder tends to make many small writes; if Out.Write
    80  // calls are syscalls, then you may want to wrap Out in a
    81  // bufio.Writer.
    82  func NewReEncoder(out io.Writer, cfg ReEncoderConfig) *ReEncoder {
    83  	var module reEncoderModule
    84  
    85  	// Basic
    86  	module = &reEncodeWrite{
    87  		out: fastio.NewAllWriter(out),
    88  	}
    89  
    90  	// Whitespace
    91  	if cfg.ForceTrailingNewlines {
    92  		module = &reEncodeForceNL{
    93  			out: module,
    94  		}
    95  	}
    96  	switch {
    97  	case cfg.Compact:
    98  		module = &reEncodeCompactWS{
    99  			out: module,
   100  		}
   101  	case cfg.Indent != "":
   102  		if cfg.CompactIfUnder > 0 {
   103  			module = &reEncodeCompactWSIfUnder{
   104  				out:              module,
   105  				CompactWSIfUnder: cfg.CompactIfUnder,
   106  			}
   107  		}
   108  		module = &reEncodeIndent{
   109  			out:    module,
   110  			Indent: cfg.Indent,
   111  			Prefix: cfg.Prefix,
   112  		}
   113  	}
   114  
   115  	// Numbers
   116  	if cfg.CompactFloats {
   117  		module = &reEncodeCompactNum{
   118  			out: module,
   119  		}
   120  	}
   121  
   122  	// Strings
   123  	escaper := cfg.BackslashEscape
   124  	if escaper == nil {
   125  		escaper = EscapeDefault
   126  	}
   127  	module = &reEncodeString{
   128  		out:             module,
   129  		BackslashEscape: escaper,
   130  	}
   131  
   132  	return &ReEncoder{
   133  		out:                 module,
   134  		esc:                 escaper,
   135  		utf:                 cfg.InvalidUTF8,
   136  		allowMultipleValues: cfg.AllowMultipleValues,
   137  	}
   138  }
   139  
// A ReEncoder takes a stream of JSON elements (by way of implementing
// io.Writer, io.StringWriter, io.ByteWriter, and WriteRune), and
// re-encodes the JSON, writing it to the io.Writer that was passed
// to NewReEncoder.
//
// This is useful for prettifying, minifying, sanitizing, and/or
// validating JSON.
//
// The memory use of a ReEncoder is O(CompactIfUnder+depth).
type ReEncoder struct {
	// configuration, set once by NewReEncoder
	out                 reEncoderModule // head of the processing pipeline
	esc                 BackslashEscaper
	utf                 InvalidUTF8Mode
	allowMultipleValues bool

	// state: .Write's/.WriteString's/.WriteRune's utf8-decoding buffer
	//
	// buf[:bufLen] holds the bytes of an incomplete UTF-8 sequence
	// left over from the end of the previous write.
	buf    [utf8.UTFMax]byte
	bufLen int

	// state: contract between the public API and .handleRune
	err      error            // set by .handleRune (and .Close) on failure
	par      jsonparse.Parser // classifies each incoming rune
	inputPos int64            // byte offset used in reported errors

	// state: .pushWriteBarrier and .popWriteBarrier
	barriers []barrier

	// state: .handleRuneType
	//
	// The first three hex digits of a "\uABCD" escape, held until
	// the fourth digit arrives and the character can be decoded.
	uhex [3]byte // "\uABCD"-encoded characters in strings
}
   169  
// A barrier records the encoder state that .pushWriteBarrier saves
// and that .popWriteBarrier / .stackSize later consult.
type barrier struct {
	// inputPos is the value of ReEncoder.inputPos at the time of the
	// push; it is added back to the encoder's inputPos on pop.
	inputPos int64
	// stackSize is the value of ReEncoder.stackSize() at the time of
	// the push; while this is the innermost barrier it is added to
	// the parser's own stack size.
	stackSize int
}
   174  
// A reEncoderModule is one stage of the ReEncoder's processing
// pipeline.  NewReEncoder chains modules together, each one holding
// the next stage to forward to.
type reEncoderModule interface {
	// HandleRune receives one rune c of parser-assigned type t,
	// together with how it was (or was not) backslash-escaped and
	// the stack size reported by the ReEncoder at that point.
	HandleRune(c rune, t jsonparse.RuneType, escape BackslashEscapeMode, stackSize int) error
	// PopWriteBarrier is invoked by ReEncoder.popWriteBarrier after
	// it has removed its innermost barrier.
	PopWriteBarrier()
}
   179  
   180  // public API //////////////////////////////////////////////////////////////////
   181  
// Compile-time assertions that *ReEncoder satisfies the interfaces it
// is documented to implement.
var (
	_ fastio.AllWriter = (*ReEncoder)(nil)
	_ io.Closer        = (*ReEncoder)(nil)
)
   186  
   187  func (enc *ReEncoder) getRuneFromBytes(str []byte, pos int) (c rune, size int, full, isRune bool) {
   188  	var tmp []byte
   189  	if pos < enc.bufLen {
   190  		var buf [utf8.UTFMax]byte
   191  		n := copy(buf[:], enc.buf[pos:enc.bufLen])
   192  		n += copy(buf[n:], str)
   193  		tmp = buf[:n]
   194  	} else {
   195  		tmp = str[pos-enc.bufLen:]
   196  	}
   197  	c, size = utf8.DecodeRune(tmp)
   198  	switch {
   199  	case c == utf8.RuneError && size <= 1 && !utf8.FullRune(tmp):
   200  		return c, size, false, true
   201  	case c == utf8.RuneError && size == 1 && enc.utf != InvalidUTF8Replace:
   202  		return rune(tmp[0]), 1, true, false
   203  	default:
   204  		return c, size, true, true
   205  	}
   206  }
   207  
   208  func (enc *ReEncoder) getRuneFromString(str string, pos int) (c rune, size int, full, isRune bool) {
   209  	if pos < enc.bufLen {
   210  		var buf [utf8.UTFMax]byte
   211  		var tmp []byte
   212  		n := copy(buf[:], enc.buf[pos:enc.bufLen])
   213  		n += copy(buf[n:], str)
   214  		tmp = buf[:n]
   215  		c, size = utf8.DecodeRune(tmp)
   216  		switch {
   217  		case c == utf8.RuneError && size <= 1 && !utf8.FullRune(tmp):
   218  			return c, size, false, true
   219  		case c == utf8.RuneError && size == 1 && enc.utf != InvalidUTF8Replace:
   220  			return rune(tmp[0]), 1, true, false
   221  		default:
   222  			return c, size, true, true
   223  		}
   224  	} else {
   225  		tmp := str[pos-enc.bufLen:]
   226  		c, size := utf8.DecodeRuneInString(tmp)
   227  		switch {
   228  		case c == utf8.RuneError && size <= 1 && !utf8.FullRuneInString(tmp):
   229  			return c, size, false, true
   230  		case c == utf8.RuneError && size == 1 && enc.utf != InvalidUTF8Replace:
   231  			return rune(tmp[0]), 1, true, false
   232  		default:
   233  			return c, size, true, true
   234  		}
   235  	}
   236  }
   237  
   238  // Write implements io.Writer; it does what you'd expect.
   239  //
   240  // It is worth noting that Write returns the number of bytes consumed
   241  // from p, not number of bytes written to the output stream.  This
   242  // distinction that most io.Writer implementations don't need to make,
   243  // but *ReEncoder does because it transforms the data written to it,
   244  // and the number of bytes written may be wildly different than the
   245  // number of bytes handled.
   246  //
   247  //nolint:dupl // Yes, this is mostly a duplicate of .WriteString().
   248  func (enc *ReEncoder) Write(str []byte) (int, error) {
   249  	if len(str) == 0 {
   250  		return 0, nil
   251  	}
   252  	origBufLen := enc.bufLen
   253  	var n int
   254  	for {
   255  		c, size, full, isRune := enc.getRuneFromBytes(str, n)
   256  		if !full {
   257  			if n < enc.bufLen {
   258  				l := copy(enc.buf[:], enc.buf[n:enc.bufLen])
   259  				l += copy(enc.buf[l:], str)
   260  				enc.bufLen = l
   261  			} else {
   262  				enc.bufLen = copy(enc.buf[:], str[n-enc.bufLen:])
   263  			}
   264  			return len(str), nil
   265  		}
   266  		if enc.utf == InvalidUTF8Error && !isRune {
   267  			return n - origBufLen, &ReEncodeSyntaxError{
   268  				Offset: enc.inputPos,
   269  				Err:    fmt.Errorf("invalid UTF-8: %#02x", c),
   270  			}
   271  		}
   272  		enc.handleRune(c, size, isRune)
   273  		if enc.err != nil {
   274  			return n - origBufLen, enc.err
   275  		}
   276  		n += size
   277  	}
   278  }
   279  
   280  // WriteString implements io.StringWriter; it does what you'd expect,
   281  // but see the notes on the Write method.
   282  //
   283  //nolint:dupl // Yes, this is mostly a duplicate of .Write().
   284  func (enc *ReEncoder) WriteString(str string) (int, error) {
   285  	if len(str) == 0 {
   286  		return 0, nil
   287  	}
   288  	origBufLen := enc.bufLen
   289  	var n int
   290  	for {
   291  		c, size, full, isRune := enc.getRuneFromString(str, n)
   292  		if !full {
   293  			if n < enc.bufLen {
   294  				l := copy(enc.buf[:], enc.buf[n:enc.bufLen])
   295  				l += copy(enc.buf[l:], str)
   296  				enc.bufLen = l
   297  			} else {
   298  				enc.bufLen = copy(enc.buf[:], str[n-enc.bufLen:])
   299  			}
   300  			return len(str), nil
   301  		}
   302  		if enc.utf == InvalidUTF8Error && !isRune {
   303  			return n - origBufLen, &ReEncodeSyntaxError{
   304  				Offset: enc.inputPos,
   305  				Err:    fmt.Errorf("invalid UTF-8: %#02x", c),
   306  			}
   307  		}
   308  		enc.handleRune(c, size, isRune)
   309  		if enc.err != nil {
   310  			return n - origBufLen, enc.err
   311  		}
   312  		n += size
   313  	}
   314  }
   315  
// WriteByte implements io.ByteWriter; it does what you'd expect.
//
// It delegates to fastio.WriteByte with the encoder itself as the
// destination (presumably ending up in Write or WriteString; confirm
// against the fastio package).
func (enc *ReEncoder) WriteByte(b byte) error {
	return fastio.WriteByte(enc, b)
}
   320  
// WriteRune does what you'd expect: it feeds the single rune c to the
// encoder.
//
// It delegates to fastio.WriteRune with the encoder itself as the
// destination and returns that call's results unchanged (n is
// presumably the number of bytes in c's encoding; confirm against the
// fastio package).
func (enc *ReEncoder) WriteRune(c rune) (n int, err error) {
	return fastio.WriteRune(enc, c)
}
   325  
   326  // Close implements io.Closer; it does what you'd expect, mostly.
   327  //
   328  // The *ReEncoder may continue to be written to with new JSON values
   329  // if enc.AllowMultipleValues is set.
   330  func (enc *ReEncoder) Close() error {
   331  	if enc.bufLen > 0 {
   332  		if enc.utf == InvalidUTF8Error {
   333  			return &ReEncodeSyntaxError{
   334  				Offset: enc.inputPos,
   335  				Err:    fmt.Errorf("truncated UTF-8: %q", enc.buf[:enc.bufLen]),
   336  			}
   337  		}
   338  		for i := 0; i < enc.bufLen; i++ {
   339  			if enc.utf == InvalidUTF8Replace {
   340  				enc.handleRune(utf8.RuneError, 1, true)
   341  			} else {
   342  				enc.handleRune(rune(enc.buf[i]), 1, false)
   343  			}
   344  			if enc.err != nil {
   345  				return enc.err
   346  			}
   347  		}
   348  	}
   349  	if _, err := enc.par.HandleEOF(); err != nil {
   350  		enc.err = &ReEncodeSyntaxError{
   351  			Err:    err,
   352  			Offset: enc.inputPos,
   353  		}
   354  		return enc.err
   355  	}
   356  	if len(enc.barriers) == 0 {
   357  		if err := enc.handleRuneType(0, jsonparse.RuneTypeEOF, enc.stackSize(), true); err != nil {
   358  			enc.err = &ReEncodeWriteError{
   359  				Err:    err,
   360  				Offset: enc.inputPos,
   361  			}
   362  			return enc.err
   363  		}
   364  		if enc.allowMultipleValues {
   365  			enc.par.Reset()
   366  		}
   367  	}
   368  	return nil
   369  }
   370  
   371  // isRune=false indicates that 'c' is a raw byte from invalid UTF-8.
   372  func (enc *ReEncoder) handleRune(c rune, size int, isRune bool) {
   373  	t, err := enc.par.HandleRune(c, isRune)
   374  	if err != nil {
   375  		enc.err = &ReEncodeSyntaxError{
   376  			Err:    err,
   377  			Offset: enc.inputPos,
   378  		}
   379  		return
   380  	}
   381  	if err := enc.handleRuneType(c, t, enc.stackSize(), isRune); err != nil {
   382  		enc.err = &ReEncodeWriteError{
   383  			Err:    err,
   384  			Offset: enc.inputPos,
   385  		}
   386  		return
   387  	}
   388  	if t == jsonparse.RuneTypeEOF {
   389  		if len(enc.barriers) == 0 {
   390  			panic(fmt.Errorf("should not happen: EOF for rune %q without write barriers", c))
   391  		}
   392  		enc.err = &ReEncodeSyntaxError{
   393  			Err:    fmt.Errorf("invalid character %q after top-level value", c),
   394  			Offset: enc.inputPos,
   395  		}
   396  		return
   397  	}
   398  
   399  	enc.inputPos += int64(size)
   400  }
   401  
   402  // semi-public API /////////////////////////////////////////////////////////////
   403  
   404  func (enc *ReEncoder) pushWriteBarrier() {
   405  	enc.barriers = append(enc.barriers, barrier{
   406  		inputPos:  enc.inputPos,
   407  		stackSize: enc.stackSize(),
   408  	})
   409  	enc.par.PushWriteBarrier()
   410  	enc.inputPos = 0
   411  }
   412  
   413  func (enc *ReEncoder) popWriteBarrier() {
   414  	enc.par.PopBarrier()
   415  	enc.inputPos += enc.barriers[len(enc.barriers)-1].inputPos
   416  	enc.barriers = enc.barriers[:len(enc.barriers)-1]
   417  	enc.out.PopWriteBarrier()
   418  }
   419  
   420  // internal ////////////////////////////////////////////////////////////////////
   421  
   422  func (enc *ReEncoder) stackSize() int {
   423  	sz := enc.par.StackSize()
   424  	if len(enc.barriers) > 0 {
   425  		sz += enc.barriers[len(enc.barriers)-1].stackSize
   426  	}
   427  	return sz
   428  }
   429  
   430  func (enc *ReEncoder) handleRuneType(c rune, t jsonparse.RuneType, stackSize int, isRune bool) error {
   431  	switch t {
   432  	case jsonparse.RuneTypeStringEsc, jsonparse.RuneTypeStringEscU:
   433  		return nil
   434  	case jsonparse.RuneTypeStringEsc1:
   435  		switch c {
   436  		case '"', '\\', '/':
   437  			// self
   438  		case 'b':
   439  			c = '\b'
   440  		case 'f':
   441  			c = '\f'
   442  		case 'n':
   443  			c = '\n'
   444  		case 'r':
   445  			c = '\r'
   446  		case 't':
   447  			c = '\t'
   448  		default:
   449  			panic(fmt.Errorf("should not happen: rune %q is not a RuneTypeStringEsc1", c))
   450  		}
   451  		return enc.out.HandleRune(c, jsonparse.RuneTypeStringChar, BackslashEscapeShort, stackSize)
   452  	case jsonparse.RuneTypeStringEscUA:
   453  		enc.uhex[0] = byte(c)
   454  		return nil
   455  	case jsonparse.RuneTypeStringEscUB:
   456  		enc.uhex[1] = byte(c)
   457  		return nil
   458  	case jsonparse.RuneTypeStringEscUC:
   459  		enc.uhex[2] = byte(c)
   460  		return nil
   461  	case jsonparse.RuneTypeStringEscUD:
   462  		mode := hexToMode(enc.uhex[0], enc.uhex[1], enc.uhex[2], byte(c))
   463  		c = hexToRune(enc.uhex[0], enc.uhex[1], enc.uhex[2], byte(c))
   464  		return enc.out.HandleRune(c, jsonparse.RuneTypeStringChar, mode, stackSize)
   465  	case jsonparse.RuneTypeError:
   466  		panic(fmt.Errorf("should not happen: handleRune called with %#v", t))
   467  	default:
   468  		if t > jsonparse.RuneTypeEOF {
   469  			panic(fmt.Errorf("should not happen: handleRune called with %#v", t))
   470  		}
   471  		esc := BackslashEscapeNone
   472  		if !isRune {
   473  			esc = BackslashEscapeRawByte
   474  		}
   475  		return enc.out.HandleRune(c, t, esc, stackSize)
   476  	}
   477  }