git.lukeshu.com/go/lowmemjson@v0.3.9-0.20230723050957-72f6d13f6fb2/reencode.go (about) 1 // Copyright (C) 2022-2023 Luke Shumaker <lukeshu@lukeshu.com> 2 // 3 // SPDX-License-Identifier: GPL-2.0-or-later 4 5 package lowmemjson 6 7 import ( 8 "fmt" 9 "io" 10 "unicode/utf8" 11 12 "git.lukeshu.com/go/lowmemjson/internal/fastio" 13 "git.lukeshu.com/go/lowmemjson/internal/jsonparse" 14 ) 15 16 // A ReEncoderConfig controls how a ReEncoder should behave. 17 type ReEncoderConfig struct { 18 // A JSON document is specified to be a single JSON element; 19 // but it is often desirable to handle streams of multiple 20 // JSON elements. 21 AllowMultipleValues bool 22 23 // Whether to minify the JSON. 24 // 25 // Trims all whitespace, except that it emits a newline 26 // between two *number* top-level values (or puts a newline 27 // after all top-level values if ForceTrailingNewlines). 28 // 29 // Trims superflous 0s from numbers. 30 Compact bool 31 32 // CompactIfUnder causes the *ReEncoder to behave as if 33 // Compact=true for individual elements if doing so would 34 // cause that element to be under this number of bytes. 35 // 36 // Has no affect if Compact is true or Indent is empty. 37 // 38 // This has O(2^min(CompactIfUnder, depth)) time overhead, so 39 // set with caution. 40 CompactIfUnder int 41 42 // String to use to indent; ignored if Compact is true. 43 // 44 // Newlines are emitted *between* top-level values; a newline is 45 // not emitted after the *last* top-level value (unless 46 // ForceTrailingNewlines is on). 47 Indent string 48 49 // String to put before indents. 50 Prefix string 51 52 // Whether to emit a newline after each top-level value. See 53 // the comments on Compact and Indent for discussion of how 54 // this is different than the usual behavior. 55 ForceTrailingNewlines bool 56 57 // CompactFloats causes the *ReEncoder to trim unnecessary '0' 58 // digits from floating-point number values. 59 CompactFloats bool 60 61 // A JSON document is specified to be a sequence of Unicode 62 // codepoints; InvalidUTF8 controls how the *ReEncoder behaves 63 // when it encounters invalid UTF-8 bytes in a JSON string 64 // (i.e. the string is not representable as a sequence of 65 // Unicode codepoints, and thus the document is invalid JSON). 66 InvalidUTF8 InvalidUTF8Mode 67 68 // Returns whether a given character in a string should be 69 // backslash-escaped. The bool argument is whether it was 70 // \u-escaped in the input. This does not affect characters 71 // that must or must-not be escaped to be valid JSON. 72 // 73 // If not set, then EscapeDefault is used. 74 BackslashEscape BackslashEscaper 75 } 76 77 // NewReEncoder returns a new ReEncoder instance. 78 // 79 // A ReEncoder tends to make many small writes; if Out.Write 80 // calls are syscalls, then you may want to wrap Out in a 81 // bufio.Writer. 82 func NewReEncoder(out io.Writer, cfg ReEncoderConfig) *ReEncoder { 83 var module reEncoderModule 84 85 // Basic 86 module = &reEncodeWrite{ 87 out: fastio.NewAllWriter(out), 88 } 89 90 // Whitespace 91 if cfg.ForceTrailingNewlines { 92 module = &reEncodeForceNL{ 93 out: module, 94 } 95 } 96 switch { 97 case cfg.Compact: 98 module = &reEncodeCompactWS{ 99 out: module, 100 } 101 case cfg.Indent != "": 102 if cfg.CompactIfUnder > 0 { 103 module = &reEncodeCompactWSIfUnder{ 104 out: module, 105 CompactWSIfUnder: cfg.CompactIfUnder, 106 } 107 } 108 module = &reEncodeIndent{ 109 out: module, 110 Indent: cfg.Indent, 111 Prefix: cfg.Prefix, 112 } 113 } 114 115 // Numbers 116 if cfg.CompactFloats { 117 module = &reEncodeCompactNum{ 118 out: module, 119 } 120 } 121 122 // Strings 123 escaper := cfg.BackslashEscape 124 if escaper == nil { 125 escaper = EscapeDefault 126 } 127 module = &reEncodeString{ 128 out: module, 129 BackslashEscape: escaper, 130 } 131 132 return &ReEncoder{ 133 out: module, 134 esc: escaper, 135 utf: cfg.InvalidUTF8, 136 allowMultipleValues: cfg.AllowMultipleValues, 137 } 138 } 139 140 // A ReEncoder takes a stream of JSON elements (by way of implementing 141 // io.Writer, io.StringWriter, io.ByteWriter, and WriteRune), and 142 // re-encodes the JSON, writing it to the .Out member. 143 // 144 // This is useful for prettifying, minifying, sanitizing, and/or 145 // validating JSON. 146 // 147 // The memory use of a ReEncoder is O(CompactIfUnder+depth). 148 type ReEncoder struct { 149 out reEncoderModule 150 esc BackslashEscaper 151 utf InvalidUTF8Mode 152 allowMultipleValues bool 153 154 // state: .Write's/.WriteString's/.WriteRune's utf8-decoding buffer 155 buf [utf8.UTFMax]byte 156 bufLen int 157 158 // state: contract between the public API and .handleRune 159 err error 160 par jsonparse.Parser 161 inputPos int64 162 163 // state: .pushWriteBarrier and .popWriteBarrier 164 barriers []barrier 165 166 // state: .handleRuneType 167 uhex [3]byte // "\uABCD"-encoded characters in strings 168 } 169 170 type barrier struct { 171 inputPos int64 172 stackSize int 173 } 174 175 type reEncoderModule interface { 176 HandleRune(c rune, t jsonparse.RuneType, escape BackslashEscapeMode, stackSize int) error 177 PopWriteBarrier() 178 } 179 180 // public API ////////////////////////////////////////////////////////////////// 181 182 var ( 183 _ fastio.AllWriter = (*ReEncoder)(nil) 184 _ io.Closer = (*ReEncoder)(nil) 185 ) 186 187 func (enc *ReEncoder) getRuneFromBytes(str []byte, pos int) (c rune, size int, full, isRune bool) { 188 var tmp []byte 189 if pos < enc.bufLen { 190 var buf [utf8.UTFMax]byte 191 n := copy(buf[:], enc.buf[pos:enc.bufLen]) 192 n += copy(buf[n:], str) 193 tmp = buf[:n] 194 } else { 195 tmp = str[pos-enc.bufLen:] 196 } 197 c, size = utf8.DecodeRune(tmp) 198 switch { 199 case c == utf8.RuneError && size <= 1 && !utf8.FullRune(tmp): 200 return c, size, false, true 201 case c == utf8.RuneError && size == 1 && enc.utf != InvalidUTF8Replace: 202 return rune(tmp[0]), 1, true, false 203 default: 204 return c, size, true, true 205 } 206 } 207 208 func (enc *ReEncoder) getRuneFromString(str string, pos int) (c rune, size int, full, isRune bool) { 209 if pos < enc.bufLen { 210 var buf [utf8.UTFMax]byte 211 var tmp []byte 212 n := copy(buf[:], enc.buf[pos:enc.bufLen]) 213 n += copy(buf[n:], str) 214 tmp = buf[:n] 215 c, size = utf8.DecodeRune(tmp) 216 switch { 217 case c == utf8.RuneError && size <= 1 && !utf8.FullRune(tmp): 218 return c, size, false, true 219 case c == utf8.RuneError && size == 1 && enc.utf != InvalidUTF8Replace: 220 return rune(tmp[0]), 1, true, false 221 default: 222 return c, size, true, true 223 } 224 } else { 225 tmp := str[pos-enc.bufLen:] 226 c, size := utf8.DecodeRuneInString(tmp) 227 switch { 228 case c == utf8.RuneError && size <= 1 && !utf8.FullRuneInString(tmp): 229 return c, size, false, true 230 case c == utf8.RuneError && size == 1 && enc.utf != InvalidUTF8Replace: 231 return rune(tmp[0]), 1, true, false 232 default: 233 return c, size, true, true 234 } 235 } 236 } 237 238 // Write implements io.Writer; it does what you'd expect. 239 // 240 // It is worth noting that Write returns the number of bytes consumed 241 // from p, not number of bytes written to the output stream. This 242 // distinction that most io.Writer implementations don't need to make, 243 // but *ReEncoder does because it transforms the data written to it, 244 // and the number of bytes written may be wildly different than the 245 // number of bytes handled. 246 // 247 //nolint:dupl // Yes, this is mostly a duplicate of .WriteString(). 248 func (enc *ReEncoder) Write(str []byte) (int, error) { 249 if len(str) == 0 { 250 return 0, nil 251 } 252 origBufLen := enc.bufLen 253 var n int 254 for { 255 c, size, full, isRune := enc.getRuneFromBytes(str, n) 256 if !full { 257 if n < enc.bufLen { 258 l := copy(enc.buf[:], enc.buf[n:enc.bufLen]) 259 l += copy(enc.buf[l:], str) 260 enc.bufLen = l 261 } else { 262 enc.bufLen = copy(enc.buf[:], str[n-enc.bufLen:]) 263 } 264 return len(str), nil 265 } 266 if enc.utf == InvalidUTF8Error && !isRune { 267 return n - origBufLen, &ReEncodeSyntaxError{ 268 Offset: enc.inputPos, 269 Err: fmt.Errorf("invalid UTF-8: %#02x", c), 270 } 271 } 272 enc.handleRune(c, size, isRune) 273 if enc.err != nil { 274 return n - origBufLen, enc.err 275 } 276 n += size 277 } 278 } 279 280 // WriteString implements io.StringWriter; it does what you'd expect, 281 // but see the notes on the Write method. 282 // 283 //nolint:dupl // Yes, this is mostly a duplicate of .Write(). 284 func (enc *ReEncoder) WriteString(str string) (int, error) { 285 if len(str) == 0 { 286 return 0, nil 287 } 288 origBufLen := enc.bufLen 289 var n int 290 for { 291 c, size, full, isRune := enc.getRuneFromString(str, n) 292 if !full { 293 if n < enc.bufLen { 294 l := copy(enc.buf[:], enc.buf[n:enc.bufLen]) 295 l += copy(enc.buf[l:], str) 296 enc.bufLen = l 297 } else { 298 enc.bufLen = copy(enc.buf[:], str[n-enc.bufLen:]) 299 } 300 return len(str), nil 301 } 302 if enc.utf == InvalidUTF8Error && !isRune { 303 return n - origBufLen, &ReEncodeSyntaxError{ 304 Offset: enc.inputPos, 305 Err: fmt.Errorf("invalid UTF-8: %#02x", c), 306 } 307 } 308 enc.handleRune(c, size, isRune) 309 if enc.err != nil { 310 return n - origBufLen, enc.err 311 } 312 n += size 313 } 314 } 315 316 // WriteByte implements io.ByteWriter; it does what you'd expect. 317 func (enc *ReEncoder) WriteByte(b byte) error { 318 return fastio.WriteByte(enc, b) 319 } 320 321 // WriteRune does what you'd expect. 322 func (enc *ReEncoder) WriteRune(c rune) (n int, err error) { 323 return fastio.WriteRune(enc, c) 324 } 325 326 // Close implements io.Closer; it does what you'd expect, mostly. 327 // 328 // The *ReEncoder may continue to be written to with new JSON values 329 // if enc.AllowMultipleValues is set. 330 func (enc *ReEncoder) Close() error { 331 if enc.bufLen > 0 { 332 if enc.utf == InvalidUTF8Error { 333 return &ReEncodeSyntaxError{ 334 Offset: enc.inputPos, 335 Err: fmt.Errorf("truncated UTF-8: %q", enc.buf[:enc.bufLen]), 336 } 337 } 338 for i := 0; i < enc.bufLen; i++ { 339 if enc.utf == InvalidUTF8Replace { 340 enc.handleRune(utf8.RuneError, 1, true) 341 } else { 342 enc.handleRune(rune(enc.buf[i]), 1, false) 343 } 344 if enc.err != nil { 345 return enc.err 346 } 347 } 348 } 349 if _, err := enc.par.HandleEOF(); err != nil { 350 enc.err = &ReEncodeSyntaxError{ 351 Err: err, 352 Offset: enc.inputPos, 353 } 354 return enc.err 355 } 356 if len(enc.barriers) == 0 { 357 if err := enc.handleRuneType(0, jsonparse.RuneTypeEOF, enc.stackSize(), true); err != nil { 358 enc.err = &ReEncodeWriteError{ 359 Err: err, 360 Offset: enc.inputPos, 361 } 362 return enc.err 363 } 364 if enc.allowMultipleValues { 365 enc.par.Reset() 366 } 367 } 368 return nil 369 } 370 371 // isRune=false indicates that 'c' is a raw byte from invalid UTF-8. 372 func (enc *ReEncoder) handleRune(c rune, size int, isRune bool) { 373 t, err := enc.par.HandleRune(c, isRune) 374 if err != nil { 375 enc.err = &ReEncodeSyntaxError{ 376 Err: err, 377 Offset: enc.inputPos, 378 } 379 return 380 } 381 if err := enc.handleRuneType(c, t, enc.stackSize(), isRune); err != nil { 382 enc.err = &ReEncodeWriteError{ 383 Err: err, 384 Offset: enc.inputPos, 385 } 386 return 387 } 388 if t == jsonparse.RuneTypeEOF { 389 if len(enc.barriers) == 0 { 390 panic(fmt.Errorf("should not happen: EOF for rune %q without write barriers", c)) 391 } 392 enc.err = &ReEncodeSyntaxError{ 393 Err: fmt.Errorf("invalid character %q after top-level value", c), 394 Offset: enc.inputPos, 395 } 396 return 397 } 398 399 enc.inputPos += int64(size) 400 } 401 402 // semi-public API ///////////////////////////////////////////////////////////// 403 404 func (enc *ReEncoder) pushWriteBarrier() { 405 enc.barriers = append(enc.barriers, barrier{ 406 inputPos: enc.inputPos, 407 stackSize: enc.stackSize(), 408 }) 409 enc.par.PushWriteBarrier() 410 enc.inputPos = 0 411 } 412 413 func (enc *ReEncoder) popWriteBarrier() { 414 enc.par.PopBarrier() 415 enc.inputPos += enc.barriers[len(enc.barriers)-1].inputPos 416 enc.barriers = enc.barriers[:len(enc.barriers)-1] 417 enc.out.PopWriteBarrier() 418 } 419 420 // internal //////////////////////////////////////////////////////////////////// 421 422 func (enc *ReEncoder) stackSize() int { 423 sz := enc.par.StackSize() 424 if len(enc.barriers) > 0 { 425 sz += enc.barriers[len(enc.barriers)-1].stackSize 426 } 427 return sz 428 } 429 430 func (enc *ReEncoder) handleRuneType(c rune, t jsonparse.RuneType, stackSize int, isRune bool) error { 431 switch t { 432 case jsonparse.RuneTypeStringEsc, jsonparse.RuneTypeStringEscU: 433 return nil 434 case jsonparse.RuneTypeStringEsc1: 435 switch c { 436 case '"', '\\', '/': 437 // self 438 case 'b': 439 c = '\b' 440 case 'f': 441 c = '\f' 442 case 'n': 443 c = '\n' 444 case 'r': 445 c = '\r' 446 case 't': 447 c = '\t' 448 default: 449 panic(fmt.Errorf("should not happen: rune %q is not a RuneTypeStringEsc1", c)) 450 } 451 return enc.out.HandleRune(c, jsonparse.RuneTypeStringChar, BackslashEscapeShort, stackSize) 452 case jsonparse.RuneTypeStringEscUA: 453 enc.uhex[0] = byte(c) 454 return nil 455 case jsonparse.RuneTypeStringEscUB: 456 enc.uhex[1] = byte(c) 457 return nil 458 case jsonparse.RuneTypeStringEscUC: 459 enc.uhex[2] = byte(c) 460 return nil 461 case jsonparse.RuneTypeStringEscUD: 462 mode := hexToMode(enc.uhex[0], enc.uhex[1], enc.uhex[2], byte(c)) 463 c = hexToRune(enc.uhex[0], enc.uhex[1], enc.uhex[2], byte(c)) 464 return enc.out.HandleRune(c, jsonparse.RuneTypeStringChar, mode, stackSize) 465 case jsonparse.RuneTypeError: 466 panic(fmt.Errorf("should not happen: handleRune called with %#v", t)) 467 default: 468 if t > jsonparse.RuneTypeEOF { 469 panic(fmt.Errorf("should not happen: handleRune called with %#v", t)) 470 } 471 esc := BackslashEscapeNone 472 if !isRune { 473 esc = BackslashEscapeRawByte 474 } 475 return enc.out.HandleRune(c, t, esc, stackSize) 476 } 477 }