github.com/cockroachdb/cockroachdb-parser@v0.23.3-0.20240213214944-911057d40c9a/pkg/util/json/tokenizer/scanner.go (about)

     1  // Copyright 2022 The Cockroach Authors.
     2  //
     3  // Use of this software is governed by the Business Source License
     4  // included in the file licenses/BSL.txt.
     5  //
     6  // As of the Change Date specified in that file, in accordance with
     7  // the Business Source License, use of this software will be governed
     8  // by the Apache License, Version 2.0, included in the file
     9  // licenses/APL.txt.
    10  
    11  // This is a fork of pkg/json package.
    12  
    13  // Copyright (c) 2020, Dave Cheney <dave@cheney.net>
    14  // All rights reserved.
    15  //
    16  // Redistribution and use in source and binary forms, with or without
    17  // modification, are permitted provided that the following conditions are met:
    18  //
    19  //   - Redistributions of source code must retain the above copyright notice, this
    20  //     list of conditions and the following disclaimer.
    21  //
    22  //   - Redistributions in binary form must reproduce the above copyright notice,
    23  //     this list of conditions and the following disclaimer in the documentation
    24  //     and/or other materials provided with the distribution.
    25  //
    26  // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
    27  // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
    28  // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
    29  // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
    30  // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
    31  // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
    32  // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
    33  // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
    34  // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
    35  // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
    36  
    37  package tokenizer
    38  
    39  import (
    40  	"sync"
    41  	"unicode"
    42  	"unicode/utf16"
    43  	"unicode/utf8"
    44  )
    45  
    46  const (
    47  	// ObjectStart indicates the start of JSON object.
    48  	ObjectStart = '{' // {
    49  	// ObjectEnd is the end of the JSON object.
    50  	ObjectEnd = '}' // }
    51  	// String is the start of JSON string.
    52  	String = '"' // "
    53  	// Colon indicates that the token is an object value.
    54  	Colon = ':' // :
    55  	// Comma indicates the next JSON element.
    56  	Comma = ',' // ,
    57  	// ArrayStart is the start of JSON array.
    58  	ArrayStart = '[' // [
    59  	// ArrayEnd is the end of JSON array.
    60  	ArrayEnd = ']' // ]
    61  	// True is 'true' token.
    62  	True = 't' // t
    63  	// False is 'false'.
    64  	False = 'f' // f
    65  	// Null is 'null' token.
    66  	Null = 'n' // n
    67  )
    68  
// Scanner implements a JSON scanner as defined in RFC 7159.
//
// A Scanner walks an in-memory input and hands out one lexical token per
// call to Next. Call Release once the scanner is no longer needed so that
// its scratch buffer can be reused.
type Scanner struct {
	// data is the complete input being scanned.
	data   []byte
	// offset is the index into data of the first byte not yet consumed.
	offset int
	// tmpBuf is a scratch buffer obtained lazily from bufferPool. It is used
	// to build string tokens whose escape sequences had to be decoded.
	tmpBuf *buffer
}
    75  
    76  var whitespace = [256]bool{
    77  	' ':  true,
    78  	'\r': true,
    79  	'\n': true,
    80  	'\t': true,
    81  }
    82  
    83  // Next returns a []byte referencing the next lexical token in the stream.
    84  // The []byte is valid until Next is called again.
    85  // If the stream is at its end, or an error has occured, Next returns a zero
    86  // length []byte slice.
    87  //
    88  // A valid token begins with one of the following:
    89  //
    90  //	{ Object start
    91  //	[ Array start
    92  //	} Object end
    93  //	] Array End
    94  //	, Literal comma
    95  //	: Literal colon
    96  //	t JSON true
    97  //	f JSON false
    98  //	n JSON null
    99  //	" A string, possibly containing backslash escaped entites.
   100  //	-, 0-9 A number
   101  func (s *Scanner) Next() []byte {
   102  	w := s.buf()
   103  	for pos, c := range w {
   104  		// Strip any leading whitespace.
   105  		if whitespace[c] {
   106  			continue
   107  		}
   108  
   109  		// Simple case.
   110  		switch c {
   111  		case ObjectStart, ObjectEnd, Colon, Comma, ArrayStart, ArrayEnd:
   112  			s.offset += pos + 1
   113  			return w[pos : pos+1]
   114  		}
   115  
   116  		s.offset += pos
   117  		switch c {
   118  		case True:
   119  			return s.next(validateToken(s.buf(), "true"))
   120  		case False:
   121  			return s.next(validateToken(s.buf(), "false"))
   122  		case Null:
   123  			return s.next(validateToken(s.buf(), "null"))
   124  		case String:
   125  			return s.parseString()
   126  		default:
   127  			// Ensure the number is correct.
   128  			return s.next(s.parseNumber(c))
   129  		}
   130  	}
   131  
   132  	// it's all whitespace, ignore it
   133  	s.offset += len(w)
   134  	return nil // eof
   135  }
   136  
   137  var bufferPool = sync.Pool{New: func() interface{} { return &buffer{} }}
   138  
   139  // Release releases scanner resources.
   140  func (s *Scanner) Release() {
   141  	if s.tmpBuf != nil {
   142  		bufferPool.Put(s.tmpBuf)
   143  	}
   144  }
   145  
   146  func (s *Scanner) scratch() *buffer {
   147  	if s.tmpBuf == nil {
   148  		s.tmpBuf = bufferPool.Get().(*buffer)
   149  	}
   150  	s.tmpBuf.Reset()
   151  	return s.tmpBuf
   152  }
   153  
   154  // buf returns unread portion of the input.
   155  func (s *Scanner) buf() []byte {
   156  	if s.offset == len(s.data) {
   157  		return nil
   158  	}
   159  	return s.data[s.offset:]
   160  }
   161  
   162  // next returns n bytes from the input, and advances offset by n bytes.
   163  func (s *Scanner) next(n int) (res []byte) {
   164  	res = s.data[s.offset : s.offset+n]
   165  	s.offset += n
   166  	return res
   167  }
   168  
   169  // More returns true if scanner has more non-white space tokens.
   170  func (s *Scanner) More() bool {
   171  	for i := s.offset; i < len(s.data); i++ {
   172  		if !whitespace[s.data[i]] {
   173  			return true
   174  		}
   175  	}
   176  	return false
   177  }
   178  
   179  func validateToken(w []byte, expected string) int {
   180  	n := len(expected)
   181  	if len(w) >= n {
   182  		if string(w[:n]) != expected {
   183  			// doesn't match
   184  			return 0
   185  		}
   186  		return n
   187  	}
   188  	return 0 // eof
   189  }
   190  
   191  // parseString parses the string located at the start of the window. Returns
   192  // parsed string token, including enclosing `"`.
   193  func (s *Scanner) parseString() []byte {
   194  	pos := 1 // skip opening quote.
   195  	w := s.buf()[1:]
   196  
   197  	// Fast path: string does not have escape sequences.
   198  	for _, c := range w {
   199  		if c == '\\' {
   200  			// Alas, things are not that simple, we must handle escaped characters.
   201  			buf, n := s.parseStringSlow(pos)
   202  			s.offset += n
   203  			return buf
   204  		}
   205  
   206  		pos++
   207  		if c == '"' {
   208  			return s.next(pos)
   209  		}
   210  
   211  		if c < ' ' {
   212  			// Unescaped controlled characters < 0x30 not allowed.
   213  			return nil
   214  		}
   215  	}
   216  	return nil // eof
   217  }
   218  
   219  // parseStringSlow parses string containing escape sequences.
   220  // Everything up to pos does not have escape sequence, and buf[pos] is the first '\'
   221  // encountered when parsing the string.
   222  func (s *Scanner) parseStringSlow(pos int) ([]byte, int) {
   223  	w := s.buf()
   224  	// Sanity check.
   225  	if pos < 1 || len(w) < pos || w[0] != '"' || w[pos] != '\\' {
   226  		return nil, pos
   227  	}
   228  
   229  	// Escaped characters necessitate that the returned token will be
   230  	// different from the input token.  Reset scratch buffer, and copy
   231  	// everything processed so far.
   232  	b := s.scratch()
   233  	b.Append(w[:pos])
   234  	w = w[pos:]
   235  
   236  	for wp := 0; wp < len(w); {
   237  		switch c := w[wp]; {
   238  		default:
   239  			b.AppendByte(c)
   240  			pos++
   241  			wp++
   242  		case c < ' ':
   243  			// Control characters < 0x30 must be escaped.
   244  			return nil, pos
   245  		case c == '"':
   246  			b.AppendByte(c)
   247  			pos++
   248  			return b.Bytes(), pos
   249  		case c == '\\':
   250  			switch n := readEscaped(w[wp:], b); n {
   251  			case 0:
   252  				return nil, pos // Error
   253  			default:
   254  				wp += n
   255  				pos += n
   256  			}
   257  		}
   258  	}
   259  	return nil, pos // eof
   260  }
   261  
   262  // readEscaped reads escape sequence from the window w, and writes unescaped
   263  // values into provided buffer.
   264  // Returns number of bytes consumed from w.
   265  // Returns 0 if the input wasn't parseable / an error occurred.
   266  func readEscaped(w []byte, buf *buffer) int {
   267  	if len(w) < 2 {
   268  		return 0 // need more data
   269  	}
   270  
   271  	switch c := w[1]; {
   272  	case c == 'u':
   273  		if 2+utf8.UTFMax >= len(w) {
   274  			return 0 // need more data
   275  		}
   276  
   277  		rr := getu4(w[2:6])
   278  		if rr < 0 {
   279  			return 0
   280  		}
   281  
   282  		r := 2 + utf8.UTFMax // number of bytes read so far.
   283  		if utf16.IsSurrogate(rr) {
   284  			if 2*r >= len(w) {
   285  				return 0 // need more data
   286  			}
   287  
   288  			if w[r] != '\\' || w[r+1] != 'u' {
   289  				return 0
   290  			}
   291  
   292  			rr1 := getu4(w[r+2:])
   293  			dec := utf16.DecodeRune(rr, rr1)
   294  			if dec == unicode.ReplacementChar {
   295  				return 0
   296  			}
   297  			// A valid pair; consume.
   298  			r *= 2
   299  			buf.AppendRune(dec)
   300  		} else {
   301  			buf.AppendRune(rr)
   302  		}
   303  
   304  		return r
   305  	default:
   306  		c = unescapeTable[c]
   307  		if c == 0 {
   308  			return 0
   309  		}
   310  		buf.AppendByte(c)
   311  		return 2
   312  	}
   313  }
   314  
// parseNumber measures the JSON number located at the start of the unread
// input. c is the first byte of that input. Nothing is consumed.
//
// It returns the length of the number in bytes, or 0 when the input does not
// start with a well-formed number. Scanning stops at the first byte that
// cannot extend the number; that byte is not examined any further, so for
// example a leading zero followed by another digit yields a length of 1.
func (s *Scanner) parseNumber(c byte) int {
	// States of the number grammar:
	//
	//	begin        nothing but an optional '-' has been read
	//	leadingzero  the integer part is a single '0'
	//	anydigit1    inside an integer part that started with 1-9
	//	decimal      just read the '.'; a digit must follow
	//	anydigit2    inside the fractional digits
	//	exponent     just read 'e' or 'E'; a sign or a digit must follow
	//	expsign      just read the exponent sign; a digit must follow
	//	anydigit3    inside the exponent digits
	const (
		begin = iota
		leadingzero
		anydigit1
		decimal
		anydigit2
		exponent
		expsign
		anydigit3
	)

	// pos counts the bytes accepted so far, including a leading '-'.
	pos := 0
	w := s.buf()
	var state uint8 = begin

	// Handle the case that the first character is a hyphen.
	if c == '-' {
		pos++
		w = w[1:]
	}

	for _, elem := range w {
		switch state {
		case begin:
			if elem >= '1' && elem <= '9' {
				state = anydigit1
			} else if elem == '0' {
				state = leadingzero
			} else {
				// Not a digit: this is not a number.
				return 0
			}
		case anydigit1:
			if elem >= '0' && elem <= '9' {
				// Stay in this state.
				break
			}
			// After the integer digits the same continuations are allowed
			// as after a leading zero.
			fallthrough
		case leadingzero:
			if elem == '.' {
				state = decimal
				break
			}
			if elem == 'e' || elem == 'E' {
				state = exponent
				break
			}
			return pos // Finished.
		case decimal:
			if elem >= '0' && elem <= '9' {
				state = anydigit2
			} else {
				return 0 // Error.
			}
		case anydigit2:
			if elem >= '0' && elem <= '9' {
				break
			}
			if elem == 'e' || elem == 'E' {
				state = exponent
				break
			}
			return pos // Finished.
		case exponent:
			if elem == '+' || elem == '-' {
				state = expsign
				break
			}
			// No sign: the byte must be the first exponent digit.
			fallthrough
		case expsign:
			if elem >= '0' && elem <= '9' {
				state = anydigit3
				break
			}
			return 0 // Error
		case anydigit3:
			if elem < '0' || elem > '9' {
				return pos
			}
		}
		// elem was accepted as part of the number.
		pos++
	}

	// End of the item. However, not necessarily an error. Make
	// sure we are in a state that allows ending the number.
	switch state {
	case leadingzero, anydigit1, anydigit2, anydigit3:
		return pos
	default:
		// Error otherwise, the number isn't complete.
		return 0
	}
}
   409  
   410  // hexTable lists quick conversion from byte to a valid
   411  // hex byte; or 0 if invalid.
   412  var hexTable = func() [256]rune {
   413  	var t [256]rune
   414  	for c := 0; c < 256; c++ {
   415  		switch {
   416  		case '0' <= c && c <= '9':
   417  			t[c] = rune(c - '0')
   418  		case 'a' <= c && c <= 'f':
   419  			t[c] = rune(c - 'a' + 10)
   420  		case 'A' <= c && c <= 'F':
   421  			t[c] = rune(c - 'A' + 10)
   422  		default:
   423  			t[c] = utf8.RuneError
   424  		}
   425  	}
   426  	return t
   427  }()
   428  
   429  // getu4 decodes \uXXXX from the beginning of s, returning the hex value,
   430  // or it returns -1.
   431  // s must be at least 4 bytes.
   432  func getu4(s []byte) rune {
   433  	r1, r2, r3, r4 := hexTable[s[0]], hexTable[s[1]], hexTable[s[2]], hexTable[s[3]]
   434  	if r1 == utf8.RuneError || r2 == utf8.RuneError || r3 == utf8.RuneError || r4 == utf8.RuneError {
   435  		return -1
   436  	}
   437  	return r1*(1<<12) + r2*(1<<8) + r3*(1<<4) + r4
   438  }
   439  
   440  // unescapeTable lists un-escaped characters for a set of valid
   441  // escape sequences.
   442  var unescapeTable = [256]byte{
   443  	'"':  '"',  // \"
   444  	'\\': '\\', // \\
   445  	'/':  '/',  // \/
   446  	'\'': '\'', // \'
   447  	'b':  '\b', // \b
   448  	'f':  '\f', // \f
   449  	'n':  '\n', // \n
   450  	'r':  '\r', // \r
   451  	't':  '\t', // \t
   452  }