github.com/cockroachdb/cockroachdb-parser@v0.23.3-0.20240213214944-911057d40c9a/pkg/util/json/tokenizer/scanner.go (about) 1 // Copyright 2022 The Cockroach Authors. 2 // 3 // Use of this software is governed by the Business Source License 4 // included in the file licenses/BSL.txt. 5 // 6 // As of the Change Date specified in that file, in accordance with 7 // the Business Source License, use of this software will be governed 8 // by the Apache License, Version 2.0, included in the file 9 // licenses/APL.txt. 10 11 // This is a fork of pkg/json package. 12 13 // Copyright (c) 2020, Dave Cheney <dave@cheney.net> 14 // All rights reserved. 15 // 16 // Redistribution and use in source and binary forms, with or without 17 // modification, are permitted provided that the following conditions are met: 18 // 19 // - Redistributions of source code must retain the above copyright notice, this 20 // list of conditions and the following disclaimer. 21 // 22 // - Redistributions in binary form must reproduce the above copyright notice, 23 // this list of conditions and the following disclaimer in the documentation 24 // and/or other materials provided with the distribution. 25 // 26 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 27 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 28 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 29 // DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 30 // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 31 // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 32 // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 33 // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 34 // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 35 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 36 37 package tokenizer 38 39 import ( 40 "sync" 41 "unicode" 42 "unicode/utf16" 43 "unicode/utf8" 44 ) 45 46 const ( 47 // ObjectStart indicates the start of JSON object. 48 ObjectStart = '{' // { 49 // ObjectEnd is the end of the JSON object. 50 ObjectEnd = '}' // } 51 // String is the start of JSON string. 52 String = '"' // " 53 // Colon indicates that the token is an object value. 54 Colon = ':' // : 55 // Comma indicates the next JSON element. 56 Comma = ',' // , 57 // ArrayStart is the start of JSON array. 58 ArrayStart = '[' // [ 59 // ArrayEnd is the end of JSON array. 60 ArrayEnd = ']' // ] 61 // True is 'true' token. 62 True = 't' // t 63 // False is 'false'. 64 False = 'f' // f 65 // Null is 'null' token. 66 Null = 'n' // n 67 ) 68 69 // Scanner implements a JSON scanner as defined in RFC 7159. 70 type Scanner struct { 71 data []byte 72 offset int 73 tmpBuf *buffer 74 } 75 76 var whitespace = [256]bool{ 77 ' ': true, 78 '\r': true, 79 '\n': true, 80 '\t': true, 81 } 82 83 // Next returns a []byte referencing the next lexical token in the stream. 84 // The []byte is valid until Next is called again. 85 // If the stream is at its end, or an error has occurred, Next returns a zero 86 // length []byte slice. 
87 // 88 // A valid token begins with one of the following: 89 // 90 // { Object start 91 // [ Array start 92 // } Object end 93 // ] Array End 94 // , Literal comma 95 // : Literal colon 96 // t JSON true 97 // f JSON false 98 // n JSON null 99 // " A string, possibly containing backslash escaped entites. 100 // -, 0-9 A number 101 func (s *Scanner) Next() []byte { 102 w := s.buf() 103 for pos, c := range w { 104 // Strip any leading whitespace. 105 if whitespace[c] { 106 continue 107 } 108 109 // Simple case. 110 switch c { 111 case ObjectStart, ObjectEnd, Colon, Comma, ArrayStart, ArrayEnd: 112 s.offset += pos + 1 113 return w[pos : pos+1] 114 } 115 116 s.offset += pos 117 switch c { 118 case True: 119 return s.next(validateToken(s.buf(), "true")) 120 case False: 121 return s.next(validateToken(s.buf(), "false")) 122 case Null: 123 return s.next(validateToken(s.buf(), "null")) 124 case String: 125 return s.parseString() 126 default: 127 // Ensure the number is correct. 128 return s.next(s.parseNumber(c)) 129 } 130 } 131 132 // it's all whitespace, ignore it 133 s.offset += len(w) 134 return nil // eof 135 } 136 137 var bufferPool = sync.Pool{New: func() interface{} { return &buffer{} }} 138 139 // Release releases scanner resources. 140 func (s *Scanner) Release() { 141 if s.tmpBuf != nil { 142 bufferPool.Put(s.tmpBuf) 143 } 144 } 145 146 func (s *Scanner) scratch() *buffer { 147 if s.tmpBuf == nil { 148 s.tmpBuf = bufferPool.Get().(*buffer) 149 } 150 s.tmpBuf.Reset() 151 return s.tmpBuf 152 } 153 154 // buf returns unread portion of the input. 155 func (s *Scanner) buf() []byte { 156 if s.offset == len(s.data) { 157 return nil 158 } 159 return s.data[s.offset:] 160 } 161 162 // next returns n bytes from the input, and advances offset by n bytes. 163 func (s *Scanner) next(n int) (res []byte) { 164 res = s.data[s.offset : s.offset+n] 165 s.offset += n 166 return res 167 } 168 169 // More returns true if scanner has more non-white space tokens. 
170 func (s *Scanner) More() bool { 171 for i := s.offset; i < len(s.data); i++ { 172 if !whitespace[s.data[i]] { 173 return true 174 } 175 } 176 return false 177 } 178 179 func validateToken(w []byte, expected string) int { 180 n := len(expected) 181 if len(w) >= n { 182 if string(w[:n]) != expected { 183 // doesn't match 184 return 0 185 } 186 return n 187 } 188 return 0 // eof 189 } 190 191 // parseString parses the string located at the start of the window. Returns 192 // parsed string token, including enclosing `"`. 193 func (s *Scanner) parseString() []byte { 194 pos := 1 // skip opening quote. 195 w := s.buf()[1:] 196 197 // Fast path: string does not have escape sequences. 198 for _, c := range w { 199 if c == '\\' { 200 // Alas, things are not that simple, we must handle escaped characters. 201 buf, n := s.parseStringSlow(pos) 202 s.offset += n 203 return buf 204 } 205 206 pos++ 207 if c == '"' { 208 return s.next(pos) 209 } 210 211 if c < ' ' { 212 // Unescaped controlled characters < 0x30 not allowed. 213 return nil 214 } 215 } 216 return nil // eof 217 } 218 219 // parseStringSlow parses string containing escape sequences. 220 // Everything up to pos does not have escape sequence, and buf[pos] is the first '\' 221 // encountered when parsing the string. 222 func (s *Scanner) parseStringSlow(pos int) ([]byte, int) { 223 w := s.buf() 224 // Sanity check. 225 if pos < 1 || len(w) < pos || w[0] != '"' || w[pos] != '\\' { 226 return nil, pos 227 } 228 229 // Escaped characters necessitate that the returned token will be 230 // different from the input token. Reset scratch buffer, and copy 231 // everything processed so far. 232 b := s.scratch() 233 b.Append(w[:pos]) 234 w = w[pos:] 235 236 for wp := 0; wp < len(w); { 237 switch c := w[wp]; { 238 default: 239 b.AppendByte(c) 240 pos++ 241 wp++ 242 case c < ' ': 243 // Control characters < 0x30 must be escaped. 
244 return nil, pos 245 case c == '"': 246 b.AppendByte(c) 247 pos++ 248 return b.Bytes(), pos 249 case c == '\\': 250 switch n := readEscaped(w[wp:], b); n { 251 case 0: 252 return nil, pos // Error 253 default: 254 wp += n 255 pos += n 256 } 257 } 258 } 259 return nil, pos // eof 260 } 261 262 // readEscaped reads escape sequence from the window w, and writes unescaped 263 // values into provided buffer. 264 // Returns number of bytes consumed from w. 265 // Returns 0 if the input wasn't parseable / an error occurred. 266 func readEscaped(w []byte, buf *buffer) int { 267 if len(w) < 2 { 268 return 0 // need more data 269 } 270 271 switch c := w[1]; { 272 case c == 'u': 273 if 2+utf8.UTFMax >= len(w) { 274 return 0 // need more data 275 } 276 277 rr := getu4(w[2:6]) 278 if rr < 0 { 279 return 0 280 } 281 282 r := 2 + utf8.UTFMax // number of bytes read so far. 283 if utf16.IsSurrogate(rr) { 284 if 2*r >= len(w) { 285 return 0 // need more data 286 } 287 288 if w[r] != '\\' || w[r+1] != 'u' { 289 return 0 290 } 291 292 rr1 := getu4(w[r+2:]) 293 dec := utf16.DecodeRune(rr, rr1) 294 if dec == unicode.ReplacementChar { 295 return 0 296 } 297 // A valid pair; consume. 298 r *= 2 299 buf.AppendRune(dec) 300 } else { 301 buf.AppendRune(rr) 302 } 303 304 return r 305 default: 306 c = unescapeTable[c] 307 if c == 0 { 308 return 0 309 } 310 buf.AppendByte(c) 311 return 2 312 } 313 } 314 315 func (s *Scanner) parseNumber(c byte) int { 316 const ( 317 begin = iota 318 leadingzero 319 anydigit1 320 decimal 321 anydigit2 322 exponent 323 expsign 324 anydigit3 325 ) 326 327 pos := 0 328 w := s.buf() 329 var state uint8 = begin 330 331 // Handle the case that the first character is a hyphen. 
332 if c == '-' { 333 pos++ 334 w = w[1:] 335 } 336 337 for _, elem := range w { 338 switch state { 339 case begin: 340 if elem >= '1' && elem <= '9' { 341 state = anydigit1 342 } else if elem == '0' { 343 state = leadingzero 344 } else { 345 // error 346 return 0 347 } 348 case anydigit1: 349 if elem >= '0' && elem <= '9' { 350 // Stay in this state. 351 break 352 } 353 fallthrough 354 case leadingzero: 355 if elem == '.' { 356 state = decimal 357 break 358 } 359 if elem == 'e' || elem == 'E' { 360 state = exponent 361 break 362 } 363 return pos // Finished. 364 case decimal: 365 if elem >= '0' && elem <= '9' { 366 state = anydigit2 367 } else { 368 return 0 // Error. 369 } 370 case anydigit2: 371 if elem >= '0' && elem <= '9' { 372 break 373 } 374 if elem == 'e' || elem == 'E' { 375 state = exponent 376 break 377 } 378 return pos // Finished. 379 case exponent: 380 if elem == '+' || elem == '-' { 381 state = expsign 382 break 383 } 384 fallthrough 385 case expsign: 386 if elem >= '0' && elem <= '9' { 387 state = anydigit3 388 break 389 } 390 return 0 // Error 391 case anydigit3: 392 if elem < '0' || elem > '9' { 393 return pos 394 } 395 } 396 pos++ 397 } 398 399 // End of the item. However, not necessarily an error. Make 400 // sure we are in a state that allows ending the number. 401 switch state { 402 case leadingzero, anydigit1, anydigit2, anydigit3: 403 return pos 404 default: 405 // Error otherwise, the number isn't complete. 406 return 0 407 } 408 } 409 410 // hexTable lists quick conversion from byte to a valid 411 // hex byte; or 0 if invalid. 
var hexTable = func() [256]rune {
	var t [256]rune
	for c := 0; c < 256; c++ {
		switch {
		case '0' <= c && c <= '9':
			t[c] = rune(c - '0')
		case 'a' <= c && c <= 'f':
			t[c] = rune(c - 'a' + 10)
		case 'A' <= c && c <= 'F':
			t[c] = rune(c - 'A' + 10)
		default:
			// Not a hex digit. 0 cannot serve as the marker because it is
			// the value of the digit '0'.
			t[c] = utf8.RuneError
		}
	}
	return t
}()

// getu4 decodes \uXXXX from the beginning of s, returning the hex value,
// or it returns -1.
// Only the first 4 bytes of s are examined. -1 is returned when any of them
// is not a hex digit, and also when s is shorter than 4 bytes.
func getu4(s []byte) rune {
	// Reject short input up front instead of panicking on an out-of-range
	// index below.
	if len(s) < 4 {
		return -1
	}
	r1, r2, r3, r4 := hexTable[s[0]], hexTable[s[1]], hexTable[s[2]], hexTable[s[3]]
	if r1 == utf8.RuneError || r2 == utf8.RuneError || r3 == utf8.RuneError || r4 == utf8.RuneError {
		return -1
	}
	return r1*(1<<12) + r2*(1<<8) + r3*(1<<4) + r4
}

// unescapeTable lists un-escaped characters for a set of valid
// escape sequences. It is indexed by the byte following the backslash;
// a zero entry means the escape is not recognized.
// NOTE(review): \' is accepted here although it is not among the escapes
// listed in the JSON RFC; presumably intentional leniency, confirm.
var unescapeTable = [256]byte{
	'"':  '"',  // \"
	'\\': '\\', // \\
	'/':  '/',  // \/
	'\'': '\'', // \'
	'b':  '\b', // \b
	'f':  '\f', // \f
	'n':  '\n', // \n
	'r':  '\r', // \r
	't':  '\t', // \t
}