github.com/minio/simdjson-go@v0.4.6-0.20231116094823-04d21cddf993/stage2_build_tape_amd64.go (about) 1 //go:build !noasm && !appengine && gc 2 // +build !noasm,!appengine,gc 3 4 /* 5 * MinIO Cloud Storage, (C) 2020 MinIO, Inc. 6 * 7 * Licensed under the Apache License, Version 2.0 (the "License"); 8 * you may not use this file except in compliance with the License. 9 * You may obtain a copy of the License at 10 * 11 * http://www.apache.org/licenses/LICENSE-2.0 12 * 13 * Unless required by applicable law or agreed to in writing, software 14 * distributed under the License is distributed on an "AS IS" BASIS, 15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 * See the License for the specific language governing permissions and 17 * limitations under the License. 18 */ 19 20 package simdjson 21 22 import ( 23 "bytes" 24 "encoding/binary" 25 "fmt" 26 ) 27 28 // Constants for "return address" modes 29 const retAddressShift = 2 30 const retAddressStartConst = 1 31 const retAddressObjectConst = 2 32 const retAddressArrayConst = 3 33 34 func updateChar(pj *internalParsedJson, idx_in uint64) (done bool, idx uint64) { 35 if pj.indexesChan.index >= pj.indexesChan.length { 36 pj.indexesChan = <-pj.indexChans // Get next element from channel 37 done = pj.indexesChan.index == -1 38 if done { 39 return 40 } 41 } 42 idx = idx_in + uint64(pj.indexesChan.indexes[pj.indexesChan.index]) 43 pj.indexesChan.index++ 44 return 45 } 46 47 // Handy "debug" function to see where Stage 2 fails (rename to `updateChar`) 48 func updateCharDebug(pj *internalParsedJson, idx_in uint64) (done bool, idx uint64) { 49 if pj.indexesChan.index >= pj.indexesChan.length { 50 var ok bool 51 pj.indexesChan, ok = <-pj.indexChans // Get next element from channel 52 if !ok { 53 done = true // return done if channel closed 54 return 55 } 56 } 57 idx = idx_in + uint64(pj.indexesChan.indexes[pj.indexesChan.index]) 58 fmt.Printf("At 0x%x char: %s\n", idx, string(pj.Message[idx])) 59 pj.indexesChan.index++ 60 return 61 } 62 63 func peekSize(pj *internalParsedJson) uint64 { 64 if pj.indexesChan.index >= pj.indexesChan.length { 65 //panic("cannot peek the size") // should never happen since last string element should be saved for next buffer 66 // let's return 0 for the sake of safety (could lead to a string being to short) 67 return 0 68 } 69 return uint64(pj.indexesChan.indexes[pj.indexesChan.index]) 70 } 71 72 func parseString(pj *ParsedJson, idx uint64, maxStringSize uint64, needCopy bool) bool { 73 size := uint64(0) 74 buf := pj.Message[idx:] 75 // Make sure that we have at least one full YMM word available after maxStringSize into the buffer 76 if len(buf)-int(maxStringSize) < 64 { 77 if len(buf) > 512-64 { // only allocated if needed 78 paddedBuf := make([]byte, len(buf)+64) 79 copy(paddedBuf, buf) 80 buf = paddedBuf 81 } else { 82 paddedBuf := [512]byte{} 83 copy(paddedBuf[:], buf) 84 buf = paddedBuf[:] 85 } 86 } 87 if !parseStringSimdValidateOnly(buf, &maxStringSize, &size, &needCopy) { 88 return false 89 } 90 if !needCopy { 91 pj.write_tape(idx+1, '"') 92 } else { 93 // Make sure we account for at least 32 bytes additional space due to 94 strs := pj.Strings.B 95 requiredLen := uint64(len(strs)) + size + 32 96 if requiredLen >= uint64(cap(strs)) { 97 newSize := uint64(cap(strs) * 2) 98 if newSize < requiredLen { 99 newSize = requiredLen + size // add size once more to account for further space 100 } 101 strs = make([]byte, len(strs), newSize) 102 copy(strs, pj.Strings.B) 103 pj.Strings.B = strs 104 } 105 start := len(strs) 106 _ = parseStringSimd(buf, &pj.Strings.B) // We can safely ignore the result since we validate above 107 pj.write_tape(uint64(STRINGBUFBIT+start), '"') 108 size = uint64(len(pj.Strings.B) - start) 109 } 110 // put length onto the tape 111 pj.Tape = append(pj.Tape, size) 112 return true 113 } 114 115 func addNumber(buf []byte, pj *ParsedJson) bool { 116 tag, val := parseNumber(buf) 117 if tag == 0 { 118 return false 119 } 120 pj.writeTapeTagValFlags(tag, val) 121 return true 122 } 123 124 func isValidTrueAtom(buf []byte) bool { 125 if len(buf) >= 5 { // fast path when there is enough space left in the buffer 126 const tv = uint32(0x0000000065757274) // "true " 127 locval := binary.LittleEndian.Uint32(buf) 128 if locval == tv { 129 return isNotStructuralOrWhitespace(buf[4]) == 0 130 } 131 } 132 return false 133 } 134 135 func isValidFalseAtom(buf []byte) bool { 136 if len(buf) >= 8 { // fast path when there is enough space left in the buffer 137 const fv = uint64(0x00000065736c6166) // "false " 138 const mask5 = uint64(0x000000ffffffffff) 139 error := uint64(isNotStructuralOrWhitespace(buf[5])) 140 locval := binary.LittleEndian.Uint64(buf) 141 error |= (locval & mask5) ^ fv 142 return error == 0 143 } else if len(buf) >= 6 { 144 return bytes.Equal(buf[:5], []byte("false")) && isNotStructuralOrWhitespace(buf[5]) == 0 145 } 146 return false 147 } 148 149 func isValidNullAtom(buf []byte) bool { 150 if len(buf) >= 5 { // fast path when there is enough space left in the buffer 151 const nv = 0x000000006c6c756e // "null " 152 locval := binary.LittleEndian.Uint32(buf) // we want to avoid unaligned 64-bit loads (undefined in C/C++) 153 if locval == nv { 154 return isNotStructuralOrWhitespace(buf[4]) == 0 155 } 156 } 157 return false 158 } 159 160 func (pj *internalParsedJson) unifiedMachine() (ok, done bool) { 161 buf := pj.Message 162 const addOneForRoot = 1 163 164 idx := ^uint64(0) // location of the structural character in the input (buf) 165 offset := uint64(0) // used to contain last element of containing_scope_offset 166 167 ////////////////////////////// START STATE ///////////////////////////// 168 pj.containingScopeOffset = append(pj.containingScopeOffset, (pj.get_current_loc()<<retAddressShift)|retAddressStartConst) 169 170 pj.write_tape(0, 'r') // r for root, 0 is going to get overwritten 171 // the root is used, if nothing else, to capture the size of the tape 172 173 if done, idx = updateChar(pj, idx); done { 174 goto succeed 175 } 176 continueRoot: 177 switch buf[idx] { 178 case '{': 179 pj.containingScopeOffset = append(pj.containingScopeOffset, (pj.get_current_loc()<<retAddressShift)|retAddressStartConst) 180 pj.write_tape(0, '{') 181 goto object_begin 182 case '[': 183 pj.containingScopeOffset = append(pj.containingScopeOffset, (pj.get_current_loc()<<retAddressShift)|retAddressStartConst) 184 pj.write_tape(0, '[') 185 goto arrayBegin 186 default: 187 goto fail 188 } 189 190 startContinue: 191 // We are back at the top, read the next char and we should be done 192 if done, idx = updateChar(pj, idx); done { 193 goto succeed 194 } else { 195 // For an ndjson object, wrap up current object, start new root and check for minimum of 1 newline 196 if buf[idx] != '\n' { 197 goto fail 198 } 199 200 // Eat any empty lines 201 for buf[idx] == '\n' { 202 if done, idx = updateChar(pj, idx); done { 203 goto succeed 204 } 205 } 206 207 // Otherwise close current root 208 offset = pj.containingScopeOffset[len(pj.containingScopeOffset)-1] 209 210 // drop last element 211 pj.containingScopeOffset = pj.containingScopeOffset[:len(pj.containingScopeOffset)-1] 212 213 pj.annotate_previousloc(offset>>retAddressShift, pj.get_current_loc()+addOneForRoot) 214 pj.write_tape(offset>>retAddressShift, 'r') // r is root 215 216 // And open a new root 217 pj.containingScopeOffset = append(pj.containingScopeOffset, (pj.get_current_loc()<<retAddressShift)|retAddressStartConst) 218 pj.write_tape(0, 'r') // r for root, 0 is going to get overwritten 219 220 goto continueRoot 221 } 222 223 //////////////////////////////// OBJECT STATES ///////////////////////////// 224 225 object_begin: 226 if done, idx = updateChar(pj, idx); done { 227 goto succeed 228 } 229 switch buf[idx] { 230 case '"': 231 if !parseString(&pj.ParsedJson, idx, peekSize(pj), pj.copyStrings) { 232 goto fail 233 } 234 goto object_key_state 235 case '}': 236 goto scopeEnd // could also go to object_continue 237 default: 238 goto fail 239 } 240 241 object_key_state: 242 if done, idx = updateChar(pj, idx); done { 243 goto succeed 244 } 245 if buf[idx] != ':' { 246 goto fail 247 } 248 if done, idx = updateChar(pj, idx); done { 249 goto succeed 250 } 251 switch buf[idx] { 252 case '"': 253 if !parseString(&pj.ParsedJson, idx, peekSize(pj), pj.copyStrings) { 254 goto fail 255 } 256 257 case 't': 258 if !isValidTrueAtom(buf[idx:]) { 259 goto fail 260 } 261 pj.write_tape(0, 't') 262 263 case 'f': 264 if !isValidFalseAtom(buf[idx:]) { 265 goto fail 266 } 267 pj.write_tape(0, 'f') 268 269 case 'n': 270 if !isValidNullAtom(buf[idx:]) { 271 goto fail 272 } 273 pj.write_tape(0, 'n') 274 275 case '-': 276 if !addNumber(buf[idx:], &pj.ParsedJson) { 277 goto fail 278 } 279 280 case '{': 281 pj.containingScopeOffset = append(pj.containingScopeOffset, (pj.get_current_loc()<<retAddressShift)|retAddressObjectConst) 282 pj.write_tape(0, '{') 283 // we have not yet encountered } so we need to come back for it 284 goto object_begin 285 286 case '[': 287 pj.containingScopeOffset = append(pj.containingScopeOffset, (pj.get_current_loc()<<retAddressShift)|retAddressObjectConst) 288 pj.write_tape(0, '[') 289 // we have not yet encountered } so we need to come back for it 290 goto arrayBegin 291 292 default: 293 if buf[idx] >= '0' && buf[idx] <= '9' { 294 if !addNumber(buf[idx:], &pj.ParsedJson) { 295 goto fail 296 } 297 break 298 } 299 goto fail 300 } 301 302 objectContinue: 303 if done, idx = updateChar(pj, idx); done { 304 goto succeed 305 } 306 switch buf[idx] { 307 case ',': 308 if done, idx = updateChar(pj, idx); done { 309 goto succeed 310 } 311 if buf[idx] != '"' { 312 goto fail 313 } 314 if !parseString(&pj.ParsedJson, idx, peekSize(pj), pj.copyStrings) { 315 goto fail 316 } 317 goto object_key_state 318 319 case '}': 320 goto scopeEnd 321 322 default: 323 goto fail 324 } 325 326 ////////////////////////////// COMMON STATE ///////////////////////////// 327 scopeEnd: 328 // write our tape location to the header scope 329 offset = pj.containingScopeOffset[len(pj.containingScopeOffset)-1] 330 // drop last element 331 pj.containingScopeOffset = pj.containingScopeOffset[:len(pj.containingScopeOffset)-1] 332 333 pj.write_tape(offset>>retAddressShift, buf[idx]) 334 pj.annotate_previousloc(offset>>retAddressShift, pj.get_current_loc()) 335 336 /* goto saved_state*/ 337 switch offset & ((1 << retAddressShift) - 1) { 338 case retAddressArrayConst: 339 goto arrayContinue 340 case retAddressObjectConst: 341 goto objectContinue 342 default: 343 goto startContinue 344 } 345 346 ////////////////////////////// ARRAY STATES ///////////////////////////// 347 arrayBegin: 348 if done, idx = updateChar(pj, idx); done { 349 goto succeed 350 } 351 if buf[idx] == ']' { 352 goto scopeEnd // could also go to array_continue 353 } 354 355 mainArraySwitch: 356 // we call update char on all paths in, so we can peek at c on the 357 // on paths that can accept a close square brace (post-, and at start) 358 switch buf[idx] { 359 case '"': 360 if !parseString(&pj.ParsedJson, idx, peekSize(pj), pj.copyStrings) { 361 goto fail 362 } 363 case 't': 364 if !isValidTrueAtom(buf[idx:]) { 365 goto fail 366 } 367 pj.write_tape(0, 't') 368 369 case 'f': 370 if !isValidFalseAtom(buf[idx:]) { 371 goto fail 372 } 373 pj.write_tape(0, 'f') 374 375 case 'n': 376 if !isValidNullAtom(buf[idx:]) { 377 goto fail 378 } 379 pj.write_tape(0, 'n') 380 /* goto array_continue */ 381 382 case '-': 383 if !addNumber(buf[idx:], &pj.ParsedJson) { 384 goto fail 385 } 386 387 case '{': 388 // we have not yet encountered ] so we need to come back for it 389 pj.containingScopeOffset = append(pj.containingScopeOffset, (pj.get_current_loc()<<retAddressShift)|retAddressArrayConst) 390 pj.write_tape(0, '{') // here the compilers knows what c is so this gets optimized 391 goto object_begin 392 393 case '[': 394 // we have not yet encountered ] so we need to come back for it 395 pj.containingScopeOffset = append(pj.containingScopeOffset, (pj.get_current_loc()<<retAddressShift)|retAddressArrayConst) 396 pj.write_tape(0, '[') // here the compilers knows what c is so this gets optimized 397 goto arrayBegin 398 399 default: 400 if buf[idx] >= '0' && buf[idx] <= '9' { 401 if !addNumber(buf[idx:], &pj.ParsedJson) { 402 goto fail 403 } 404 break 405 } 406 goto fail 407 } 408 409 arrayContinue: 410 if done, idx = updateChar(pj, idx); done { 411 goto succeed 412 } 413 switch buf[idx] { 414 case ',': 415 if done, idx = updateChar(pj, idx); done { 416 goto succeed 417 } 418 goto mainArraySwitch 419 420 case ']': 421 goto scopeEnd 422 423 default: 424 goto fail 425 } 426 427 ////////////////////////////// FINAL STATES ///////////////////////////// 428 succeed: 429 offset = pj.containingScopeOffset[len(pj.containingScopeOffset)-1] 430 // drop last element 431 pj.containingScopeOffset = pj.containingScopeOffset[:len(pj.containingScopeOffset)-1] 432 433 // Sanity checks 434 if len(pj.containingScopeOffset) != 0 { 435 return false, done 436 } 437 438 pj.annotate_previousloc(offset>>retAddressShift, pj.get_current_loc()+addOneForRoot) 439 pj.write_tape(offset>>retAddressShift, 'r') // r is root 440 441 pj.isvalid = true 442 return true, done 443 444 fail: 445 return false, done 446 } 447 448 // structural chars here are 449 // they are { 0x7b } 0x7d : 0x3a [ 0x5b ] 0x5d , 0x2c (and NULL) 450 // we are also interested in the four whitespace characters 451 // space 0x20, linefeed 0x0a, horizontal tab 0x09 and carriage return 0x0d 452 453 // these are the chars that can follow a true/false/null or number atom 454 // and nothing else 455 var structuralOrWhitespaceNegated = [256]byte{ 456 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 457 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 458 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 459 460 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 461 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 462 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 463 464 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 465 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 466 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 467 468 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 469 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 470 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1} 471 472 // return non-zero if not a structural or whitespace char 473 // zero otherwise 474 func isNotStructuralOrWhitespace(c byte) byte { 475 return structuralOrWhitespaceNegated[c] 476 }