github.com/dgraph-io/simdjson-go@v0.3.0/stage2_build_tape_amd64.go (about) 1 //+build !noasm 2 //+build !appengine 3 //+build gc 4 5 /* 6 * MinIO Cloud Storage, (C) 2020 MinIO, Inc. 7 * 8 * Licensed under the Apache License, Version 2.0 (the "License"); 9 * you may not use this file except in compliance with the License. 10 * You may obtain a copy of the License at 11 * 12 * http://www.apache.org/licenses/LICENSE-2.0 13 * 14 * Unless required by applicable law or agreed to in writing, software 15 * distributed under the License is distributed on an "AS IS" BASIS, 16 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 * See the License for the specific language governing permissions and 18 * limitations under the License. 19 */ 20 21 package simdjson 22 23 import ( 24 "bytes" 25 "encoding/binary" 26 "fmt" 27 ) 28 29 // Constants for "return address" modes 30 const retAddressShift = 2 31 const retAddressStartConst = 1 32 const retAddressObjectConst = 2 33 const retAddressArrayConst = 3 34 35 func updateChar(pj *internalParsedJson, idx_in uint64) (done bool, idx uint64) { 36 if pj.indexesChan.index >= pj.indexesChan.length { 37 var ok bool 38 pj.indexesChan, ok = <-pj.indexChans // Get next element from channel 39 if !ok { 40 done = true // return done if channel closed 41 return 42 } 43 } 44 idx = idx_in + uint64(pj.indexesChan.indexes[pj.indexesChan.index]) 45 pj.indexesChan.index++ 46 return 47 } 48 49 // Handy "debug" function to see where Stage 2 fails (rename to `updateChar`) 50 func updateCharDebug(pj *internalParsedJson, idx_in uint64) (done bool, idx uint64) { 51 if pj.indexesChan.index >= pj.indexesChan.length { 52 var ok bool 53 pj.indexesChan, ok = <-pj.indexChans // Get next element from channel 54 if !ok { 55 done = true // return done if channel closed 56 return 57 } 58 } 59 idx = idx_in + uint64(pj.indexesChan.indexes[pj.indexesChan.index]) 60 fmt.Printf("At 0x%x char: %s\n", idx, string(pj.Message[idx])) 61 pj.indexesChan.index++ 62 return 63 } 64 65 func peekSize(pj *internalParsedJson) uint64 { 66 if pj.indexesChan.index >= pj.indexesChan.length { 67 //panic("cannot peek the size") // should never happen since last string element should be saved for next buffer 68 // let's return 0 for the sake of safety (could lead to a string being to short) 69 return 0 70 } 71 return uint64(pj.indexesChan.indexes[pj.indexesChan.index]) 72 } 73 74 func parseString(pj *ParsedJson, idx uint64, maxStringSize uint64) bool { 75 size := uint64(0) 76 need_copy := false 77 buf := pj.Message[idx:] 78 // Make sure that we have at least one full YMM word available after maxStringSize into the buffer 79 if len(buf)-int(maxStringSize) < 64 { 80 if len(buf) > 512-64 { // only allocated if needed 81 paddedBuf := make([]byte, len(buf)+64) 82 copy(paddedBuf, buf) 83 buf = paddedBuf 84 } else { 85 paddedBuf := [512]byte{} 86 copy(paddedBuf[:], buf) 87 buf = paddedBuf[:] 88 } 89 } 90 if !parseStringSimdValidateOnly(buf, &maxStringSize, &size, &need_copy) { 91 return false 92 } 93 if !need_copy { 94 pj.write_tape(idx+1, '"') 95 } else { 96 // Make sure we account for at least 32 bytes additional space due to 97 requiredLen := uint64(len(pj.Strings)) + size + 32 98 if requiredLen >= uint64(cap(pj.Strings)) { 99 newSize := uint64(cap(pj.Strings) * 2) 100 if newSize < requiredLen { 101 newSize = requiredLen + size // add size once more to account for further space 102 } 103 strs := make([]byte, len(pj.Strings), newSize) 104 copy(strs, pj.Strings) 105 pj.Strings = strs 106 } 107 start := len(pj.Strings) 108 _ = parseStringSimd(buf, &pj.Strings) // We can safely ignore the result since we validate above 109 pj.write_tape(uint64(STRINGBUFBIT+start), '"') 110 size = uint64(len(pj.Strings) - start) 111 } 112 // put length onto the tape 113 pj.Tape = append(pj.Tape, size) 114 return true 115 } 116 117 func addNumber(buf []byte, pj *ParsedJson) (bool, error) { 118 tag, val, flags, pos := parseNumber(buf) 119 if tag == TagEnd { 120 return false, nil 121 } 122 if FloatFlags(flags).Contains(FloatOverflowedInteger) { 123 return false, fmt.Errorf(`simdjson-go: parsing: "%s": value out of range`, string(buf[:pos])) 124 } 125 pj.writeTapeTagValFlags(tag, val, flags) 126 return true, nil 127 } 128 129 func isValidTrueAtom(buf []byte) bool { 130 if len(buf) >= 8 { // fast path when there is enough space left in the buffer 131 tv := uint64(0x0000000065757274) // "true " 132 mask4 := uint64(0x00000000ffffffff) 133 locval := binary.LittleEndian.Uint64(buf) 134 error := (locval & mask4) ^ tv 135 error |= uint64(isNotStructuralOrWhitespace(buf[4])) 136 return error == 0 137 } else if len(buf) >= 5 { 138 return bytes.Compare(buf[:4], []byte("true")) == 0 && isNotStructuralOrWhitespace(buf[4]) == 0 139 } 140 return false 141 } 142 143 func isValidFalseAtom(buf []byte) bool { 144 if len(buf) >= 8 { // fast path when there is enough space left in the buffer 145 fv := uint64(0x00000065736c6166) // "false " 146 mask5 := uint64(0x000000ffffffffff) 147 locval := binary.LittleEndian.Uint64(buf) 148 error := (locval & mask5) ^ fv 149 error |= uint64(isNotStructuralOrWhitespace(buf[5])) 150 return error == 0 151 } else if len(buf) >= 6 { 152 return bytes.Compare(buf[:5], []byte("false")) == 0 && isNotStructuralOrWhitespace(buf[5]) == 0 153 } 154 return false 155 } 156 157 func isValidNullAtom(buf []byte) bool { 158 if len(buf) >= 8 { // fast path when there is enough space left in the buffer 159 nv := uint64(0x000000006c6c756e) // "null " 160 mask4 := uint64(0x00000000ffffffff) 161 locval := binary.LittleEndian.Uint64(buf) // we want to avoid unaligned 64-bit loads (undefined in C/C++) 162 error := (locval & mask4) ^ nv 163 error |= uint64(isNotStructuralOrWhitespace(buf[4])) 164 return error == 0 165 } else if len(buf) >= 5 { 166 return bytes.Compare(buf[:4], []byte("null")) == 0 && isNotStructuralOrWhitespace(buf[4]) == 0 167 } 168 return false 169 } 170 171 func unifiedMachine(buf []byte, pj *internalParsedJson) (bool, error) { 172 173 const addOneForRoot = 1 174 175 done := false 176 idx := ^uint64(0) // location of the structural character in the input (buf) 177 offset := uint64(0) // used to contain last element of containing_scope_offset 178 179 ////////////////////////////// START STATE ///////////////////////////// 180 pj.containingScopeOffset = append(pj.containingScopeOffset, (pj.get_current_loc()<<retAddressShift)|retAddressStartConst) 181 182 pj.write_tape(0, 'r') // r for root, 0 is going to get overwritten 183 // the root is used, if nothing else, to capture the size of the tape 184 185 if done, idx = updateChar(pj, idx); done { 186 goto succeed 187 } 188 continueRoot: 189 switch buf[idx] { 190 case '{': 191 pj.containingScopeOffset = append(pj.containingScopeOffset, (pj.get_current_loc()<<retAddressShift)|retAddressStartConst) 192 pj.write_tape(0, buf[idx]) 193 goto object_begin 194 case '[': 195 pj.containingScopeOffset = append(pj.containingScopeOffset, (pj.get_current_loc()<<retAddressShift)|retAddressStartConst) 196 pj.write_tape(0, buf[idx]) 197 goto arrayBegin 198 default: 199 goto fail 200 } 201 202 startContinue: 203 // We are back at the top, read the next char and we should be done 204 if done, idx = updateChar(pj, idx); done { 205 goto succeed 206 } else { 207 // For an ndjson object, wrap up current object, start new root and check for minimum of 1 newline 208 if buf[idx] != '\n' { 209 goto fail 210 } 211 212 // Eat any empty lines 213 for buf[idx] == '\n' { 214 if done, idx = updateChar(pj, idx); done { 215 goto succeed 216 } 217 } 218 219 // Otherwise close current root 220 offset = pj.containingScopeOffset[len(pj.containingScopeOffset)-1] 221 222 // drop last element 223 pj.containingScopeOffset = pj.containingScopeOffset[:len(pj.containingScopeOffset)-1] 224 225 pj.annotate_previousloc(offset>>retAddressShift, pj.get_current_loc()+addOneForRoot) 226 pj.write_tape(offset>>retAddressShift, 'r') // r is root 227 228 // And open a new root 229 pj.containingScopeOffset = append(pj.containingScopeOffset, (pj.get_current_loc()<<retAddressShift)|retAddressStartConst) 230 pj.write_tape(0, 'r') // r for root, 0 is going to get overwritten 231 232 goto continueRoot 233 } 234 235 //////////////////////////////// OBJECT STATES ///////////////////////////// 236 237 object_begin: 238 if done, idx = updateChar(pj, idx); done { 239 goto succeed 240 } 241 switch buf[idx] { 242 case '"': 243 if !parseString(&pj.ParsedJson, idx, peekSize(pj)) { 244 goto fail 245 } 246 goto object_key_state 247 case '}': 248 goto scopeEnd // could also go to object_continue 249 default: 250 goto fail 251 } 252 253 object_key_state: 254 if done, idx = updateChar(pj, idx); done { 255 goto succeed 256 } 257 if buf[idx] != ':' { 258 goto fail 259 } 260 if done, idx = updateChar(pj, idx); done { 261 goto succeed 262 } 263 switch buf[idx] { 264 case '"': 265 if !parseString(&pj.ParsedJson, idx, peekSize(pj)) { 266 goto fail 267 } 268 269 case 't': 270 if !isValidTrueAtom(buf[idx:]) { 271 goto fail 272 } 273 pj.write_tape(0, buf[idx]) 274 275 case 'f': 276 if !isValidFalseAtom(buf[idx:]) { 277 goto fail 278 } 279 pj.write_tape(0, buf[idx]) 280 281 case 'n': 282 if !isValidNullAtom(buf[idx:]) { 283 goto fail 284 } 285 pj.write_tape(0, buf[idx]) 286 287 case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9': 288 added, err := addNumber(buf[idx:], &pj.ParsedJson) 289 if err != nil { 290 return false, err 291 } 292 if !added { 293 goto fail 294 } 295 296 case '-': 297 added, err := addNumber(buf[idx:], &pj.ParsedJson) 298 if err != nil { 299 return false, err 300 } 301 if !added { 302 goto fail 303 } 304 305 case '{': 306 pj.containingScopeOffset = append(pj.containingScopeOffset, (pj.get_current_loc()<<retAddressShift)|retAddressObjectConst) 307 pj.write_tape(0, buf[idx]) 308 // we have not yet encountered } so we need to come back for it 309 goto object_begin 310 311 case '[': 312 pj.containingScopeOffset = append(pj.containingScopeOffset, (pj.get_current_loc()<<retAddressShift)|retAddressObjectConst) 313 pj.write_tape(0, buf[idx]) 314 // we have not yet encountered } so we need to come back for it 315 goto arrayBegin 316 317 default: 318 goto fail 319 } 320 321 objectContinue: 322 if done, idx = updateChar(pj, idx); done { 323 goto succeed 324 } 325 switch buf[idx] { 326 case ',': 327 if done, idx = updateChar(pj, idx); done { 328 goto succeed 329 } 330 if buf[idx] != '"' { 331 goto fail 332 } 333 if !parseString(&pj.ParsedJson, idx, peekSize(pj)) { 334 goto fail 335 } 336 goto object_key_state 337 338 case '}': 339 goto scopeEnd 340 341 default: 342 goto fail 343 } 344 345 ////////////////////////////// COMMON STATE ///////////////////////////// 346 scopeEnd: 347 // write our tape location to the header scope 348 offset = pj.containingScopeOffset[len(pj.containingScopeOffset)-1] 349 // drop last element 350 pj.containingScopeOffset = pj.containingScopeOffset[:len(pj.containingScopeOffset)-1] 351 352 pj.write_tape(offset>>retAddressShift, buf[idx]) 353 pj.annotate_previousloc(offset>>retAddressShift, pj.get_current_loc()) 354 355 /* goto saved_state*/ 356 switch offset & ((1 << retAddressShift) - 1) { 357 case retAddressArrayConst: 358 goto arrayContinue 359 case retAddressObjectConst: 360 goto objectContinue 361 default: 362 goto startContinue 363 } 364 365 ////////////////////////////// ARRAY STATES ///////////////////////////// 366 arrayBegin: 367 if done, idx = updateChar(pj, idx); done { 368 goto succeed 369 } 370 if buf[idx] == ']' { 371 goto scopeEnd // could also go to array_continue 372 } 373 374 mainArraySwitch: 375 // we call update char on all paths in, so we can peek at c on the 376 // on paths that can accept a close square brace (post-, and at start) 377 switch buf[idx] { 378 case '"': 379 if !parseString(&pj.ParsedJson, idx, peekSize(pj)) { 380 goto fail 381 } 382 case 't': 383 if !isValidTrueAtom(buf[idx:]) { 384 goto fail 385 } 386 pj.write_tape(0, buf[idx]) 387 388 case 'f': 389 if !isValidFalseAtom(buf[idx:]) { 390 goto fail 391 } 392 pj.write_tape(0, buf[idx]) 393 394 case 'n': 395 if !isValidNullAtom(buf[idx:]) { 396 goto fail 397 } 398 pj.write_tape(0, buf[idx]) 399 /* goto array_continue */ 400 401 case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-': 402 added, err := addNumber(buf[idx:], &pj.ParsedJson) 403 if err != nil { 404 return false, err 405 } 406 if !added { 407 goto fail 408 } 409 410 case '{': 411 // we have not yet encountered ] so we need to come back for it 412 pj.containingScopeOffset = append(pj.containingScopeOffset, (pj.get_current_loc()<<retAddressShift)|retAddressArrayConst) 413 pj.write_tape(0, buf[idx]) // here the compilers knows what c is so this gets optimized 414 goto object_begin 415 416 case '[': 417 // we have not yet encountered ] so we need to come back for it 418 pj.containingScopeOffset = append(pj.containingScopeOffset, (pj.get_current_loc()<<retAddressShift)|retAddressArrayConst) 419 pj.write_tape(0, buf[idx]) // here the compilers knows what c is so this gets optimized 420 goto arrayBegin 421 422 default: 423 goto fail 424 } 425 426 arrayContinue: 427 if done, idx = updateChar(pj, idx); done { 428 goto succeed 429 } 430 switch buf[idx] { 431 case ',': 432 if done, idx = updateChar(pj, idx); done { 433 goto succeed 434 } 435 goto mainArraySwitch 436 437 case ']': 438 goto scopeEnd 439 440 default: 441 goto fail 442 } 443 444 ////////////////////////////// FINAL STATES ///////////////////////////// 445 succeed: 446 offset = pj.containingScopeOffset[len(pj.containingScopeOffset)-1] 447 // drop last element 448 pj.containingScopeOffset = pj.containingScopeOffset[:len(pj.containingScopeOffset)-1] 449 450 // Sanity checks 451 if len(pj.containingScopeOffset) != 0 { 452 return false, nil 453 } 454 455 pj.annotate_previousloc(offset>>retAddressShift, pj.get_current_loc()+addOneForRoot) 456 pj.write_tape(offset>>retAddressShift, 'r') // r is root 457 458 pj.isvalid = true 459 return true, nil 460 461 fail: 462 return false, nil 463 } 464 465 // structural chars here are 466 // they are { 0x7b } 0x7d : 0x3a [ 0x5b ] 0x5d , 0x2c (and NULL) 467 // we are also interested in the four whitespace characters 468 // space 0x20, linefeed 0x0a, horizontal tab 0x09 and carriage return 0x0d 469 470 // these are the chars that can follow a true/false/null or number atom 471 // and nothing else 472 var structuralOrWhitespaceNegated = [256]byte{ 473 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 474 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 475 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 476 477 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 478 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 479 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 480 481 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 482 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 483 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 484 485 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 486 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 487 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1} 488 489 // return non-zero if not a structural or whitespace char 490 // zero otherwise 491 func isNotStructuralOrWhitespace(c byte) byte { 492 return structuralOrWhitespaceNegated[c] 493 }