github.com/minio/simdjson-go@v0.4.6-0.20231116094823-04d21cddf993/stage1_find_marks_amd64.go (about) 1 //go:build !noasm && !appengine && gc 2 // +build !noasm,!appengine,gc 3 4 /* 5 * MinIO Cloud Storage, (C) 2020 MinIO, Inc. 6 * 7 * Licensed under the Apache License, Version 2.0 (the "License"); 8 * you may not use this file except in compliance with the License. 9 * You may obtain a copy of the License at 10 * 11 * http://www.apache.org/licenses/LICENSE-2.0 12 * 13 * Unless required by applicable law or agreed to in writing, software 14 * distributed under the License is distributed on an "AS IS" BASIS, 15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 * See the License for the specific language governing permissions and 17 * limitations under the License. 18 */ 19 20 package simdjson 21 22 import ( 23 "sync/atomic" 24 25 "github.com/klauspost/cpuid/v2" 26 ) 27 28 var jsonMarkupTable = [256]bool{ 29 '{': true, 30 '}': true, 31 '[': true, 32 ']': true, 33 ',': true, 34 ':': true, 35 } 36 37 func jsonMarkup(b byte) bool { 38 return jsonMarkupTable[b] 39 } 40 41 func (pj *internalParsedJson) findStructuralIndices() bool { 42 avx512 := cpuid.CPU.Has(cpuid.AVX512F) 43 buf := pj.Message 44 // persistent state across loop 45 // does the last iteration end with an odd-length sequence of backslashes? 46 // either 0 or 1, but a 64-bit value 47 prev_iter_ends_odd_backslash := uint64(0) 48 49 // does the previous iteration end inside a double-quote pair? 50 prev_iter_inside_quote := uint64(0) // either all zeros or all ones 51 52 // does the previous iteration end on something that is a predecessor of a 53 // pseudo-structural character - i.e. whitespace or a structural character 54 // effectively the very first char is considered to follow "whitespace" for the 55 // purposes of pseudo-structural character detection so we initialize to 1 56 prev_iter_ends_pseudo_pred := uint64(1) 57 58 error_mask := uint64(0) // for unescaped characters within strings (ASCII code points < 0x20) 59 60 indexTotal := 0 61 62 // empty bits that are carried over to the next call to flatten_bits_incremental 63 carried := uint64(0) 64 65 // absolute position into message buffer 66 position := ^uint64(0) 67 stripped_index := ^uint64(0) 68 69 for len(buf) > 0 { 70 71 index := indexChan{} 72 offset := atomic.AddUint64(&pj.buffersOffset, 1) 73 index.indexes = &pj.buffers[offset%indexSlots] 74 75 // In case last index during previous round was stripped back, put it back 76 if stripped_index != ^uint64(0) { 77 position += stripped_index 78 index.indexes[0] = uint32(stripped_index) 79 index.length = 1 80 stripped_index = ^uint64(0) 81 } 82 83 var processed uint64 84 if avx512 { 85 processed = find_structural_bits_in_slice_avx512(buf[:len(buf) & ^63], &prev_iter_ends_odd_backslash, 86 &prev_iter_inside_quote, &error_mask, 87 &prev_iter_ends_pseudo_pred, 88 index.indexes, &index.length, &carried, &position, pj.ndjson) 89 } else { 90 processed = find_structural_bits_in_slice(buf[:len(buf) & ^63], &prev_iter_ends_odd_backslash, 91 &prev_iter_inside_quote, &error_mask, 92 &prev_iter_ends_pseudo_pred, 93 index.indexes, &index.length, &carried, &position, pj.ndjson) 94 } 95 96 // Check if we have at most a single iteration of 64 bytes left, tag on to previous invocation 97 if uint64(len(buf))-processed <= 64 { 98 // Process last 64 bytes in larger buffer (to safeguard against reading beyond the end of the buffer) 99 paddedBuf := [128]byte{} 100 copy(paddedBuf[:], buf[processed:]) 101 paddedBytes := uint64(len(buf)) - processed 102 if avx512 { 103 processed += find_structural_bits_in_slice_avx512(paddedBuf[:paddedBytes], &prev_iter_ends_odd_backslash, 104 &prev_iter_inside_quote, &error_mask, 105 &prev_iter_ends_pseudo_pred, 106 index.indexes, &index.length, &carried, &position, pj.ndjson) 107 } else { 108 processed += find_structural_bits_in_slice(paddedBuf[:paddedBytes], &prev_iter_ends_odd_backslash, 109 &prev_iter_inside_quote, &error_mask, 110 &prev_iter_ends_pseudo_pred, 111 index.indexes, &index.length, &carried, &position, pj.ndjson) 112 } 113 } 114 115 if index.length == 0 { // No structural chars found, so error out 116 error_mask = ^uint64(0) 117 break 118 } 119 120 if uint64(len(buf)) == processed { // message processing completed? 121 // break out if either 122 // - is there an unmatched quote at the end 123 // - the ending structural char is not either a '}' (normal json) or a ']' (array style) 124 if prev_iter_inside_quote != 0 || 125 position >= uint64(len(buf)) || 126 !(buf[position] == '}' || buf[position] == ']') { 127 error_mask = ^uint64(0) 128 break 129 } 130 } else if !jsonMarkup(buf[position]) { 131 // There may be a dangling quote at the end of the index buffer 132 // Strip it from current index buffer and save for next round 133 stripped_index = uint64(index.indexes[index.length-1]) 134 position -= stripped_index 135 index.length -= 1 136 } 137 138 pj.indexChans <- index 139 indexTotal += index.length 140 141 buf = buf[processed:] 142 position -= processed 143 } 144 pj.indexChans <- indexChan{index: -1} 145 146 // a valid JSON file cannot have zero structural indexes - we should have found something 147 return error_mask == 0 && indexTotal > 0 148 }