github.com/minio/simdjson-go@v0.4.6-0.20231116094823-04d21cddf993/stage1_find_marks_amd64.go (about)

     1  //go:build !noasm && !appengine && gc
     2  // +build !noasm,!appengine,gc
     3  
     4  /*
     5   * MinIO Cloud Storage, (C) 2020 MinIO, Inc.
     6   *
     7   * Licensed under the Apache License, Version 2.0 (the "License");
     8   * you may not use this file except in compliance with the License.
     9   * You may obtain a copy of the License at
    10   *
    11   *     http://www.apache.org/licenses/LICENSE-2.0
    12   *
    13   * Unless required by applicable law or agreed to in writing, software
    14   * distributed under the License is distributed on an "AS IS" BASIS,
    15   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    16   * See the License for the specific language governing permissions and
    17   * limitations under the License.
    18   */
    19  
    20  package simdjson
    21  
    22  import (
    23  	"sync/atomic"
    24  
    25  	"github.com/klauspost/cpuid/v2"
    26  )
    27  
    28  var jsonMarkupTable = [256]bool{
    29  	'{': true,
    30  	'}': true,
    31  	'[': true,
    32  	']': true,
    33  	',': true,
    34  	':': true,
    35  }
    36  
    37  func jsonMarkup(b byte) bool {
    38  	return jsonMarkupTable[b]
    39  }
    40  
    41  func (pj *internalParsedJson) findStructuralIndices() bool {
    42  	avx512 := cpuid.CPU.Has(cpuid.AVX512F)
    43  	buf := pj.Message
    44  	// persistent state across loop
    45  	// does the last iteration end with an odd-length sequence of backslashes?
    46  	// either 0 or 1, but a 64-bit value
    47  	prev_iter_ends_odd_backslash := uint64(0)
    48  
    49  	// does the previous iteration end inside a double-quote pair?
    50  	prev_iter_inside_quote := uint64(0) // either all zeros or all ones
    51  
    52  	// does the previous iteration end on something that is a predecessor of a
    53  	// pseudo-structural character - i.e. whitespace or a structural character
    54  	// effectively the very first char is considered to follow "whitespace" for the
    55  	// purposes of pseudo-structural character detection so we initialize to 1
    56  	prev_iter_ends_pseudo_pred := uint64(1)
    57  
    58  	error_mask := uint64(0) // for unescaped characters within strings (ASCII code points < 0x20)
    59  
    60  	indexTotal := 0
    61  
    62  	// empty bits that are carried over to the next call to flatten_bits_incremental
    63  	carried := uint64(0)
    64  
    65  	// absolute position into message buffer
    66  	position := ^uint64(0)
    67  	stripped_index := ^uint64(0)
    68  
    69  	for len(buf) > 0 {
    70  
    71  		index := indexChan{}
    72  		offset := atomic.AddUint64(&pj.buffersOffset, 1)
    73  		index.indexes = &pj.buffers[offset%indexSlots]
    74  
    75  		// In case last index during previous round was stripped back, put it back
    76  		if stripped_index != ^uint64(0) {
    77  			position += stripped_index
    78  			index.indexes[0] = uint32(stripped_index)
    79  			index.length = 1
    80  			stripped_index = ^uint64(0)
    81  		}
    82  
    83  		var processed uint64
    84  		if avx512 {
    85  			processed = find_structural_bits_in_slice_avx512(buf[:len(buf) & ^63], &prev_iter_ends_odd_backslash,
    86  				&prev_iter_inside_quote, &error_mask,
    87  				&prev_iter_ends_pseudo_pred,
    88  				index.indexes, &index.length, &carried, &position, pj.ndjson)
    89  		} else {
    90  			processed = find_structural_bits_in_slice(buf[:len(buf) & ^63], &prev_iter_ends_odd_backslash,
    91  				&prev_iter_inside_quote, &error_mask,
    92  				&prev_iter_ends_pseudo_pred,
    93  				index.indexes, &index.length, &carried, &position, pj.ndjson)
    94  		}
    95  
    96  		// Check if we have at most a single iteration of 64 bytes left, tag on to previous invocation
    97  		if uint64(len(buf))-processed <= 64 {
    98  			// Process last 64 bytes in larger buffer (to safeguard against reading beyond the end of the buffer)
    99  			paddedBuf := [128]byte{}
   100  			copy(paddedBuf[:], buf[processed:])
   101  			paddedBytes := uint64(len(buf)) - processed
   102  			if avx512 {
   103  				processed += find_structural_bits_in_slice_avx512(paddedBuf[:paddedBytes], &prev_iter_ends_odd_backslash,
   104  					&prev_iter_inside_quote, &error_mask,
   105  					&prev_iter_ends_pseudo_pred,
   106  					index.indexes, &index.length, &carried, &position, pj.ndjson)
   107  			} else {
   108  				processed += find_structural_bits_in_slice(paddedBuf[:paddedBytes], &prev_iter_ends_odd_backslash,
   109  					&prev_iter_inside_quote, &error_mask,
   110  					&prev_iter_ends_pseudo_pred,
   111  					index.indexes, &index.length, &carried, &position, pj.ndjson)
   112  			}
   113  		}
   114  
   115  		if index.length == 0 { // No structural chars found, so error out
   116  			error_mask = ^uint64(0)
   117  			break
   118  		}
   119  
   120  		if uint64(len(buf)) == processed { // message processing completed?
   121  			// break out if either
   122  			// - is there an unmatched quote at the end
   123  			// - the ending structural char is not either a '}' (normal json) or a ']' (array style)
   124  			if prev_iter_inside_quote != 0 ||
   125  				position >= uint64(len(buf)) ||
   126  				!(buf[position] == '}' || buf[position] == ']') {
   127  				error_mask = ^uint64(0)
   128  				break
   129  			}
   130  		} else if !jsonMarkup(buf[position]) {
   131  			// There may be a dangling quote at the end of the index buffer
   132  			// Strip it from current index buffer and save for next round
   133  			stripped_index = uint64(index.indexes[index.length-1])
   134  			position -= stripped_index
   135  			index.length -= 1
   136  		}
   137  
   138  		pj.indexChans <- index
   139  		indexTotal += index.length
   140  
   141  		buf = buf[processed:]
   142  		position -= processed
   143  	}
   144  	pj.indexChans <- indexChan{index: -1}
   145  
   146  	// a valid JSON file cannot have zero structural indexes - we should have found something
   147  	return error_mask == 0 && indexTotal > 0
   148  }