github.com/dgraph-io/simdjson-go@v0.3.0/stage1_find_marks_amd64.go (about)

     1  //+build !noasm
     2  //+build !appengine
     3  //+build gc
     4  
     5  /*
     6   * MinIO Cloud Storage, (C) 2020 MinIO, Inc.
     7   *
     8   * Licensed under the Apache License, Version 2.0 (the "License");
     9   * you may not use this file except in compliance with the License.
    10   * You may obtain a copy of the License at
    11   *
    12   *     http://www.apache.org/licenses/LICENSE-2.0
    13   *
    14   * Unless required by applicable law or agreed to in writing, software
    15   * distributed under the License is distributed on an "AS IS" BASIS,
    16   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    17   * See the License for the specific language governing permissions and
    18   * limitations under the License.
    19   */
    20  
    21  package simdjson
    22  
    23  import (
    24  	"sync/atomic"
    25  
    26  	"github.com/klauspost/cpuid/v2"
    27  )
    28  
    29  var jsonMarkupTable = [256]bool{
    30  	'{': true,
    31  	'}': true,
    32  	'[': true,
    33  	']': true,
    34  	',': true,
    35  	':': true,
    36  }
    37  
    38  func jsonMarkup(b byte) bool {
    39  	return jsonMarkupTable[b]
    40  }
    41  
    42  func findStructuralIndices(buf []byte, pj *internalParsedJson) bool {
    43  
    44  	f := find_structural_bits_in_slice
    45  	if cpuid.CPU.Has(cpuid.AVX512F) {
    46  		f = find_structural_bits_in_slice_avx512
    47  	}
    48  
    49  	// persistent state across loop
    50  	// does the last iteration end with an odd-length sequence of backslashes?
    51  	// either 0 or 1, but a 64-bit value
    52  	prev_iter_ends_odd_backslash := uint64(0)
    53  
    54  	// does the previous iteration end inside a double-quote pair?
    55  	prev_iter_inside_quote := uint64(0) // either all zeros or all ones
    56  
    57  	// does the previous iteration end on something that is a predecessor of a
    58  	// pseudo-structural character - i.e. whitespace or a structural character
    59  	// effectively the very first char is considered to follow "whitespace" for the
    60  	// purposes of pseudo-structural character detection so we initialize to 1
    61  	prev_iter_ends_pseudo_pred := uint64(1)
    62  
    63  	error_mask := uint64(0) // for unescaped characters within strings (ASCII code points < 0x20)
    64  
    65  	indexTotal := 0
    66  
    67  	// empty bits that are carried over to the next call to flatten_bits_incremental
    68  	carried := uint64(0)
    69  
    70  	// absolute position into message buffer
    71  	position := ^uint64(0)
    72  	stripped_index := ^uint64(0)
    73  
    74  	for len(buf) > 0 {
    75  
    76  		index := indexChan{}
    77  		offset := atomic.AddUint64(&pj.buffersOffset, 1)
    78  		index.indexes = &pj.buffers[offset%indexSlots]
    79  
    80  		// In case last index during previous round was stripped back, put it back
    81  		if stripped_index != ^uint64(0) {
    82  			position += stripped_index
    83  			index.indexes[0] = uint32(stripped_index)
    84  			index.length = 1
    85  			stripped_index = ^uint64(0)
    86  		}
    87  
    88  		processed := f(buf[:len(buf) & ^63], &prev_iter_ends_odd_backslash,
    89  			&prev_iter_inside_quote, &error_mask,
    90  			&prev_iter_ends_pseudo_pred,
    91  			index.indexes, &index.length, &carried, &position, pj.ndjson)
    92  
    93  		// Check if we have at most a single iteration of 64 bytes left, tag on to previous invocation
    94  		if uint64(len(buf))-processed <= 64 {
    95  			// Process last 64 bytes in larger buffer (to safeguard against reading beyond the end of the buffer)
    96  			paddedBuf := [128]byte{}
    97  			copy(paddedBuf[:], buf[processed:])
    98  			paddedBytes := uint64(len(buf)) - processed
    99  			processed += f(paddedBuf[:paddedBytes], &prev_iter_ends_odd_backslash,
   100  				&prev_iter_inside_quote, &error_mask,
   101  				&prev_iter_ends_pseudo_pred,
   102  				index.indexes, &index.length, &carried, &position, pj.ndjson)
   103  		}
   104  
   105  		if index.length == 0 { // No structural chars found, so error out
   106  			error_mask = ^uint64(0)
   107  			break
   108  		}
   109  
   110  		if uint64(len(buf)) == processed { // message processing completed?
   111  			// break out if either
   112  			// - is there an unmatched quote at the end
   113  			// - the ending structural char is not either a '}' (normal json) or a ']' (array style)
   114  			if prev_iter_inside_quote != 0 ||
   115  				position >= uint64(len(buf)) ||
   116  				!(buf[position] == '}' || buf[position] == ']') {
   117  				error_mask = ^uint64(0)
   118  				break
   119  			}
   120  		} else if !jsonMarkup(buf[position]) {
   121  			// There may be a dangling quote at the end of the index buffer
   122  			// Strip it from current index buffer and save for next round
   123  			stripped_index = uint64(index.indexes[index.length-1])
   124  			position -= stripped_index
   125  			index.length -= 1
   126  		}
   127  
   128  		pj.indexChans <- index
   129  		indexTotal += index.length
   130  
   131  		buf = buf[processed:]
   132  		position -= processed
   133  	}
   134  	close(pj.indexChans)
   135  
   136  	// a valid JSON file cannot have zero structural indexes - we should have found something
   137  	return error_mask == 0 && indexTotal > 0
   138  }