github.com/minio/simdjson-go@v0.4.6-0.20231116094823-04d21cddf993/stage2_build_tape_amd64.go (about)

     1  //go:build !noasm && !appengine && gc
     2  // +build !noasm,!appengine,gc
     3  
     4  /*
     5   * MinIO Cloud Storage, (C) 2020 MinIO, Inc.
     6   *
     7   * Licensed under the Apache License, Version 2.0 (the "License");
     8   * you may not use this file except in compliance with the License.
     9   * You may obtain a copy of the License at
    10   *
    11   *     http://www.apache.org/licenses/LICENSE-2.0
    12   *
    13   * Unless required by applicable law or agreed to in writing, software
    14   * distributed under the License is distributed on an "AS IS" BASIS,
    15   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    16   * See the License for the specific language governing permissions and
    17   * limitations under the License.
    18   */
    19  
    20  package simdjson
    21  
    22  import (
    23  	"bytes"
    24  	"encoding/binary"
    25  	"fmt"
    26  )
    27  
    28  // Constants for "return address" modes
    29  const retAddressShift = 2
    30  const retAddressStartConst = 1
    31  const retAddressObjectConst = 2
    32  const retAddressArrayConst = 3
    33  
    34  func updateChar(pj *internalParsedJson, idx_in uint64) (done bool, idx uint64) {
    35  	if pj.indexesChan.index >= pj.indexesChan.length {
    36  		pj.indexesChan = <-pj.indexChans // Get next element from channel
    37  		done = pj.indexesChan.index == -1
    38  		if done {
    39  			return
    40  		}
    41  	}
    42  	idx = idx_in + uint64(pj.indexesChan.indexes[pj.indexesChan.index])
    43  	pj.indexesChan.index++
    44  	return
    45  }
    46  
    47  // Handy "debug" function to see where Stage 2 fails (rename to `updateChar`)
    48  func updateCharDebug(pj *internalParsedJson, idx_in uint64) (done bool, idx uint64) {
    49  	if pj.indexesChan.index >= pj.indexesChan.length {
    50  		var ok bool
    51  		pj.indexesChan, ok = <-pj.indexChans // Get next element from channel
    52  		if !ok {
    53  			done = true // return done if channel closed
    54  			return
    55  		}
    56  	}
    57  	idx = idx_in + uint64(pj.indexesChan.indexes[pj.indexesChan.index])
    58  	fmt.Printf("At 0x%x char: %s\n", idx, string(pj.Message[idx]))
    59  	pj.indexesChan.index++
    60  	return
    61  }
    62  
    63  func peekSize(pj *internalParsedJson) uint64 {
    64  	if pj.indexesChan.index >= pj.indexesChan.length {
    65  		//panic("cannot peek the size") // should never happen since last string element should be saved for next buffer
    66  		// let's return 0 for the sake of safety (could lead to a string being to short)
    67  		return 0
    68  	}
    69  	return uint64(pj.indexesChan.indexes[pj.indexesChan.index])
    70  }
    71  
    72  func parseString(pj *ParsedJson, idx uint64, maxStringSize uint64, needCopy bool) bool {
    73  	size := uint64(0)
    74  	buf := pj.Message[idx:]
    75  	// Make sure that we have at least one full YMM word available after maxStringSize into the buffer
    76  	if len(buf)-int(maxStringSize) < 64 {
    77  		if len(buf) > 512-64 { // only allocated if needed
    78  			paddedBuf := make([]byte, len(buf)+64)
    79  			copy(paddedBuf, buf)
    80  			buf = paddedBuf
    81  		} else {
    82  			paddedBuf := [512]byte{}
    83  			copy(paddedBuf[:], buf)
    84  			buf = paddedBuf[:]
    85  		}
    86  	}
    87  	if !parseStringSimdValidateOnly(buf, &maxStringSize, &size, &needCopy) {
    88  		return false
    89  	}
    90  	if !needCopy {
    91  		pj.write_tape(idx+1, '"')
    92  	} else {
    93  		// Make sure we account for at least 32 bytes additional space due to
    94  		strs := pj.Strings.B
    95  		requiredLen := uint64(len(strs)) + size + 32
    96  		if requiredLen >= uint64(cap(strs)) {
    97  			newSize := uint64(cap(strs) * 2)
    98  			if newSize < requiredLen {
    99  				newSize = requiredLen + size // add size once more to account for further space
   100  			}
   101  			strs = make([]byte, len(strs), newSize)
   102  			copy(strs, pj.Strings.B)
   103  			pj.Strings.B = strs
   104  		}
   105  		start := len(strs)
   106  		_ = parseStringSimd(buf, &pj.Strings.B) // We can safely ignore the result since we validate above
   107  		pj.write_tape(uint64(STRINGBUFBIT+start), '"')
   108  		size = uint64(len(pj.Strings.B) - start)
   109  	}
   110  	// put length onto the tape
   111  	pj.Tape = append(pj.Tape, size)
   112  	return true
   113  }
   114  
   115  func addNumber(buf []byte, pj *ParsedJson) bool {
   116  	tag, val := parseNumber(buf)
   117  	if tag == 0 {
   118  		return false
   119  	}
   120  	pj.writeTapeTagValFlags(tag, val)
   121  	return true
   122  }
   123  
   124  func isValidTrueAtom(buf []byte) bool {
   125  	if len(buf) >= 5 { // fast path when there is enough space left in the buffer
   126  		const tv = uint32(0x0000000065757274) // "true    "
   127  		locval := binary.LittleEndian.Uint32(buf)
   128  		if locval == tv {
   129  			return isNotStructuralOrWhitespace(buf[4]) == 0
   130  		}
   131  	}
   132  	return false
   133  }
   134  
   135  func isValidFalseAtom(buf []byte) bool {
   136  	if len(buf) >= 8 { // fast path when there is enough space left in the buffer
   137  		const fv = uint64(0x00000065736c6166) // "false   "
   138  		const mask5 = uint64(0x000000ffffffffff)
   139  		error := uint64(isNotStructuralOrWhitespace(buf[5]))
   140  		locval := binary.LittleEndian.Uint64(buf)
   141  		error |= (locval & mask5) ^ fv
   142  		return error == 0
   143  	} else if len(buf) >= 6 {
   144  		return bytes.Equal(buf[:5], []byte("false")) && isNotStructuralOrWhitespace(buf[5]) == 0
   145  	}
   146  	return false
   147  }
   148  
   149  func isValidNullAtom(buf []byte) bool {
   150  	if len(buf) >= 5 { // fast path when there is enough space left in the buffer
   151  		const nv = 0x000000006c6c756e             // "null    "
   152  		locval := binary.LittleEndian.Uint32(buf) // we want to avoid unaligned 64-bit loads (undefined in C/C++)
   153  		if locval == nv {
   154  			return isNotStructuralOrWhitespace(buf[4]) == 0
   155  		}
   156  	}
   157  	return false
   158  }
   159  
// unifiedMachine is the stage 2 state machine: it walks the structural
// characters of pj.Message, whose positions are supplied one at a time by
// updateChar, and writes the corresponding entries to the tape.
//
// Open scopes are tracked on pj.containingScopeOffset. Each entry packs the
// tape location of the scope opener shifted left by retAddressShift, with the
// low bits holding one of the retAddress*Const values; scopeEnd uses those
// bits to decide which state to resume.
//
// The top-level value must be an object or an array. After it has been closed,
// further roots are only accepted when separated by at least one '\n'
// structural character (ndjson).
//
// ok reports whether parsing succeeded. done is the last "done" value returned
// by updateChar, i.e. whether the end of the index stream has been reached.
func (pj *internalParsedJson) unifiedMachine() (ok, done bool) {
	buf := pj.Message
	const addOneForRoot = 1

	// idx starts at all-ones so that adding the first value obtained from
	// updateChar wraps around (unsigned) to that value minus one.
	idx := ^uint64(0)   // location of the structural character in the input (buf)
	offset := uint64(0) // used to contain last element of containingScopeOffset

	////////////////////////////// START STATE /////////////////////////////
	pj.containingScopeOffset = append(pj.containingScopeOffset, (pj.get_current_loc()<<retAddressShift)|retAddressStartConst)

	pj.write_tape(0, 'r') // r for root, 0 is going to get overwritten
	// the root is used, if nothing else, to capture the size of the tape

	if done, idx = updateChar(pj, idx); done {
		goto succeed
	}
continueRoot:
	// Only objects and arrays are accepted as top-level values.
	switch buf[idx] {
	case '{':
		pj.containingScopeOffset = append(pj.containingScopeOffset, (pj.get_current_loc()<<retAddressShift)|retAddressStartConst)
		pj.write_tape(0, '{')
		goto object_begin
	case '[':
		pj.containingScopeOffset = append(pj.containingScopeOffset, (pj.get_current_loc()<<retAddressShift)|retAddressStartConst)
		pj.write_tape(0, '[')
		goto arrayBegin
	default:
		goto fail
	}

startContinue:
	// We are back at the top, read the next char and we should be done
	if done, idx = updateChar(pj, idx); done {
		goto succeed
	} else {
		// For an ndjson object, wrap up current object, start new root and check for minimum of 1 newline
		if buf[idx] != '\n' {
			goto fail
		}

		// Eat any empty lines
		for buf[idx] == '\n' {
			if done, idx = updateChar(pj, idx); done {
				goto succeed
			}
		}

		// Otherwise close current root
		offset = pj.containingScopeOffset[len(pj.containingScopeOffset)-1]

		// drop last element
		pj.containingScopeOffset = pj.containingScopeOffset[:len(pj.containingScopeOffset)-1]

		pj.annotate_previousloc(offset>>retAddressShift, pj.get_current_loc()+addOneForRoot)
		pj.write_tape(offset>>retAddressShift, 'r') // r is root

		// And open a new root
		pj.containingScopeOffset = append(pj.containingScopeOffset, (pj.get_current_loc()<<retAddressShift)|retAddressStartConst)
		pj.write_tape(0, 'r') // r for root, 0 is going to get overwritten

		goto continueRoot
	}

	//////////////////////////////// OBJECT STATES /////////////////////////////

object_begin:
	// Directly after '{': expect either the first key or an immediate '}'.
	if done, idx = updateChar(pj, idx); done {
		goto succeed
	}
	switch buf[idx] {
	case '"':
		if !parseString(&pj.ParsedJson, idx, peekSize(pj), pj.copyStrings) {
			goto fail
		}
		goto object_key_state
	case '}':
		goto scopeEnd // could also go to object_continue
	default:
		goto fail
	}

object_key_state:
	// A key has just been parsed: expect ':' followed by a value.
	if done, idx = updateChar(pj, idx); done {
		goto succeed
	}
	if buf[idx] != ':' {
		goto fail
	}
	if done, idx = updateChar(pj, idx); done {
		goto succeed
	}
	switch buf[idx] {
	case '"':
		if !parseString(&pj.ParsedJson, idx, peekSize(pj), pj.copyStrings) {
			goto fail
		}

	case 't':
		if !isValidTrueAtom(buf[idx:]) {
			goto fail
		}
		pj.write_tape(0, 't')

	case 'f':
		if !isValidFalseAtom(buf[idx:]) {
			goto fail
		}
		pj.write_tape(0, 'f')

	case 'n':
		if !isValidNullAtom(buf[idx:]) {
			goto fail
		}
		pj.write_tape(0, 'n')

	case '-':
		if !addNumber(buf[idx:], &pj.ParsedJson) {
			goto fail
		}

	case '{':
		pj.containingScopeOffset = append(pj.containingScopeOffset, (pj.get_current_loc()<<retAddressShift)|retAddressObjectConst)
		pj.write_tape(0, '{')
		// we have not yet encountered } so we need to come back for it
		goto object_begin

	case '[':
		pj.containingScopeOffset = append(pj.containingScopeOffset, (pj.get_current_loc()<<retAddressShift)|retAddressObjectConst)
		pj.write_tape(0, '[')
		// we have not yet encountered } so we need to come back for it
		goto arrayBegin

	default:
		if buf[idx] >= '0' && buf[idx] <= '9' {
			if !addNumber(buf[idx:], &pj.ParsedJson) {
				goto fail
			}
			break
		}
		goto fail
	}

	// Scalar values fall through to objectContinue.
objectContinue:
	// A value has just been completed: expect ',' plus the next key, or '}'.
	if done, idx = updateChar(pj, idx); done {
		goto succeed
	}
	switch buf[idx] {
	case ',':
		if done, idx = updateChar(pj, idx); done {
			goto succeed
		}
		if buf[idx] != '"' {
			goto fail
		}
		if !parseString(&pj.ParsedJson, idx, peekSize(pj), pj.copyStrings) {
			goto fail
		}
		goto object_key_state

	case '}':
		goto scopeEnd

	default:
		goto fail
	}

	////////////////////////////// COMMON STATE /////////////////////////////
scopeEnd:
	// write our tape location to the header scope
	offset = pj.containingScopeOffset[len(pj.containingScopeOffset)-1]
	// drop last element
	pj.containingScopeOffset = pj.containingScopeOffset[:len(pj.containingScopeOffset)-1]

	// The closing entry carries the tape location of its opener; the opener is
	// then annotated with the current tape location (presumably so that both
	// ends of the scope reference each other — confirm in annotate_previousloc).
	pj.write_tape(offset>>retAddressShift, buf[idx])
	pj.annotate_previousloc(offset>>retAddressShift, pj.get_current_loc())

	// Resume the state recorded in the low bits when the scope was opened.
	switch offset & ((1 << retAddressShift) - 1) {
	case retAddressArrayConst:
		goto arrayContinue
	case retAddressObjectConst:
		goto objectContinue
	default:
		goto startContinue
	}

	////////////////////////////// ARRAY STATES /////////////////////////////
arrayBegin:
	// Directly after '[': an immediate ']' closes the (empty) array.
	if done, idx = updateChar(pj, idx); done {
		goto succeed
	}
	if buf[idx] == ']' {
		goto scopeEnd // could also go to array_continue
	}

mainArraySwitch:
	// Every path into this label has already called updateChar, so buf[idx]
	// is the structural character that starts the next array element.
	switch buf[idx] {
	case '"':
		if !parseString(&pj.ParsedJson, idx, peekSize(pj), pj.copyStrings) {
			goto fail
		}
	case 't':
		if !isValidTrueAtom(buf[idx:]) {
			goto fail
		}
		pj.write_tape(0, 't')

	case 'f':
		if !isValidFalseAtom(buf[idx:]) {
			goto fail
		}
		pj.write_tape(0, 'f')

	case 'n':
		if !isValidNullAtom(buf[idx:]) {
			goto fail
		}
		pj.write_tape(0, 'n')
		/* goto array_continue */

	case '-':
		if !addNumber(buf[idx:], &pj.ParsedJson) {
			goto fail
		}

	case '{':
		// we have not yet encountered ] so we need to come back for it
		pj.containingScopeOffset = append(pj.containingScopeOffset, (pj.get_current_loc()<<retAddressShift)|retAddressArrayConst)
		pj.write_tape(0, '{')
		goto object_begin

	case '[':
		// we have not yet encountered ] so we need to come back for it
		pj.containingScopeOffset = append(pj.containingScopeOffset, (pj.get_current_loc()<<retAddressShift)|retAddressArrayConst)
		pj.write_tape(0, '[')
		goto arrayBegin

	default:
		if buf[idx] >= '0' && buf[idx] <= '9' {
			if !addNumber(buf[idx:], &pj.ParsedJson) {
				goto fail
			}
			break
		}
		goto fail
	}

	// Scalar elements fall through to arrayContinue.
arrayContinue:
	// An element has just been completed: expect ',' plus another element, or ']'.
	if done, idx = updateChar(pj, idx); done {
		goto succeed
	}
	switch buf[idx] {
	case ',':
		if done, idx = updateChar(pj, idx); done {
			goto succeed
		}
		goto mainArraySwitch

	case ']':
		goto scopeEnd

	default:
		goto fail
	}

	////////////////////////////// FINAL STATES /////////////////////////////
succeed:
	// Reached whenever updateChar reports done, including from inside a
	// nested scope; the check below rejects input that ended with scopes
	// still open.
	offset = pj.containingScopeOffset[len(pj.containingScopeOffset)-1]
	// drop last element
	pj.containingScopeOffset = pj.containingScopeOffset[:len(pj.containingScopeOffset)-1]

	// Sanity checks
	if len(pj.containingScopeOffset) != 0 {
		return false, done
	}

	// Close the root that is still open.
	pj.annotate_previousloc(offset>>retAddressShift, pj.get_current_loc()+addOneForRoot)
	pj.write_tape(offset>>retAddressShift, 'r') // r is root

	pj.isvalid = true
	return true, done

fail:
	return false, done
}
   447  
   448  // structural chars here are
   449  // they are { 0x7b } 0x7d : 0x3a [ 0x5b ] 0x5d , 0x2c (and NULL)
   450  // we are also interested in the four whitespace characters
   451  // space 0x20, linefeed 0x0a, horizontal tab 0x09 and carriage return 0x0d
   452  
   453  // these are the chars that can follow a true/false/null or number atom
   454  // and nothing else
   455  var structuralOrWhitespaceNegated = [256]byte{
   456  	0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
   457  	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
   458  	0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,
   459  
   460  	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
   461  	1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
   462  	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1,
   463  
   464  	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
   465  	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
   466  	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
   467  
   468  	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
   469  	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
   470  	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}
   471  
   472  // return non-zero if not a structural or whitespace char
   473  // zero otherwise
   474  func isNotStructuralOrWhitespace(c byte) byte {
   475  	return structuralOrWhitespaceNegated[c]
   476  }