github.com/dgraph-io/simdjson-go@v0.3.0/stage2_build_tape_amd64.go (about)

     1  //+build !noasm
     2  //+build !appengine
     3  //+build gc
     4  
     5  /*
     6   * MinIO Cloud Storage, (C) 2020 MinIO, Inc.
     7   *
     8   * Licensed under the Apache License, Version 2.0 (the "License");
     9   * you may not use this file except in compliance with the License.
    10   * You may obtain a copy of the License at
    11   *
    12   *     http://www.apache.org/licenses/LICENSE-2.0
    13   *
    14   * Unless required by applicable law or agreed to in writing, software
    15   * distributed under the License is distributed on an "AS IS" BASIS,
    16   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    17   * See the License for the specific language governing permissions and
    18   * limitations under the License.
    19   */
    20  
    21  package simdjson
    22  
    23  import (
    24  	"bytes"
    25  	"encoding/binary"
    26  	"fmt"
    27  )
    28  
    29  // Constants for "return address" modes
    30  const retAddressShift = 2
    31  const retAddressStartConst = 1
    32  const retAddressObjectConst = 2
    33  const retAddressArrayConst = 3
    34  
    35  func updateChar(pj *internalParsedJson, idx_in uint64) (done bool, idx uint64) {
    36  	if pj.indexesChan.index >= pj.indexesChan.length {
    37  		var ok bool
    38  		pj.indexesChan, ok = <-pj.indexChans // Get next element from channel
    39  		if !ok {
    40  			done = true // return done if channel closed
    41  			return
    42  		}
    43  	}
    44  	idx = idx_in + uint64(pj.indexesChan.indexes[pj.indexesChan.index])
    45  	pj.indexesChan.index++
    46  	return
    47  }
    48  
    49  // Handy "debug" function to see where Stage 2 fails (rename to `updateChar`)
    50  func updateCharDebug(pj *internalParsedJson, idx_in uint64) (done bool, idx uint64) {
    51  	if pj.indexesChan.index >= pj.indexesChan.length {
    52  		var ok bool
    53  		pj.indexesChan, ok = <-pj.indexChans // Get next element from channel
    54  		if !ok {
    55  			done = true // return done if channel closed
    56  			return
    57  		}
    58  	}
    59  	idx = idx_in + uint64(pj.indexesChan.indexes[pj.indexesChan.index])
    60  	fmt.Printf("At 0x%x char: %s\n", idx, string(pj.Message[idx]))
    61  	pj.indexesChan.index++
    62  	return
    63  }
    64  
    65  func peekSize(pj *internalParsedJson) uint64 {
    66  	if pj.indexesChan.index >= pj.indexesChan.length {
    67  		//panic("cannot peek the size") // should never happen since last string element should be saved for next buffer
    68  		// let's return 0 for the sake of safety (could lead to a string being to short)
    69  		return 0
    70  	}
    71  	return uint64(pj.indexesChan.indexes[pj.indexesChan.index])
    72  }
    73  
    74  func parseString(pj *ParsedJson, idx uint64, maxStringSize uint64) bool {
    75  	size := uint64(0)
    76  	need_copy := false
    77  	buf := pj.Message[idx:]
    78  	// Make sure that we have at least one full YMM word available after maxStringSize into the buffer
    79  	if len(buf)-int(maxStringSize) < 64 {
    80  		if len(buf) > 512-64 { // only allocated if needed
    81  			paddedBuf := make([]byte, len(buf)+64)
    82  			copy(paddedBuf, buf)
    83  			buf = paddedBuf
    84  		} else {
    85  			paddedBuf := [512]byte{}
    86  			copy(paddedBuf[:], buf)
    87  			buf = paddedBuf[:]
    88  		}
    89  	}
    90  	if !parseStringSimdValidateOnly(buf, &maxStringSize, &size, &need_copy) {
    91  		return false
    92  	}
    93  	if !need_copy {
    94  		pj.write_tape(idx+1, '"')
    95  	} else {
    96  		// Make sure we account for at least 32 bytes additional space due to
    97  		requiredLen := uint64(len(pj.Strings)) + size + 32
    98  		if requiredLen >= uint64(cap(pj.Strings)) {
    99  			newSize := uint64(cap(pj.Strings) * 2)
   100  			if newSize < requiredLen {
   101  				newSize = requiredLen + size // add size once more to account for further space
   102  			}
   103  			strs := make([]byte, len(pj.Strings), newSize)
   104  			copy(strs, pj.Strings)
   105  			pj.Strings = strs
   106  		}
   107  		start := len(pj.Strings)
   108  		_ = parseStringSimd(buf, &pj.Strings) // We can safely ignore the result since we validate above
   109  		pj.write_tape(uint64(STRINGBUFBIT+start), '"')
   110  		size = uint64(len(pj.Strings) - start)
   111  	}
   112  	// put length onto the tape
   113  	pj.Tape = append(pj.Tape, size)
   114  	return true
   115  }
   116  
   117  func addNumber(buf []byte, pj *ParsedJson) (bool, error) {
   118  	tag, val, flags, pos := parseNumber(buf)
   119  	if tag == TagEnd {
   120  		return false, nil
   121  	}
   122  	if FloatFlags(flags).Contains(FloatOverflowedInteger) {
   123  		return false, fmt.Errorf(`simdjson-go: parsing: "%s": value out of range`, string(buf[:pos]))
   124  	}
   125  	pj.writeTapeTagValFlags(tag, val, flags)
   126  	return true, nil
   127  }
   128  
   129  func isValidTrueAtom(buf []byte) bool {
   130  	if len(buf) >= 8 { // fast path when there is enough space left in the buffer
   131  		tv := uint64(0x0000000065757274) // "true    "
   132  		mask4 := uint64(0x00000000ffffffff)
   133  		locval := binary.LittleEndian.Uint64(buf)
   134  		error := (locval & mask4) ^ tv
   135  		error |= uint64(isNotStructuralOrWhitespace(buf[4]))
   136  		return error == 0
   137  	} else if len(buf) >= 5 {
   138  		return bytes.Compare(buf[:4], []byte("true")) == 0 && isNotStructuralOrWhitespace(buf[4]) == 0
   139  	}
   140  	return false
   141  }
   142  
   143  func isValidFalseAtom(buf []byte) bool {
   144  	if len(buf) >= 8 { // fast path when there is enough space left in the buffer
   145  		fv := uint64(0x00000065736c6166) // "false   "
   146  		mask5 := uint64(0x000000ffffffffff)
   147  		locval := binary.LittleEndian.Uint64(buf)
   148  		error := (locval & mask5) ^ fv
   149  		error |= uint64(isNotStructuralOrWhitespace(buf[5]))
   150  		return error == 0
   151  	} else if len(buf) >= 6 {
   152  		return bytes.Compare(buf[:5], []byte("false")) == 0 && isNotStructuralOrWhitespace(buf[5]) == 0
   153  	}
   154  	return false
   155  }
   156  
   157  func isValidNullAtom(buf []byte) bool {
   158  	if len(buf) >= 8 { // fast path when there is enough space left in the buffer
   159  		nv := uint64(0x000000006c6c756e) // "null    "
   160  		mask4 := uint64(0x00000000ffffffff)
   161  		locval := binary.LittleEndian.Uint64(buf) // we want to avoid unaligned 64-bit loads (undefined in C/C++)
   162  		error := (locval & mask4) ^ nv
   163  		error |= uint64(isNotStructuralOrWhitespace(buf[4]))
   164  		return error == 0
   165  	} else if len(buf) >= 5 {
   166  		return bytes.Compare(buf[:4], []byte("null")) == 0 && isNotStructuralOrWhitespace(buf[4]) == 0
   167  	}
   168  	return false
   169  }
   170  
   171  func unifiedMachine(buf []byte, pj *internalParsedJson) (bool, error) {
   172  
   173  	const addOneForRoot = 1
   174  
   175  	done := false
   176  	idx := ^uint64(0)   // location of the structural character in the input (buf)
   177  	offset := uint64(0) // used to contain last element of containing_scope_offset
   178  
   179  	////////////////////////////// START STATE /////////////////////////////
   180  	pj.containingScopeOffset = append(pj.containingScopeOffset, (pj.get_current_loc()<<retAddressShift)|retAddressStartConst)
   181  
   182  	pj.write_tape(0, 'r') // r for root, 0 is going to get overwritten
   183  	// the root is used, if nothing else, to capture the size of the tape
   184  
   185  	if done, idx = updateChar(pj, idx); done {
   186  		goto succeed
   187  	}
   188  continueRoot:
   189  	switch buf[idx] {
   190  	case '{':
   191  		pj.containingScopeOffset = append(pj.containingScopeOffset, (pj.get_current_loc()<<retAddressShift)|retAddressStartConst)
   192  		pj.write_tape(0, buf[idx])
   193  		goto object_begin
   194  	case '[':
   195  		pj.containingScopeOffset = append(pj.containingScopeOffset, (pj.get_current_loc()<<retAddressShift)|retAddressStartConst)
   196  		pj.write_tape(0, buf[idx])
   197  		goto arrayBegin
   198  	default:
   199  		goto fail
   200  	}
   201  
   202  startContinue:
   203  	// We are back at the top, read the next char and we should be done
   204  	if done, idx = updateChar(pj, idx); done {
   205  		goto succeed
   206  	} else {
   207  		// For an ndjson object, wrap up current object, start new root and check for minimum of 1 newline
   208  		if buf[idx] != '\n' {
   209  			goto fail
   210  		}
   211  
   212  		// Eat any empty lines
   213  		for buf[idx] == '\n' {
   214  			if done, idx = updateChar(pj, idx); done {
   215  				goto succeed
   216  			}
   217  		}
   218  
   219  		// Otherwise close current root
   220  		offset = pj.containingScopeOffset[len(pj.containingScopeOffset)-1]
   221  
   222  		// drop last element
   223  		pj.containingScopeOffset = pj.containingScopeOffset[:len(pj.containingScopeOffset)-1]
   224  
   225  		pj.annotate_previousloc(offset>>retAddressShift, pj.get_current_loc()+addOneForRoot)
   226  		pj.write_tape(offset>>retAddressShift, 'r') // r is root
   227  
   228  		// And open a new root
   229  		pj.containingScopeOffset = append(pj.containingScopeOffset, (pj.get_current_loc()<<retAddressShift)|retAddressStartConst)
   230  		pj.write_tape(0, 'r') // r for root, 0 is going to get overwritten
   231  
   232  		goto continueRoot
   233  	}
   234  
   235  	//////////////////////////////// OBJECT STATES /////////////////////////////
   236  
   237  object_begin:
   238  	if done, idx = updateChar(pj, idx); done {
   239  		goto succeed
   240  	}
   241  	switch buf[idx] {
   242  	case '"':
   243  		if !parseString(&pj.ParsedJson, idx, peekSize(pj)) {
   244  			goto fail
   245  		}
   246  		goto object_key_state
   247  	case '}':
   248  		goto scopeEnd // could also go to object_continue
   249  	default:
   250  		goto fail
   251  	}
   252  
   253  object_key_state:
   254  	if done, idx = updateChar(pj, idx); done {
   255  		goto succeed
   256  	}
   257  	if buf[idx] != ':' {
   258  		goto fail
   259  	}
   260  	if done, idx = updateChar(pj, idx); done {
   261  		goto succeed
   262  	}
   263  	switch buf[idx] {
   264  	case '"':
   265  		if !parseString(&pj.ParsedJson, idx, peekSize(pj)) {
   266  			goto fail
   267  		}
   268  
   269  	case 't':
   270  		if !isValidTrueAtom(buf[idx:]) {
   271  			goto fail
   272  		}
   273  		pj.write_tape(0, buf[idx])
   274  
   275  	case 'f':
   276  		if !isValidFalseAtom(buf[idx:]) {
   277  			goto fail
   278  		}
   279  		pj.write_tape(0, buf[idx])
   280  
   281  	case 'n':
   282  		if !isValidNullAtom(buf[idx:]) {
   283  			goto fail
   284  		}
   285  		pj.write_tape(0, buf[idx])
   286  
   287  	case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9':
   288  		added, err := addNumber(buf[idx:], &pj.ParsedJson)
   289  		if err != nil {
   290  			return false, err
   291  		}
   292  		if !added {
   293  			goto fail
   294  		}
   295  
   296  	case '-':
   297  		added, err := addNumber(buf[idx:], &pj.ParsedJson)
   298  		if err != nil {
   299  			return false, err
   300  		}
   301  		if !added {
   302  			goto fail
   303  		}
   304  
   305  	case '{':
   306  		pj.containingScopeOffset = append(pj.containingScopeOffset, (pj.get_current_loc()<<retAddressShift)|retAddressObjectConst)
   307  		pj.write_tape(0, buf[idx])
   308  		// we have not yet encountered } so we need to come back for it
   309  		goto object_begin
   310  
   311  	case '[':
   312  		pj.containingScopeOffset = append(pj.containingScopeOffset, (pj.get_current_loc()<<retAddressShift)|retAddressObjectConst)
   313  		pj.write_tape(0, buf[idx])
   314  		// we have not yet encountered } so we need to come back for it
   315  		goto arrayBegin
   316  
   317  	default:
   318  		goto fail
   319  	}
   320  
   321  objectContinue:
   322  	if done, idx = updateChar(pj, idx); done {
   323  		goto succeed
   324  	}
   325  	switch buf[idx] {
   326  	case ',':
   327  		if done, idx = updateChar(pj, idx); done {
   328  			goto succeed
   329  		}
   330  		if buf[idx] != '"' {
   331  			goto fail
   332  		}
   333  		if !parseString(&pj.ParsedJson, idx, peekSize(pj)) {
   334  			goto fail
   335  		}
   336  		goto object_key_state
   337  
   338  	case '}':
   339  		goto scopeEnd
   340  
   341  	default:
   342  		goto fail
   343  	}
   344  
   345  	////////////////////////////// COMMON STATE /////////////////////////////
   346  scopeEnd:
   347  	// write our tape location to the header scope
   348  	offset = pj.containingScopeOffset[len(pj.containingScopeOffset)-1]
   349  	// drop last element
   350  	pj.containingScopeOffset = pj.containingScopeOffset[:len(pj.containingScopeOffset)-1]
   351  
   352  	pj.write_tape(offset>>retAddressShift, buf[idx])
   353  	pj.annotate_previousloc(offset>>retAddressShift, pj.get_current_loc())
   354  
   355  	/* goto saved_state*/
   356  	switch offset & ((1 << retAddressShift) - 1) {
   357  	case retAddressArrayConst:
   358  		goto arrayContinue
   359  	case retAddressObjectConst:
   360  		goto objectContinue
   361  	default:
   362  		goto startContinue
   363  	}
   364  
   365  	////////////////////////////// ARRAY STATES /////////////////////////////
   366  arrayBegin:
   367  	if done, idx = updateChar(pj, idx); done {
   368  		goto succeed
   369  	}
   370  	if buf[idx] == ']' {
   371  		goto scopeEnd // could also go to array_continue
   372  	}
   373  
   374  mainArraySwitch:
   375  	// we call update char on all paths in, so we can peek at c on the
   376  	// on paths that can accept a close square brace (post-, and at start)
   377  	switch buf[idx] {
   378  	case '"':
   379  		if !parseString(&pj.ParsedJson, idx, peekSize(pj)) {
   380  			goto fail
   381  		}
   382  	case 't':
   383  		if !isValidTrueAtom(buf[idx:]) {
   384  			goto fail
   385  		}
   386  		pj.write_tape(0, buf[idx])
   387  
   388  	case 'f':
   389  		if !isValidFalseAtom(buf[idx:]) {
   390  			goto fail
   391  		}
   392  		pj.write_tape(0, buf[idx])
   393  
   394  	case 'n':
   395  		if !isValidNullAtom(buf[idx:]) {
   396  			goto fail
   397  		}
   398  		pj.write_tape(0, buf[idx])
   399  		/* goto array_continue */
   400  
   401  	case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-':
   402  		added, err := addNumber(buf[idx:], &pj.ParsedJson)
   403  		if err != nil {
   404  			return false, err
   405  		}
   406  		if !added {
   407  			goto fail
   408  		}
   409  
   410  	case '{':
   411  		// we have not yet encountered ] so we need to come back for it
   412  		pj.containingScopeOffset = append(pj.containingScopeOffset, (pj.get_current_loc()<<retAddressShift)|retAddressArrayConst)
   413  		pj.write_tape(0, buf[idx]) //  here the compilers knows what c is so this gets optimized
   414  		goto object_begin
   415  
   416  	case '[':
   417  		// we have not yet encountered ] so we need to come back for it
   418  		pj.containingScopeOffset = append(pj.containingScopeOffset, (pj.get_current_loc()<<retAddressShift)|retAddressArrayConst)
   419  		pj.write_tape(0, buf[idx]) // here the compilers knows what c is so this gets optimized
   420  		goto arrayBegin
   421  
   422  	default:
   423  		goto fail
   424  	}
   425  
   426  arrayContinue:
   427  	if done, idx = updateChar(pj, idx); done {
   428  		goto succeed
   429  	}
   430  	switch buf[idx] {
   431  	case ',':
   432  		if done, idx = updateChar(pj, idx); done {
   433  			goto succeed
   434  		}
   435  		goto mainArraySwitch
   436  
   437  	case ']':
   438  		goto scopeEnd
   439  
   440  	default:
   441  		goto fail
   442  	}
   443  
   444  	////////////////////////////// FINAL STATES /////////////////////////////
   445  succeed:
   446  	offset = pj.containingScopeOffset[len(pj.containingScopeOffset)-1]
   447  	// drop last element
   448  	pj.containingScopeOffset = pj.containingScopeOffset[:len(pj.containingScopeOffset)-1]
   449  
   450  	// Sanity checks
   451  	if len(pj.containingScopeOffset) != 0 {
   452  		return false, nil
   453  	}
   454  
   455  	pj.annotate_previousloc(offset>>retAddressShift, pj.get_current_loc()+addOneForRoot)
   456  	pj.write_tape(offset>>retAddressShift, 'r') // r is root
   457  
   458  	pj.isvalid = true
   459  	return true, nil
   460  
   461  fail:
   462  	return false, nil
   463  }
   464  
   465  // structural chars here are
   466  // they are { 0x7b } 0x7d : 0x3a [ 0x5b ] 0x5d , 0x2c (and NULL)
   467  // we are also interested in the four whitespace characters
   468  // space 0x20, linefeed 0x0a, horizontal tab 0x09 and carriage return 0x0d
   469  
   470  // these are the chars that can follow a true/false/null or number atom
   471  // and nothing else
   472  var structuralOrWhitespaceNegated = [256]byte{
   473  	0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
   474  	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
   475  	0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,
   476  
   477  	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
   478  	1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
   479  	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1,
   480  
   481  	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
   482  	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
   483  	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
   484  
   485  	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
   486  	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
   487  	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}
   488  
   489  // return non-zero if not a structural or whitespace char
   490  // zero otherwise
   491  func isNotStructuralOrWhitespace(c byte) byte {
   492  	return structuralOrWhitespaceNegated[c]
   493  }