github.com/dgraph-io/simdjson-go@v0.3.0/ndjson_test.go (about)

     1  /*
     2   * MinIO Cloud Storage, (C) 2020 MinIO, Inc.
     3   *
     4   * Licensed under the Apache License, Version 2.0 (the "License");
     5   * you may not use this file except in compliance with the License.
     6   * You may obtain a copy of the License at
     7   *
     8   *     http://www.apache.org/licenses/LICENSE-2.0
     9   *
    10   * Unless required by applicable law or agreed to in writing, software
    11   * distributed under the License is distributed on an "AS IS" BASIS,
    12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13   * See the License for the specific language governing permissions and
    14   * limitations under the License.
    15   */
    16  
    17  package simdjson
    18  
    19  import (
    20  	"fmt"
    21  	"io/ioutil"
    22  	"log"
    23  	"net/http"
    24  	"os"
    25  	"path/filepath"
    26  	"strings"
    27  	"testing"
    28  
    29  	"github.com/klauspost/compress/zstd"
    30  )
    31  
// demo_ndjson is a small newline-delimited JSON fixture: three single-line
// documents that are identical except for the Image.Width (800, 801, 802)
// and Image.Height (600, 601, 602) values. There is no trailing newline
// after the last document.
const demo_ndjson = `{"Image":{"Width":800,"Height":600,"Title":"View from 15th Floor","Thumbnail":{"Url":"http://www.example.com/image/481989943","Height":125,"Width":100},"Animated":false,"IDs":[116,943,234,38793]}}
{"Image":{"Width":801,"Height":601,"Title":"View from 15th Floor","Thumbnail":{"Url":"http://www.example.com/image/481989943","Height":125,"Width":100},"Animated":false,"IDs":[116,943,234,38793]}}
{"Image":{"Width":802,"Height":602,"Title":"View from 15th Floor","Thumbnail":{"Url":"http://www.example.com/image/481989943","Height":125,"Width":100},"Animated":false,"IDs":[116,943,234,38793]}}`
    35  
    36  func verifyDemoNdjson(pj internalParsedJson, t *testing.T, object int) {
    37  
    38  	const nul = '\000'
    39  
    40  	testCases := []struct {
    41  		expected []struct {
    42  			c   byte
    43  			val uint64
    44  		}
    45  	}{
    46  		{
    47  			[]struct {
    48  				c   byte
    49  				val uint64
    50  			}{
    51  				// First object
    52  				{'r', 0x33},
    53  				{'{', 0x32},
    54  				{'"', 0x2},
    55  				{nul, 0x5},
    56  				{'{', 0x31},
    57  				{'"', 0xb},
    58  				{nul, 0x5},
    59  				{'l', 0x0},
    60  				{nul, 0x320},
    61  				{'"', 0x17},
    62  				{nul, 0x6},
    63  				{'l', 0x0},
    64  				{nul, 0x258},
    65  				{'"', 0x24},
    66  				{nul, 0x5},
    67  				{'"', 0x2c},
    68  				{nul, 0x14},
    69  				{'"', 0x43},
    70  				{nul, 0x9},
    71  				{'{', 0x21},
    72  				{'"', 0x50},
    73  				{nul, 0x3},
    74  				{'"', 0x56},
    75  				{nul, 0x26},
    76  				{'"', 0x7f},
    77  				{nul, 0x6},
    78  				{'l', 0x0},
    79  				{nul, 0x7d},
    80  				{'"', 0x8c},
    81  				{nul, 0x5},
    82  				{'l', 0x0},
    83  				{nul, 0x64},
    84  				{'}', 0x13},
    85  				{'"', 0x99},
    86  				{nul, 0x8},
    87  				{'f', 0x0},
    88  				{'"', 0xaa},
    89  				{nul, 0x3},
    90  				{'[', 0x30},
    91  				{'l', 0x0},
    92  				{nul, 0x74},
    93  				{'l', 0x0},
    94  				{nul, 0x3af},
    95  				{'l', 0x0},
    96  				{nul, 0xea},
    97  				{'l', 0x0},
    98  				{nul, 0x9789},
    99  				{']', 0x26},
   100  				{'}', 0x4},
   101  				{'}', 0x1},
   102  				{'r', 0x0},
   103  				//
   104  				// Second object
   105  				{'r', 0x66},
   106  				{'{', 0x65},
   107  				{'"', 0xc7},
   108  				{nul, 0x5},
   109  				{'{', 0x64},
   110  				{'"', 0xd0},
   111  				{nul, 0x5},
   112  				{'l', 0x0},
   113  				{nul, 0x321},
   114  				{'"', 0xdc},
   115  				{nul, 0x6},
   116  				{'l', 0x0},
   117  				{nul, 0x259},
   118  				{'"', 0xe9},
   119  				{nul, 0x5},
   120  				{'"', 0xf1},
   121  				{nul, 0x14},
   122  				{'"', 0x108},
   123  				{nul, 0x9},
   124  				{'{', 0x54},
   125  				{'"', 0x115},
   126  				{nul, 0x3},
   127  				{'"', 0x11b},
   128  				{nul, 0x26},
   129  				{'"', 0x144},
   130  				{nul, 0x6},
   131  				{'l', 0x0},
   132  				{nul, 0x7d},
   133  				{'"', 0x151},
   134  				{nul, 0x5},
   135  				{'l', 0x0},
   136  				{nul, 0x64},
   137  				{'}', 0x46},
   138  				{'"', 0x15e},
   139  				{nul, 0x8},
   140  				{'f', 0x0},
   141  				{'"', 0x16f},
   142  				{nul, 0x3},
   143  				{'[', 0x63},
   144  				{'l', 0x0},
   145  				{nul, 0x74},
   146  				{'l', 0x0},
   147  				{nul, 0x3af},
   148  				{'l', 0x0},
   149  				{nul, 0xea},
   150  				{'l', 0x0},
   151  				{nul, 0x9789},
   152  				{']', 0x59},
   153  				{'}', 0x37},
   154  				{'}', 0x34},
   155  				{'r', 0x33},
   156  				//
   157  				// Third object
   158  				{'r', 0x99},
   159  				{'{', 0x98},
   160  				{'"', 0x18c},
   161  				{nul, 0x5},
   162  				{'{', 0x97},
   163  				{'"', 0x195},
   164  				{nul, 0x5},
   165  				{'l', 0x0},
   166  				{nul, 0x322},
   167  				{'"', 0x1a1},
   168  				{nul, 0x6},
   169  				{'l', 0x0},
   170  				{nul, 0x25a},
   171  				{'"', 0x1ae},
   172  				{nul, 0x5},
   173  				{'"', 0x1b6},
   174  				{nul, 0x14},
   175  				{'"', 0x1cd},
   176  				{nul, 0x9},
   177  				{'{', 0x87},
   178  				{'"', 0x1da},
   179  				{nul, 0x3},
   180  				{'"', 0x1e0},
   181  				{nul, 0x26},
   182  				{'"', 0x209},
   183  				{nul, 0x6},
   184  				{'l', 0x0},
   185  				{nul, 0x7d},
   186  				{'"', 0x216},
   187  				{nul, 0x5},
   188  				{'l', 0x0},
   189  				{nul, 0x64},
   190  				{'}', 0x79},
   191  				{'"', 0x223},
   192  				{nul, 0x8},
   193  				{'f', 0x0},
   194  				{'"', 0x234},
   195  				{nul, 0x3},
   196  				{'[', 0x96},
   197  				{'l', 0x0},
   198  				{nul, 0x74},
   199  				{'l', 0x0},
   200  				{nul, 0x3af},
   201  				{'l', 0x0},
   202  				{nul, 0xea},
   203  				{'l', 0x0},
   204  				{nul, 0x9789},
   205  				{']', 0x8c},
   206  				{'}', 0x6a},
   207  				{'}', 0x67},
   208  				{'r', 0x66},
   209  			},
   210  		},
   211  	}
   212  
   213  	tc := testCases[0]
   214  
   215  	//	For TestFindNewlineDelimiters, adjust the array that we are testing against
   216  	if object == 1 {
   217  		tc.expected = tc.expected[:51]
   218  	} else if object == 2 || object == 3 {
   219  		tc.expected = tc.expected[:51]
   220  
   221  		adjustQoutes := []uint64{2, 5, 9, 13, 15, 17, 20, 22, 24, 28, 33, 36}
   222  		for _, a := range adjustQoutes {
   223  			tc.expected[a].val += 1
   224  		}
   225  		if object == 2 {
   226  			tc.expected[8].val = 801
   227  			tc.expected[12].val = 601
   228  		} else if object == 3 {
   229  			tc.expected[8].val = 802
   230  			tc.expected[12].val = 602
   231  		}
   232  	}
   233  
   234  	if len(pj.Tape) != len(tc.expected) {
   235  		t.Errorf("verifyDemoNdjson: got: %d want: %d", len(pj.Tape), len(tc.expected))
   236  	}
   237  	for ii, tp := range pj.Tape {
   238  		//c := "'" + string(byte(tp >> 56)) + "'"
   239  		//if byte(tp >> 56) == 0 {
   240  		//	c = "nul"
   241  		//}
   242  		//fmt.Printf("{%s, 0x%x},\n", c, tp&0xffffffffffffff)
   243  		expected := tc.expected[ii].val | (uint64(tc.expected[ii].c) << 56)
   244  		if !alwaysCopyStrings && tp != expected {
   245  			t.Errorf("verifyDemoNdjson(%d): got: %016x want: %016x", ii, tp, expected)
   246  		}
   247  	}
   248  }
   249  
   250  func TestNdjsonCountWhere(t *testing.T) {
   251  	if !SupportedCPU() {
   252  		t.SkipNow()
   253  	}
   254  	if testing.Short() {
   255  		t.Skip("skipping... too long")
   256  	}
   257  	ndjson := loadFile("testdata/parking-citations-1M.json.zst")
   258  	pj, err := ParseND(ndjson, nil)
   259  	if err != nil {
   260  		t.Fatal(err)
   261  	}
   262  
   263  	const want = 110349
   264  	if result := countWhere("Make", "HOND", *pj); result != want {
   265  		t.Errorf("TestNdjsonCountWhere: got: %d want: %d", result, want)
   266  	}
   267  }
   268  
   269  func TestNdjsonCountWhere2(t *testing.T) {
   270  	if !SupportedCPU() {
   271  		t.SkipNow()
   272  	}
   273  	if testing.Short() {
   274  		t.Skip("skipping... too long")
   275  	}
   276  	ndjson := loadFile("testdata/RC_2009-01.json.zst")
   277  	// Test trimming
   278  	b := make([]byte, 0, len(ndjson)+4)
   279  	b = append(b, '\n', '\n')
   280  	b = append(b, ndjson...)
   281  	b = append(b, '\n', '\n')
   282  	pj, err := ParseND(ndjson, nil)
   283  	if err != nil {
   284  		t.Fatal(err)
   285  	}
   286  	const want = 170315
   287  	if result := countWhere("subreddit", "reddit.com", *pj); result != want {
   288  		t.Errorf("TestNdjsonCountWhere: got: %d want: %d", result, want)
   289  	}
   290  }
   291  
   292  func loadFile(filename string) []byte {
   293  	if !strings.HasSuffix(filename, ".zst") {
   294  		ndjson, err := ioutil.ReadFile(filename)
   295  		if err != nil {
   296  			panic("Failed to load file")
   297  		}
   298  		return ndjson
   299  	}
   300  	var f *os.File
   301  	var err error
   302  	for {
   303  		f, err = os.Open(filename)
   304  		if err == nil {
   305  			defer f.Close()
   306  			break
   307  		}
   308  		if os.IsNotExist(err) {
   309  			fmt.Println("downloading file" + filename)
   310  			resp, err := http.DefaultClient.Get("https://files.klauspost.com/compress/" + filepath.Base(filename))
   311  			if err == nil && resp.StatusCode == http.StatusOK {
   312  				b, err := ioutil.ReadAll(resp.Body)
   313  				if err == nil {
   314  					err = ioutil.WriteFile(filename, b, os.ModePerm)
   315  					if err == nil {
   316  						continue
   317  					}
   318  				}
   319  			}
   320  		}
   321  		panic("Failed to (down)load file:" + err.Error())
   322  	}
   323  	dec, err := zstd.NewReader(f)
   324  	if err != nil {
   325  		panic("Failed to create decompressor")
   326  	}
   327  	defer dec.Close()
   328  	ndjson, err := ioutil.ReadAll(dec)
   329  	if err != nil {
   330  		panic("Failed to load file")
   331  	}
   332  	return ndjson
   333  }
   334  
   335  func count_raw_tape(tape []uint64) (count int) {
   336  
   337  	for tapeidx := uint64(0); tapeidx < uint64(len(tape)); count++ {
   338  		tape_val := tape[tapeidx]
   339  		tapeidx = tape_val & JSONVALUEMASK
   340  	}
   341  
   342  	return
   343  }
   344  
   345  func countWhere(key, value string, data ParsedJson) (count int) {
   346  	tmpi := data.Iter()
   347  	stack := []*Iter{&tmpi}
   348  	var obj *Object
   349  	var tmp *Iter
   350  	var elem Element
   351  
   352  	for len(stack) > 0 {
   353  		iter := stack[len(stack)-1]
   354  		typ := iter.Advance()
   355  
   356  	typeswitch:
   357  		switch typ {
   358  		case TypeNone:
   359  			if len(stack) == 0 {
   360  				return
   361  			}
   362  			stack = stack[:len(stack)-1]
   363  		case TypeRoot:
   364  			var err error
   365  			typ, tmp, err = iter.Root(tmp)
   366  			if err != nil {
   367  				log.Fatal(err)
   368  			}
   369  			switch typ {
   370  			case TypeNone:
   371  				break typeswitch
   372  			case TypeObject:
   373  			default:
   374  				log.Fatalf("expected object inside root, got %v", typ)
   375  			}
   376  			if len(stack) > 2 {
   377  				break
   378  			}
   379  			obj, err = tmp.Object(obj)
   380  			if err != nil {
   381  				log.Fatal(err)
   382  			}
   383  			e := obj.FindKey(key, &elem)
   384  			if e != nil && elem.Type == TypeString {
   385  				v, _ := elem.Iter.StringBytes()
   386  				if string(v) == value {
   387  					count++
   388  				}
   389  			}
   390  		default:
   391  		}
   392  	}
   393  
   394  	return
   395  }
   396  
   397  func countObjects(data ParsedJson) (count int) {
   398  	iter := data.Iter()
   399  	for {
   400  		typ := iter.Advance()
   401  		switch typ {
   402  		case TypeNone:
   403  			return
   404  		case TypeRoot:
   405  			count++
   406  		default:
   407  			panic(typ)
   408  		}
   409  	}
   410  }
   411  
   412  func BenchmarkNdjsonWarmCountStar(b *testing.B) {
   413  	if !SupportedCPU() {
   414  		b.SkipNow()
   415  	}
   416  
   417  	ndjson := loadFile("testdata/parking-citations-1M.json.zst")
   418  
   419  	pj, err := ParseND(ndjson, nil)
   420  	if err != nil {
   421  		b.Fatal(err)
   422  	}
   423  	b.SetBytes(int64(len(ndjson)))
   424  	b.ReportAllocs()
   425  	b.ResetTimer()
   426  
   427  	for i := 0; i < b.N; i++ {
   428  		countObjects(*pj)
   429  	}
   430  }
   431  
   432  func BenchmarkNdjsonWarmCountStarWithWhere(b *testing.B) {
   433  	if !SupportedCPU() {
   434  		b.SkipNow()
   435  	}
   436  
   437  	ndjson := loadFile("testdata/parking-citations-1M.json.zst")
   438  
   439  	pj, err := ParseND(ndjson, nil)
   440  	if err != nil {
   441  		b.Fatal(err)
   442  	}
   443  
   444  	b.Run("iter", func(b *testing.B) {
   445  		b.SetBytes(int64(len(ndjson)))
   446  		b.ReportAllocs()
   447  		b.ResetTimer()
   448  		for i := 0; i < b.N; i++ {
   449  			countWhere("Make", "HOND", *pj)
   450  		}
   451  	})
   452  }