github.com/weaviate/weaviate@v1.24.6/adapters/repos/db/inverted/analyzer_test.go (about)

     1  //                           _       _
     2  // __      _____  __ ___   ___  __ _| |_ ___
     3  // \ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \
     4  //  \ V  V /  __/ (_| |\ V /| | (_| | ||  __/
     5  //   \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___|
     6  //
     7  //  Copyright © 2016 - 2024 Weaviate B.V. All rights reserved.
     8  //
     9  //  CONTACT: hello@weaviate.io
    10  //
    11  
    12  package inverted
    13  
    14  import (
    15  	"bytes"
    16  	"math"
    17  	"sort"
    18  	"testing"
    19  
    20  	"github.com/stretchr/testify/assert"
    21  	"github.com/stretchr/testify/require"
    22  	"github.com/weaviate/weaviate/entities/models"
    23  )
    24  
    25  func TestAnalyzer(t *testing.T) {
    26  	a := NewAnalyzer(nil)
    27  
    28  	countable := func(data []string, freq []int) []Countable {
    29  		countable := make([]Countable, len(data))
    30  		for i := range data {
    31  			countable[i] = Countable{
    32  				Data:          []byte(data[i]),
    33  				TermFrequency: float32(freq[i]),
    34  			}
    35  		}
    36  		return countable
    37  	}
    38  
    39  	t.Run("with text", func(t *testing.T) {
    40  		type testCase struct {
    41  			name              string
    42  			input             string
    43  			tokenization      string
    44  			expectedCountable []Countable
    45  		}
    46  
    47  		testCases := []testCase{
    48  			{
    49  				name:         "tokenization word, unique words",
    50  				input:        "Hello, my name is John Doe",
    51  				tokenization: models.PropertyTokenizationWord,
    52  				expectedCountable: countable(
    53  					[]string{"hello", "my", "name", "is", "john", "doe"},
    54  					[]int{1, 1, 1, 1, 1, 1},
    55  				),
    56  			},
    57  			{
    58  				name:         "tokenization word, duplicated words",
    59  				input:        "Du. Du hast. Du hast. Du hast mich gefragt.",
    60  				tokenization: models.PropertyTokenizationWord,
    61  				expectedCountable: countable(
    62  					[]string{"du", "hast", "mich", "gefragt"},
    63  					[]int{4, 3, 1, 1},
    64  				),
    65  			},
    66  			{
    67  				name:         "tokenization lowercase, unique words",
    68  				input:        "My email is john-thats-jay.ohh.age.n+alloneword@doe.com",
    69  				tokenization: models.PropertyTokenizationLowercase,
    70  				expectedCountable: countable(
    71  					[]string{"my", "email", "is", "john-thats-jay.ohh.age.n+alloneword@doe.com"},
    72  					[]int{1, 1, 1, 1},
    73  				),
    74  			},
    75  			{
    76  				name:         "tokenization lowercase, duplicated words",
    77  				input:        "Du. Du hast. Du hast. Du hast mich gefragt.",
    78  				tokenization: models.PropertyTokenizationLowercase,
    79  				expectedCountable: countable(
    80  					[]string{"du.", "du", "hast.", "hast", "mich", "gefragt."},
    81  					[]int{1, 3, 2, 1, 1, 1},
    82  				),
    83  			},
    84  			{
    85  				name:         "tokenization whitespace, unique words",
    86  				input:        "My email is john-thats-jay.ohh.age.n+alloneword@doe.com",
    87  				tokenization: models.PropertyTokenizationWhitespace,
    88  				expectedCountable: countable(
    89  					[]string{"My", "email", "is", "john-thats-jay.ohh.age.n+alloneword@doe.com"},
    90  					[]int{1, 1, 1, 1},
    91  				),
    92  			},
    93  			{
    94  				name:         "tokenization whitespace, duplicated words",
    95  				input:        "Du. Du hast. Du hast. Du hast mich gefragt.",
    96  				tokenization: models.PropertyTokenizationWhitespace,
    97  				expectedCountable: countable(
    98  					[]string{"Du.", "Du", "hast.", "hast", "mich", "gefragt."},
    99  					[]int{1, 3, 2, 1, 1, 1},
   100  				),
   101  			},
   102  			{
   103  				name:         "tokenization field",
   104  				input:        "\n Du. Du hast. Du hast. Du hast mich gefragt.\t ",
   105  				tokenization: models.PropertyTokenizationField,
   106  				expectedCountable: countable(
   107  					[]string{"Du. Du hast. Du hast. Du hast mich gefragt."},
   108  					[]int{1},
   109  				),
   110  			},
   111  			{
   112  				name:              "non existing tokenization",
   113  				input:             "Du. Du hast. Du hast. Du hast mich gefragt.",
   114  				tokenization:      "non_existing",
   115  				expectedCountable: []Countable{},
   116  			},
   117  		}
   118  
   119  		for _, tc := range testCases {
   120  			t.Run(tc.name, func(t *testing.T) {
   121  				countable := a.Text(tc.tokenization, tc.input)
   122  				assert.ElementsMatch(t, tc.expectedCountable, countable)
   123  			})
   124  		}
   125  	})
   126  
   127  	t.Run("with text array", func(t *testing.T) {
   128  		type testCase struct {
   129  			name              string
   130  			input             []string
   131  			tokenization      string
   132  			expectedCountable []Countable
   133  		}
   134  
   135  		testCases := []testCase{
   136  			{
   137  				name:         "tokenization word, unique words",
   138  				input:        []string{"Hello,", "my name is John Doe"},
   139  				tokenization: models.PropertyTokenizationWord,
   140  				expectedCountable: countable(
   141  					[]string{"hello", "my", "name", "is", "john", "doe"},
   142  					[]int{1, 1, 1, 1, 1, 1},
   143  				),
   144  			},
   145  			{
   146  				name:         "tokenization word, duplicated words",
   147  				input:        []string{"Du. Du hast. Du hast.", "Du hast mich gefragt."},
   148  				tokenization: models.PropertyTokenizationWord,
   149  				expectedCountable: countable(
   150  					[]string{"du", "hast", "mich", "gefragt"},
   151  					[]int{4, 3, 1, 1},
   152  				),
   153  			},
   154  			{
   155  				name:         "tokenization lowercase, unique words",
   156  				input:        []string{"My email", "is john-thats-jay.ohh.age.n+alloneword@doe.com"},
   157  				tokenization: models.PropertyTokenizationLowercase,
   158  				expectedCountable: countable(
   159  					[]string{"my", "email", "is", "john-thats-jay.ohh.age.n+alloneword@doe.com"},
   160  					[]int{1, 1, 1, 1},
   161  				),
   162  			},
   163  			{
   164  				name:         "tokenization lowercase, duplicated words",
   165  				input:        []string{"Du. Du hast. Du hast.", "Du hast mich gefragt."},
   166  				tokenization: models.PropertyTokenizationLowercase,
   167  				expectedCountable: countable(
   168  					[]string{"du.", "du", "hast.", "hast", "mich", "gefragt."},
   169  					[]int{1, 3, 2, 1, 1, 1},
   170  				),
   171  			},
   172  			{
   173  				name:         "tokenization whitespace, unique words",
   174  				input:        []string{"My email", "is john-thats-jay.ohh.age.n+alloneword@doe.com"},
   175  				tokenization: models.PropertyTokenizationWhitespace,
   176  				expectedCountable: countable(
   177  					[]string{"My", "email", "is", "john-thats-jay.ohh.age.n+alloneword@doe.com"},
   178  					[]int{1, 1, 1, 1},
   179  				),
   180  			},
   181  			{
   182  				name:         "tokenization whitespace, duplicated words",
   183  				input:        []string{"Du. Du hast. Du hast.", "Du hast mich gefragt."},
   184  				tokenization: models.PropertyTokenizationWhitespace,
   185  				expectedCountable: countable(
   186  					[]string{"Du.", "Du", "hast.", "hast", "mich", "gefragt."},
   187  					[]int{1, 3, 2, 1, 1, 1},
   188  				),
   189  			},
   190  			{
   191  				name:         "tokenization field",
   192  				input:        []string{"\n Du. Du hast. Du hast.", "Du hast mich gefragt.\t "},
   193  				tokenization: models.PropertyTokenizationField,
   194  				expectedCountable: countable(
   195  					[]string{"Du. Du hast. Du hast.", "Du hast mich gefragt."},
   196  					[]int{1, 1},
   197  				),
   198  			},
   199  			{
   200  				name:              "non existing tokenization",
   201  				input:             []string{"Du. Du hast. Du hast.", "Du hast mich gefragt."},
   202  				tokenization:      "non_existing",
   203  				expectedCountable: []Countable{},
   204  			},
   205  		}
   206  
   207  		for _, tc := range testCases {
   208  			t.Run(tc.name, func(t *testing.T) {
   209  				countable := a.TextArray(tc.tokenization, tc.input)
   210  				assert.ElementsMatch(t, tc.expectedCountable, countable)
   211  			})
   212  		}
   213  	})
   214  
   215  	t.Run("with int it stays sortable", func(t *testing.T) {
   216  		getData := func(in []Countable, err error) []byte {
   217  			require.Nil(t, err)
   218  			return in[0].Data
   219  		}
   220  
   221  		results := [][]byte{
   222  			getData(a.Float(math.MinInt64)),
   223  			getData(a.Int(-1000000)),
   224  			getData(a.Int(-400000)),
   225  			getData(a.Int(-20000)),
   226  			getData(a.Int(-9000)),
   227  			getData(a.Int(-301)),
   228  			getData(a.Int(-300)),
   229  			getData(a.Int(-299)),
   230  			getData(a.Int(-1)),
   231  			getData(a.Int(0)),
   232  			getData(a.Int(1)),
   233  			getData(a.Int(299)),
   234  			getData(a.Int(300)),
   235  			getData(a.Int(301)),
   236  			getData(a.Int(9000)),
   237  			getData(a.Int(20000)),
   238  			getData(a.Int(400000)),
   239  			getData(a.Int(1000000)),
   240  			getData(a.Float(math.MaxInt64)),
   241  		}
   242  
   243  		afterSort := make([][]byte, len(results))
   244  		copy(afterSort, results)
   245  		sort.Slice(afterSort, func(a, b int) bool { return bytes.Compare(afterSort[a], afterSort[b]) == -1 })
   246  		assert.Equal(t, results, afterSort)
   247  	})
   248  
   249  	t.Run("with float it stays sortable", func(t *testing.T) {
   250  		getData := func(in []Countable, err error) []byte {
   251  			require.Nil(t, err)
   252  			return in[0].Data
   253  		}
   254  
   255  		results := [][]byte{
   256  			getData(a.Float(-math.MaxFloat64)),
   257  			getData(a.Float(-1000000)),
   258  			getData(a.Float(-400000)),
   259  			getData(a.Float(-20000)),
   260  			getData(a.Float(-9000.9)),
   261  			getData(a.Float(-9000.8999)),
   262  			getData(a.Float(-9000.8998)),
   263  			getData(a.Float(-9000.79999)),
   264  			getData(a.Float(-301)),
   265  			getData(a.Float(-300)),
   266  			getData(a.Float(-299)),
   267  			getData(a.Float(-1)),
   268  			getData(a.Float(-0.09)),
   269  			getData(a.Float(-0.01)),
   270  			getData(a.Float(-0.009)),
   271  			getData(a.Float(0)),
   272  			getData(a.Float(math.SmallestNonzeroFloat64)),
   273  			getData(a.Float(0.009)),
   274  			getData(a.Float(0.01)),
   275  			getData(a.Float(0.09)),
   276  			getData(a.Float(0.1)),
   277  			getData(a.Float(0.9)),
   278  			getData(a.Float(1)),
   279  			getData(a.Float(299)),
   280  			getData(a.Float(300)),
   281  			getData(a.Float(301)),
   282  			getData(a.Float(9000)),
   283  			getData(a.Float(20000)),
   284  			getData(a.Float(400000)),
   285  			getData(a.Float(1000000)),
   286  			getData(a.Float(math.MaxFloat64)),
   287  		}
   288  
   289  		afterSort := make([][]byte, len(results))
   290  		copy(afterSort, results)
   291  		sort.Slice(afterSort, func(a, b int) bool { return bytes.Compare(afterSort[a], afterSort[b]) == -1 })
   292  		assert.Equal(t, results, afterSort)
   293  	})
   294  
   295  	t.Run("with refCount it stays sortable", func(t *testing.T) {
   296  		getData := func(in []Countable, err error) []byte {
   297  			require.Nil(t, err)
   298  			return in[0].Data
   299  		}
   300  
   301  		results := [][]byte{
   302  			getData(a.RefCount(make(models.MultipleRef, 0))),
   303  			getData(a.RefCount(make(models.MultipleRef, 1))),
   304  			getData(a.RefCount(make(models.MultipleRef, 2))),
   305  			getData(a.RefCount(make(models.MultipleRef, 99))),
   306  			getData(a.RefCount(make(models.MultipleRef, 100))),
   307  			getData(a.RefCount(make(models.MultipleRef, 101))),
   308  			getData(a.RefCount(make(models.MultipleRef, 256))),
   309  			getData(a.RefCount(make(models.MultipleRef, 300))),
   310  			getData(a.RefCount(make(models.MultipleRef, 456))),
   311  		}
   312  
   313  		afterSort := make([][]byte, len(results))
   314  		copy(afterSort, results)
   315  		sort.Slice(afterSort, func(a, b int) bool { return bytes.Compare(afterSort[a], afterSort[b]) == -1 })
   316  		assert.Equal(t, results, afterSort)
   317  	})
   318  
   319  	byteTrue := []byte{0x1}
   320  	byteFalse := []byte{0x0}
   321  
   322  	t.Run("analyze bool", func(t *testing.T) {
   323  		t.Run("true", func(t *testing.T) {
   324  			countable, err := a.Bool(true)
   325  			require.Nil(t, err)
   326  			require.Len(t, countable, 1)
   327  
   328  			c := countable[0]
   329  			assert.Equal(t, byteTrue, c.Data)
   330  			assert.Equal(t, float32(0), c.TermFrequency)
   331  		})
   332  
   333  		t.Run("false", func(t *testing.T) {
   334  			countable, err := a.Bool(false)
   335  			require.Nil(t, err)
   336  			require.Len(t, countable, 1)
   337  
   338  			c := countable[0]
   339  			assert.Equal(t, byteFalse, c.Data)
   340  			assert.Equal(t, float32(0), c.TermFrequency)
   341  		})
   342  	})
   343  
   344  	t.Run("analyze bool array", func(t *testing.T) {
   345  		type testCase struct {
   346  			name     string
   347  			values   []bool
   348  			expected [][]byte
   349  		}
   350  
   351  		testCases := []testCase{
   352  			{
   353  				name:     "[true]",
   354  				values:   []bool{true},
   355  				expected: [][]byte{byteTrue},
   356  			},
   357  			{
   358  				name:     "[false]",
   359  				values:   []bool{false},
   360  				expected: [][]byte{byteFalse},
   361  			},
   362  			{
   363  				name:     "[true, true, true]",
   364  				values:   []bool{true, true, true},
   365  				expected: [][]byte{byteTrue, byteTrue, byteTrue},
   366  			},
   367  			{
   368  				name:     "[false, false, false]",
   369  				values:   []bool{false, false, false},
   370  				expected: [][]byte{byteFalse, byteFalse, byteFalse},
   371  			},
   372  			{
   373  				name:     "[false, true, false, true]",
   374  				values:   []bool{false, true, false, true},
   375  				expected: [][]byte{byteFalse, byteTrue, byteFalse, byteTrue},
   376  			},
   377  			{
   378  				name:     "[]",
   379  				values:   []bool{},
   380  				expected: [][]byte{},
   381  			},
   382  		}
   383  
   384  		for _, tc := range testCases {
   385  			t.Run(tc.name, func(t *testing.T) {
   386  				countable, err := a.BoolArray(tc.values)
   387  				require.Nil(t, err)
   388  				require.Len(t, countable, len(tc.expected))
   389  
   390  				for i := range countable {
   391  					assert.Equal(t, tc.expected[i], countable[i].Data)
   392  					assert.Equal(t, float32(0), countable[i].TermFrequency)
   393  				}
   394  			})
   395  		}
   396  	})
   397  }
   398  
   399  func TestAnalyzer_DefaultEngPreset(t *testing.T) {
   400  	countable := func(data []string, freq []int) []Countable {
   401  		countable := make([]Countable, len(data))
   402  		for i := range data {
   403  			countable[i] = Countable{
   404  				Data:          []byte(data[i]),
   405  				TermFrequency: float32(freq[i]),
   406  			}
   407  		}
   408  		return countable
   409  	}
   410  
   411  	a := NewAnalyzer(nil)
   412  	input := "Hello you-beautiful_World"
   413  
   414  	t.Run("with text", func(t *testing.T) {
   415  		type testCase struct {
   416  			name              string
   417  			tokenization      string
   418  			input             string
   419  			expectedCountable []Countable
   420  		}
   421  
   422  		testCases := []testCase{
   423  			{
   424  				name:         "tokenization word",
   425  				tokenization: models.PropertyTokenizationWord,
   426  				input:        input,
   427  				expectedCountable: countable(
   428  					[]string{"hello", "you", "beautiful", "world"},
   429  					[]int{1, 1, 1, 1},
   430  				),
   431  			},
   432  			{
   433  				name:         "tokenization lowercase",
   434  				tokenization: models.PropertyTokenizationLowercase,
   435  				input:        input,
   436  				expectedCountable: countable(
   437  					[]string{"hello", "you-beautiful_world"},
   438  					[]int{1, 1},
   439  				),
   440  			},
   441  			{
   442  				name:         "tokenization whitespace",
   443  				tokenization: models.PropertyTokenizationWhitespace,
   444  				input:        input,
   445  				expectedCountable: countable(
   446  					[]string{"Hello", "you-beautiful_World"},
   447  					[]int{1, 1},
   448  				),
   449  			},
   450  			{
   451  				name:         "tokenization field",
   452  				tokenization: models.PropertyTokenizationField,
   453  				input:        input,
   454  				expectedCountable: countable(
   455  					[]string{"Hello you-beautiful_World"},
   456  					[]int{1},
   457  				),
   458  			},
   459  			{
   460  				name:              "non existing tokenization",
   461  				tokenization:      "non_existing",
   462  				input:             input,
   463  				expectedCountable: []Countable{},
   464  			},
   465  		}
   466  
   467  		for _, tc := range testCases {
   468  			countable := a.Text(tc.tokenization, tc.input)
   469  			assert.ElementsMatch(t, tc.expectedCountable, countable)
   470  		}
   471  	})
   472  
   473  	t.Run("with text array", func(t *testing.T) {
   474  		type testCase struct {
   475  			name              string
   476  			tokenization      string
   477  			input             []string
   478  			expectedCountable []Countable
   479  		}
   480  
   481  		testCases := []testCase{
   482  			{
   483  				name:         "tokenization word",
   484  				tokenization: models.PropertyTokenizationWord,
   485  				input:        []string{input, input},
   486  				expectedCountable: countable(
   487  					[]string{"hello", "you", "beautiful", "world"},
   488  					[]int{2, 2, 2, 2},
   489  				),
   490  			},
   491  			{
   492  				name:         "tokenization lowercase",
   493  				tokenization: models.PropertyTokenizationLowercase,
   494  				input:        []string{input, input},
   495  				expectedCountable: countable(
   496  					[]string{"hello", "you-beautiful_world"},
   497  					[]int{2, 2},
   498  				),
   499  			},
   500  			{
   501  				name:         "tokenization whitespace",
   502  				tokenization: models.PropertyTokenizationWhitespace,
   503  				input:        []string{input, input},
   504  				expectedCountable: countable(
   505  					[]string{"Hello", "you-beautiful_World"},
   506  					[]int{2, 2},
   507  				),
   508  			},
   509  			{
   510  				name:         "tokenization field",
   511  				tokenization: models.PropertyTokenizationField,
   512  				input:        []string{input, input},
   513  				expectedCountable: countable(
   514  					[]string{"Hello you-beautiful_World"},
   515  					[]int{2},
   516  				),
   517  			},
   518  			{
   519  				name:              "non existing tokenization",
   520  				tokenization:      "non_existing",
   521  				input:             []string{input, input},
   522  				expectedCountable: []Countable{},
   523  			},
   524  		}
   525  
   526  		for _, tc := range testCases {
   527  			countable := a.TextArray(tc.tokenization, tc.input)
   528  			assert.ElementsMatch(t, tc.expectedCountable, countable)
   529  		}
   530  	})
   531  }
   532  
   533  type fakeStopwordDetector struct{}
   534  
   535  func (fsd fakeStopwordDetector) IsStopword(word string) bool {
   536  	return false
   537  }
   538  
   539  func TestDedupItems(t *testing.T) {
   540  	props := []Property{
   541  		{
   542  			Name: "propNothingToDo",
   543  			Items: []Countable{
   544  				{Data: []byte("fff"), TermFrequency: 3},
   545  				{Data: []byte("eee"), TermFrequency: 2},
   546  				{Data: []byte("ddd"), TermFrequency: 1},
   547  			},
   548  		},
   549  		{
   550  			Name: "propToDedup1",
   551  			Items: []Countable{
   552  				{Data: []byte("aaa"), TermFrequency: 1},
   553  				{Data: []byte("bbb"), TermFrequency: 2},
   554  				{Data: []byte("ccc"), TermFrequency: 3},
   555  				{Data: []byte("aaa"), TermFrequency: 4},
   556  				{Data: []byte("ccc"), TermFrequency: 0},
   557  			},
   558  		},
   559  		{
   560  			Name: "propToDedup2",
   561  			Items: []Countable{
   562  				{Data: []uint8{1}, TermFrequency: 5},
   563  				{Data: []uint8{1}, TermFrequency: 4},
   564  				{Data: []uint8{1}, TermFrequency: 3},
   565  				{Data: []uint8{1}, TermFrequency: 2},
   566  				{Data: []uint8{1}, TermFrequency: 1},
   567  			},
   568  		},
   569  	}
   570  
   571  	expectedProps := []Property{
   572  		{
   573  			Name: "propNothingToDo",
   574  			Items: []Countable{
   575  				{Data: []byte("fff"), TermFrequency: 3},
   576  				{Data: []byte("eee"), TermFrequency: 2},
   577  				{Data: []byte("ddd"), TermFrequency: 1},
   578  			},
   579  		},
   580  		{
   581  			Name: "propToDedup1",
   582  			Items: []Countable{
   583  				{Data: []byte("bbb"), TermFrequency: 2},
   584  				{Data: []byte("aaa"), TermFrequency: 4},
   585  				{Data: []byte("ccc"), TermFrequency: 0},
   586  			},
   587  		},
   588  		{
   589  			Name: "propToDedup2",
   590  			Items: []Countable{
   591  				{Data: []uint8{1}, TermFrequency: 1},
   592  			},
   593  		},
   594  	}
   595  
   596  	dedupProps := DedupItems(props)
   597  	assert.Equal(t, expectedProps, dedupProps)
   598  }