github.com/weaviate/weaviate@v1.24.6/modules/text2vec-contextionary/vectorizer/schema_config_test.go (about)

     1  //                           _       _
     2  // __      _____  __ ___   ___  __ _| |_ ___
     3  // \ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \
     4  //  \ V  V /  __/ (_| |\ V /| | (_| | ||  __/
     5  //   \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___|
     6  //
     7  //  Copyright © 2016 - 2024 Weaviate B.V. All rights reserved.
     8  //
     9  //  CONTACT: hello@weaviate.io
    10  //
    11  
    12  package vectorizer
    13  
    14  import (
    15  	"context"
    16  	"testing"
    17  
    18  	"github.com/sirupsen/logrus"
    19  	ltest "github.com/sirupsen/logrus/hooks/test"
    20  	"github.com/stretchr/testify/assert"
    21  	"github.com/stretchr/testify/require"
    22  	"github.com/weaviate/weaviate/entities/models"
    23  	"github.com/weaviate/weaviate/entities/schema"
    24  )
    25  
    26  func TestConfigValidator(t *testing.T) {
    27  	t.Run("validate class names", func(t *testing.T) {
    28  		type testCase struct {
    29  			input     string
    30  			valid     bool
    31  			name      string
    32  			vectorize bool
    33  		}
    34  
    35  		// for all test cases keep in mind that the word "carrot" is not present in
    36  		// the fake c11y, but every other word is.
    37  		//
    38  		// Additionally, the word "the" is a stopword
    39  		//
    40  		// all inputs represent class names (!)
    41  		tests := []testCase{
    42  			// valid names
    43  			{
    44  				name:      "Single uppercase word present in the c11y",
    45  				input:     "Car",
    46  				valid:     true,
    47  				vectorize: true,
    48  			},
    49  			{
    50  				name:      "Single lowercase word present in the c11y, stored as uppercase",
    51  				input:     "car",
    52  				valid:     true,
    53  				vectorize: true,
    54  			},
    55  			{
    56  				name:      "combination of valid words starting with uppercase letter",
    57  				input:     "CarGarage",
    58  				valid:     true,
    59  				vectorize: true,
    60  			},
    61  			{
    62  				name:      "combination of valid words starting with lowercase letter, stored as uppercase",
    63  				input:     "carGarage",
    64  				valid:     true,
    65  				vectorize: true,
    66  			},
    67  			{
    68  				name:      "combination of valid words and stopwords, starting with uppercase",
    69  				input:     "TheCarGarage",
    70  				valid:     true,
    71  				vectorize: true,
    72  			},
    73  			{
    74  				name:      "combination of valid words and stopwords starting with lowercase letter, stored as uppercase",
    75  				input:     "carTheGarage",
    76  				valid:     true,
    77  				vectorize: true,
    78  			},
    79  
    80  			// invalid names
    81  			{
    82  				name:      "Single uppercase word NOT present in the c11y",
    83  				input:     "Carrot",
    84  				valid:     false,
    85  				vectorize: true,
    86  			},
    87  			{
    88  				name:      "Single lowercase word NOT present in the c11y",
    89  				input:     "carrot",
    90  				valid:     false,
    91  				vectorize: true,
    92  			},
    93  			{
    94  				name:      "Single uppercase stopword",
    95  				input:     "The",
    96  				valid:     false,
    97  				vectorize: true,
    98  			},
    99  			{
   100  				name:      "Single lowercase stopword",
   101  				input:     "the",
   102  				valid:     false,
   103  				vectorize: true,
   104  			},
   105  			{
   106  				name:      "combination of valid and invalid words, valid word first lowercased",
   107  				input:     "potatoCarrot",
   108  				valid:     false,
   109  				vectorize: true,
   110  			},
   111  			{
   112  				name:      "combination of valid and invalid words, valid word first uppercased",
   113  				input:     "PotatoCarrot",
   114  				valid:     false,
   115  				vectorize: true,
   116  			},
   117  			{
   118  				name:      "combination of valid and invalid words, invalid word first lowercased",
   119  				input:     "carrotPotato",
   120  				valid:     false,
   121  				vectorize: true,
   122  			},
   123  			{
   124  				name:      "combination of valid and invalid words, invalid word first uppercased",
   125  				input:     "CarrotPotato",
   126  				valid:     false,
   127  				vectorize: true,
   128  			},
   129  			{
   130  				name:      "combination of only stopwords, starting with lowercase",
   131  				input:     "theThe",
   132  				valid:     false,
   133  				vectorize: true,
   134  			},
   135  			{
   136  				name:      "combination of only stopwords, starting with uppercase",
   137  				input:     "TheThe",
   138  				valid:     false,
   139  				vectorize: true,
   140  			},
   141  
   142  			// vectorize turned off
   143  			{
   144  				name:      "non-vectorized: combination of only stopwords, starting with uppercase",
   145  				input:     "TheThe",
   146  				valid:     true,
   147  				vectorize: false,
   148  			},
   149  			{
   150  				name:      "non-vectorized: excluded word",
   151  				input:     "carrot",
   152  				valid:     true,
   153  				vectorize: false,
   154  			},
   155  		}
   156  
   157  		for _, test := range tests {
   158  			t.Run(test.name+" object class", func(t *testing.T) {
   159  				class := &models.Class{
   160  					Class: test.input,
   161  					Properties: []*models.Property{{
   162  						Name:         "dummyPropSoWeDontRunIntoAllNoindexedError",
   163  						DataType:     schema.DataTypeText.PropString(),
   164  						Tokenization: models.PropertyTokenizationWhitespace,
   165  					}},
   166  				}
   167  
   168  				logger, _ := ltest.NewNullLogger()
   169  				v := NewConfigValidator(&fakeRemote{}, logger)
   170  				err := v.Do(context.Background(), class, nil, &fakeIndexChecker{
   171  					vectorizeClassName: test.vectorize,
   172  					propertyIndexed:    true,
   173  				})
   174  				assert.Equal(t, test.valid, err == nil)
   175  
   176  				// only proceed if input was supposed to be valid
   177  				if test.valid == false {
   178  					return
   179  				}
   180  			})
   181  		}
   182  	})
   183  
   184  	t.Run("validate property names", func(t *testing.T) {
   185  		type testCase struct {
   186  			input     string
   187  			valid     bool
   188  			name      string
   189  			vectorize bool
   190  		}
   191  
   192  		// for all test cases keep in mind that the word "carrot" is not present in
   193  		// the fake c11y, but every other word is
   194  		//
   195  		// all inputs represent property names (!)
   196  		tests := []testCase{
   197  			// valid names
   198  			{
   199  				name:      "Single uppercase word present in the c11y, stored as lowercase",
   200  				input:     "Brand",
   201  				valid:     true,
   202  				vectorize: true,
   203  			},
   204  			{
   205  				name:      "Single lowercase word present in the c11y",
   206  				input:     "brand",
   207  				valid:     true,
   208  				vectorize: true,
   209  			},
   210  			{
   211  				name:      "combination of valid words starting with uppercase letter, stored as lowercase",
   212  				input:     "BrandGarage",
   213  				valid:     true,
   214  				vectorize: true,
   215  			},
   216  			{
   217  				name:      "combination of valid words starting with lowercase letter",
   218  				input:     "brandGarage",
   219  				valid:     true,
   220  				vectorize: true,
   221  			},
   222  			{
   223  				name:      "combination of valid words and stop words starting with uppercase letter, stored as lowercase",
   224  				input:     "TheGarage",
   225  				valid:     true,
   226  				vectorize: true,
   227  			},
   228  			{
   229  				name:      "combination of valid words and stop words starting with lowercase letter",
   230  				input:     "theGarage",
   231  				valid:     true,
   232  				vectorize: true,
   233  			},
   234  
   235  			// invalid names
   236  			{
   237  				name:      "Single uppercase word NOT present in the c11y",
   238  				input:     "Carrot",
   239  				valid:     false,
   240  				vectorize: true,
   241  			},
   242  			{
   243  				name:      "Single lowercase word NOT present in the c11y",
   244  				input:     "carrot",
   245  				valid:     false,
   246  				vectorize: true,
   247  			},
   248  			{
   249  				name:      "Single lowercase stop word",
   250  				input:     "the",
   251  				valid:     false,
   252  				vectorize: true,
   253  			},
   254  			{
   255  				name:      "combination of valid and invalid words, valid word first lowercased",
   256  				input:     "potatoCarrot",
   257  				valid:     false,
   258  				vectorize: true,
   259  			},
   260  			{
   261  				name:      "combination of valid and invalid words, valid word first uppercased",
   262  				input:     "PotatoCarrot",
   263  				valid:     false,
   264  				vectorize: true,
   265  			},
   266  			{
   267  				name:      "combination of valid and invalid words, invalid word first lowercased",
   268  				input:     "carrotPotato",
   269  				valid:     false,
   270  				vectorize: true,
   271  			},
   272  			{
   273  				name:      "combination of valid and invalid words, invalid word first uppercased",
   274  				input:     "CarrotPotato",
   275  				valid:     false,
   276  				vectorize: true,
   277  			},
   278  			{
   279  				name:      "combination of only stop words,  first lowercased",
   280  				input:     "theThe",
   281  				valid:     false,
   282  				vectorize: true,
   283  			},
   284  			{
   285  				name:      "combination of only stop words, first uppercased",
   286  				input:     "TheThe",
   287  				valid:     false,
   288  				vectorize: true,
   289  			},
   290  
   291  			// without vectorizing
   292  			{
   293  				name:      "non-vectorizing: combination of only stop words, first uppercased",
   294  				input:     "TheThe",
   295  				valid:     true,
   296  				vectorize: false,
   297  			},
   298  			{
   299  				name:      "non-vectorizing: combination of only stop words, first uppercased",
   300  				input:     "carrot",
   301  				valid:     true,
   302  				vectorize: false,
   303  			},
   304  		}
   305  
   306  		for _, test := range tests {
   307  			t.Run(test.name+" object class", func(t *testing.T) {
   308  				class := &models.Class{
   309  					Class: "ValidName",
   310  					Properties: []*models.Property{{
   311  						DataType:     schema.DataTypeText.PropString(),
   312  						Tokenization: models.PropertyTokenizationWhitespace,
   313  						Name:         test.input,
   314  					}},
   315  				}
   316  
   317  				logger, _ := ltest.NewNullLogger()
   318  				v := NewConfigValidator(&fakeRemote{}, logger)
   319  				err := v.Do(context.Background(), class, nil, &fakeIndexChecker{
   320  					vectorizePropertyName: test.vectorize,
   321  					propertyIndexed:       true,
   322  				})
   323  				assert.Equal(t, test.valid, err == nil)
   324  			})
   325  		}
   326  	})
   327  
   328  	t.Run("all usable props no-indexed", func(t *testing.T) {
   329  		t.Run("all schema vectorization turned off", func(t *testing.T) {
   330  			class := &models.Class{
   331  				Vectorizer: "text2vec-contextionary",
   332  				Class:      "ValidName",
   333  				Properties: []*models.Property{
   334  					{
   335  						DataType: []string{"text"},
   336  						Name:     "description",
   337  					},
   338  					{
   339  						DataType:     schema.DataTypeText.PropString(),
   340  						Tokenization: models.PropertyTokenizationWhitespace,
   341  						Name:         "name",
   342  					},
   343  					{
   344  						DataType: []string{"int"},
   345  						Name:     "amount",
   346  					},
   347  				},
   348  			}
   349  
   350  			logger, _ := ltest.NewNullLogger()
   351  			v := NewConfigValidator(&fakeRemote{}, logger)
   352  			err := v.Do(context.Background(), class, nil, &fakeIndexChecker{
   353  				vectorizePropertyName: false,
   354  				vectorizeClassName:    false,
   355  				propertyIndexed:       false,
   356  			})
   357  			assert.NotNil(t, err)
   358  		})
   359  	})
   360  
   361  	t.Run("with only array types", func(t *testing.T) {
   362  		class := &models.Class{
   363  			Vectorizer: "text2vec-contextionary",
   364  			Class:      "ValidName",
   365  			Properties: []*models.Property{
   366  				{
   367  					DataType: []string{"text[]"},
   368  					Name:     "descriptions",
   369  				},
   370  				{
   371  					DataType:     schema.DataTypeTextArray.PropString(),
   372  					Tokenization: models.PropertyTokenizationWhitespace,
   373  					Name:         "names",
   374  				},
   375  			},
   376  		}
   377  
   378  		logger, _ := ltest.NewNullLogger()
   379  		v := NewConfigValidator(&fakeRemote{}, logger)
   380  		err := v.Do(context.Background(), class, nil, &fakeIndexChecker{
   381  			vectorizePropertyName: false,
   382  			vectorizeClassName:    false,
   383  			propertyIndexed:       true,
   384  		})
   385  		assert.Nil(t, err)
   386  	})
   387  }
   388  
   389  func TestConfigValidator_RiskOfDuplicateVectors(t *testing.T) {
   390  	type test struct {
   391  		name          string
   392  		in            *models.Class
   393  		expectWarning bool
   394  		indexChecker  *fakeIndexChecker
   395  	}
   396  
   397  	tests := []test{
   398  		{
   399  			name: "usable properties",
   400  			in: &models.Class{
   401  				Class: "ValidName",
   402  				Properties: []*models.Property{
   403  					{
   404  						DataType: []string{string(schema.DataTypeText)},
   405  						Name:     "textProp",
   406  					},
   407  				},
   408  			},
   409  			expectWarning: false,
   410  			indexChecker: &fakeIndexChecker{
   411  				vectorizePropertyName: false,
   412  				vectorizeClassName:    true,
   413  				propertyIndexed:       true,
   414  			},
   415  		},
   416  		{
   417  			name: "no properties",
   418  			in: &models.Class{
   419  				Class: "ValidName",
   420  			},
   421  			expectWarning: true,
   422  			indexChecker: &fakeIndexChecker{
   423  				vectorizePropertyName: false,
   424  				vectorizeClassName:    true,
   425  				propertyIndexed:       false,
   426  			},
   427  		},
   428  		{
   429  			name: "usable properties, but they are no-indexed",
   430  			in: &models.Class{
   431  				Class: "ValidName",
   432  				Properties: []*models.Property{
   433  					{
   434  						DataType: []string{string(schema.DataTypeText)},
   435  						Name:     "textProp",
   436  					},
   437  				},
   438  			},
   439  			expectWarning: true,
   440  			indexChecker: &fakeIndexChecker{
   441  				vectorizePropertyName: false,
   442  				vectorizeClassName:    true,
   443  				propertyIndexed:       false,
   444  			},
   445  		},
   446  		{
   447  			name: "only unusable properties",
   448  			in: &models.Class{
   449  				Class: "ValidName",
   450  				Properties: []*models.Property{
   451  					{
   452  						DataType: []string{string(schema.DataTypeInt)},
   453  						Name:     "intProp",
   454  					},
   455  				},
   456  			},
   457  			expectWarning: true,
   458  			indexChecker: &fakeIndexChecker{
   459  				vectorizePropertyName: false,
   460  				vectorizeClassName:    true,
   461  				propertyIndexed:       false,
   462  			},
   463  		},
   464  	}
   465  
   466  	for _, test := range tests {
   467  		t.Run(test.name, func(t *testing.T) {
   468  			logger, hook := ltest.NewNullLogger()
   469  			v := NewConfigValidator(&fakeRemote{}, logger)
   470  			err := v.Do(context.Background(), test.in, nil, test.indexChecker)
   471  			require.Nil(t, err)
   472  
   473  			entry := hook.LastEntry()
   474  			if test.expectWarning {
   475  				require.NotNil(t, entry)
   476  				assert.Equal(t, logrus.WarnLevel, entry.Level)
   477  			} else {
   478  				assert.Nil(t, entry)
   479  			}
   480  		})
   481  	}
   482  }
   483  
   484  type fakeIndexChecker struct {
   485  	vectorizeClassName    bool
   486  	vectorizePropertyName bool
   487  	propertyIndexed       bool
   488  }
   489  
   490  func (f *fakeIndexChecker) VectorizeClassName() bool {
   491  	return f.vectorizeClassName
   492  }
   493  
   494  func (f *fakeIndexChecker) VectorizePropertyName(propName string) bool {
   495  	return f.vectorizePropertyName
   496  }
   497  
   498  func (f *fakeIndexChecker) PropertyIndexed(propName string) bool {
   499  	return f.propertyIndexed
   500  }
   501  
   502  // Every word in this fake c11y remote client is present except for the word
   503  // Carrot which is not present
   504  type fakeRemote struct{}
   505  
   506  func (f *fakeRemote) IsWordPresent(ctx context.Context, word string) (bool, error) {
   507  	if word == "carrot" || word == "the" {
   508  		return false, nil
   509  	}
   510  	return true, nil
   511  }
   512  
   513  func (f *fakeRemote) IsStopWord(ctx context.Context, word string) (bool, error) {
   514  	return word == "the", nil
   515  }