github.com/weaviate/weaviate@v1.24.6/modules/text2vec-contextionary/vectorizer/schema_config.go (about)

     1  //                           _       _
     2  // __      _____  __ ___   ___  __ _| |_ ___
     3  // \ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \
     4  //  \ V  V /  __/ (_| |\ V /| | (_| | ||  __/
     5  //   \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___|
     6  //
     7  //  Copyright © 2016 - 2024 Weaviate B.V. All rights reserved.
     8  //
     9  //  CONTACT: hello@weaviate.io
    10  //
    11  
    12  package vectorizer
    13  
    14  import (
    15  	"context"
    16  	"fmt"
    17  	"strings"
    18  
    19  	"github.com/fatih/camelcase"
    20  	"github.com/pkg/errors"
    21  	"github.com/sirupsen/logrus"
    22  	"github.com/weaviate/weaviate/entities/models"
    23  	"github.com/weaviate/weaviate/entities/moduletools"
    24  	"github.com/weaviate/weaviate/entities/schema"
    25  )
    26  
// ConfigValidator validates a class definition for use with the
// text2vec-contextionary module: it checks class and property names against
// the contextionary and verifies that the indexing settings leave something
// that can be vectorized.
type ConfigValidator struct {
	// remote answers the stopword and word-presence lookups.
	remote RemoteClient
	// logger receives the warning emitted when all objects of a class may end
	// up with the same vector.
	logger logrus.FieldLogger
}
    31  
// IndexChecker exposes the vectorization and indexing settings of a class
// that the validator consults.
type IndexChecker interface {
	// VectorizeClassName reports whether the class name should be vectorized.
	VectorizeClassName() bool
	// VectorizePropertyName reports whether the name of the property propName
	// should be vectorized.
	VectorizePropertyName(propName string) bool
	// PropertyIndexed reports whether the property propName is indexed, i.e.
	// not excluded from indexing.
	PropertyIndexed(propName string) bool
}
    37  
// RemoteClient describes the word lookups the validator performs against the
// contextionary.
type RemoteClient interface {
	// IsStopWord reports whether word is a stopword.
	IsStopWord(ctx context.Context, word string) (bool, error)
	// IsWordPresent reports whether word can be found in the contextionary.
	IsWordPresent(ctx context.Context, word string) (bool, error)
}
    42  
    43  func NewConfigValidator(rc RemoteClient,
    44  	logger logrus.FieldLogger,
    45  ) *ConfigValidator {
    46  	return &ConfigValidator{remote: rc, logger: logger}
    47  }
    48  
    49  func (cv *ConfigValidator) Do(ctx context.Context, class *models.Class,
    50  	cfg moduletools.ClassConfig, icheck IndexChecker,
    51  ) error {
    52  	err := cv.validateClassName(ctx, class.Class, icheck.VectorizeClassName())
    53  	if err != nil {
    54  		return fmt.Errorf("invalid class name: %w", err)
    55  	}
    56  
    57  	for _, prop := range class.Properties {
    58  		if !icheck.PropertyIndexed(prop.Name) {
    59  			continue
    60  		}
    61  
    62  		err = cv.validatePropertyName(ctx, prop.Name,
    63  			icheck.VectorizePropertyName(prop.Name))
    64  		if err != nil {
    65  			return errors.Wrapf(err, "class %q: invalid property name", class.Class)
    66  		}
    67  	}
    68  
    69  	if err := cv.validateIndexState(ctx, class, icheck); err != nil {
    70  		return errors.Wrap(err, "invalid combination of properties")
    71  	}
    72  
    73  	cv.checkForPossibilityOfDuplicateVectors(ctx, class, icheck)
    74  
    75  	return nil
    76  }
    77  
    78  func (cv *ConfigValidator) validateClassName(ctx context.Context, className string,
    79  	vectorizeClass bool,
    80  ) error {
    81  	// class name
    82  	if !vectorizeClass {
    83  		// if the user chooses not to vectorize the class, we don't need to check
    84  		// if its c11y-valid or not
    85  		return nil
    86  	}
    87  
    88  	camelParts := camelcase.Split(className)
    89  	stopWordsFound := 0
    90  	for _, part := range camelParts {
    91  		word := strings.ToLower(part)
    92  		sw, err := cv.remote.IsStopWord(ctx, word)
    93  		if err != nil {
    94  			return fmt.Errorf("check stopword: %v", err)
    95  		}
    96  
    97  		if sw {
    98  			stopWordsFound++
    99  			continue
   100  		}
   101  
   102  		present, err := cv.remote.IsWordPresent(ctx, word)
   103  		if err != nil {
   104  			return fmt.Errorf("check word presence: %v", err)
   105  		}
   106  
   107  		if !present {
   108  			return fmt.Errorf("could not find the word '%s' from the class name '%s' "+
   109  				"in the contextionary", word, className)
   110  		}
   111  	}
   112  
   113  	if len(camelParts) == stopWordsFound {
   114  		return fmt.Errorf("className '%s' consists of only stopwords and is therefore "+
   115  			"not a contextionary-valid class name, make sure at least one word in the "+
   116  			"classname is not a stop word", className)
   117  	}
   118  
   119  	return nil
   120  }
   121  
   122  func (cv *ConfigValidator) validatePropertyName(ctx context.Context,
   123  	propertyName string, vectorize bool,
   124  ) error {
   125  	if !vectorize {
   126  		// user does not want to vectorize this property name, so we don't have to
   127  		// validate it
   128  		return nil
   129  	}
   130  
   131  	camelParts := camelcase.Split(propertyName)
   132  	stopWordsFound := 0
   133  	for _, part := range camelParts {
   134  		word := strings.ToLower(part)
   135  		sw, err := cv.remote.IsStopWord(ctx, word)
   136  		if err != nil {
   137  			return fmt.Errorf("check stopword: %v", err)
   138  		}
   139  
   140  		if sw {
   141  			stopWordsFound++
   142  			continue
   143  		}
   144  
   145  		present, err := cv.remote.IsWordPresent(ctx, word)
   146  		if err != nil {
   147  			return fmt.Errorf("check word presence: %v", err)
   148  		}
   149  
   150  		if !present {
   151  			return fmt.Errorf("could not find word '%s' of the property '%s' in the "+
   152  				"contextionary", word, propertyName)
   153  		}
   154  	}
   155  
   156  	if len(camelParts) == stopWordsFound {
   157  		return fmt.Errorf("the propertyName '%s' consists of only stopwords and is "+
   158  			"therefore not a contextionary-valid property name, make sure at least one word "+
   159  			"in the property name is not a stop word", propertyName)
   160  	}
   161  
   162  	return nil
   163  }
   164  
   165  func (cv *ConfigValidator) validateIndexState(ctx context.Context,
   166  	class *models.Class, icheck IndexChecker,
   167  ) error {
   168  	if icheck.VectorizeClassName() {
   169  		// if the user chooses to vectorize the classname, vector-building will
   170  		// always be possible, no need to investigate further
   171  
   172  		return nil
   173  	}
   174  
   175  	// search if there is at least one indexed, string/text or string/text[]
   176  	// prop. If found pass validation
   177  	for _, prop := range class.Properties {
   178  		if len(prop.DataType) < 1 {
   179  			return errors.Errorf("property %s must have at least one datatype: "+
   180  				"got %v", prop.Name, prop.DataType)
   181  		}
   182  
   183  		if prop.DataType[0] != string(schema.DataTypeText) &&
   184  			prop.DataType[0] != string(schema.DataTypeTextArray) {
   185  			// we can only vectorize text-like props
   186  			continue
   187  		}
   188  
   189  		if icheck.PropertyIndexed(prop.Name) {
   190  			// found at least one, this is a valid schema
   191  			return nil
   192  		}
   193  	}
   194  
   195  	return fmt.Errorf("invalid properties: didn't find a single property which is " +
   196  		"of type string or text and is not excluded from indexing. In addition the " +
   197  		"class name is excluded from vectorization as well, meaning that it cannot be " +
   198  		"used to determine the vector position. To fix this, set 'vectorizeClassName' " +
   199  		"to true if the class name is contextionary-valid. Alternatively add at least " +
   200  		"contextionary-valid text/string property which is not excluded from " +
   201  		"indexing.")
   202  }
   203  
   204  func (cv *ConfigValidator) checkForPossibilityOfDuplicateVectors(
   205  	ctx context.Context, class *models.Class, icheck IndexChecker,
   206  ) {
   207  	if !icheck.VectorizeClassName() {
   208  		// if the user choses not to vectorize the class name, this means they must
   209  		// have chosen something else to vectorize, otherwise the validation would
   210  		// have error'd before we ever got here. We can skip further checking.
   211  
   212  		return
   213  	}
   214  
   215  	// search if there is at least one indexed, string/text prop. If found exit
   216  	for _, prop := range class.Properties {
   217  		// length check skipped, because validation has already passed
   218  		if prop.DataType[0] != string(schema.DataTypeText) {
   219  			// we can only vectorize text-like props
   220  			continue
   221  		}
   222  
   223  		if icheck.PropertyIndexed(prop.Name) {
   224  			// found at least one
   225  			return
   226  		}
   227  	}
   228  
   229  	cv.logger.WithField("module", "text2vec-contextionary").
   230  		WithField("class", class.Class).
   231  		Warnf("text2vec-contextionary: Class %q does not have any properties "+
   232  			"indexed (or only non text-properties indexed) and the vector position is "+
   233  			"only determined by the class name. Each object will end up with the same "+
   234  			"vector which leads to a severe performance penalty on imports. Consider "+
   235  			"setting vectorIndexConfig.skip=true for this property", class.Class)
   236  }