github.com/weaviate/weaviate@v1.24.6/modules/text2vec-transformers/config.go (about)

     1  //                           _       _
     2  // __      _____  __ ___   ___  __ _| |_ ___
     3  // \ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \
     4  //  \ V  V /  __/ (_| |\ V /| | (_| | ||  __/
     5  //   \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___|
     6  //
     7  //  Copyright © 2016 - 2024 Weaviate B.V. All rights reserved.
     8  //
     9  //  CONTACT: hello@weaviate.io
    10  //
    11  
    12  package modtransformers
    13  
    14  import (
    15  	"context"
    16  	"fmt"
    17  
    18  	"github.com/pkg/errors"
    19  	"github.com/sirupsen/logrus"
    20  	"github.com/weaviate/weaviate/entities/models"
    21  	"github.com/weaviate/weaviate/entities/modulecapabilities"
    22  	"github.com/weaviate/weaviate/entities/moduletools"
    23  	"github.com/weaviate/weaviate/entities/schema"
    24  	"github.com/weaviate/weaviate/modules/text2vec-transformers/vectorizer"
    25  )
    26  
    27  func (m *TransformersModule) ClassConfigDefaults() map[string]interface{} {
    28  	return map[string]interface{}{
    29  		"vectorizeClassName": vectorizer.DefaultVectorizeClassName,
    30  		"poolingStrategy":    vectorizer.DefaultPoolingStrategy,
    31  	}
    32  }
    33  
    34  func (m *TransformersModule) PropertyConfigDefaults(
    35  	dt *schema.DataType,
    36  ) map[string]interface{} {
    37  	return map[string]interface{}{
    38  		"skip":                  !vectorizer.DefaultPropertyIndexed,
    39  		"vectorizePropertyName": vectorizer.DefaultVectorizePropertyName,
    40  	}
    41  }
    42  
    43  func (m *TransformersModule) ValidateClass(ctx context.Context,
    44  	class *models.Class, cfg moduletools.ClassConfig,
    45  ) error {
    46  	settings := vectorizer.NewClassSettings(cfg)
    47  	if err := settings.Validate(class); err != nil {
    48  		return err
    49  	}
    50  	return NewConfigValidator(m.logger).Do(ctx, class, cfg, settings)
    51  }
    52  
    53  var _ = modulecapabilities.ClassConfigurator(New())
    54  
// ConfigValidator checks the vectorization settings of a class for this
// module. It holds the logger used to emit warnings about configurations
// that pass validation but are likely undesirable.
type ConfigValidator struct {
	logger logrus.FieldLogger
}
    58  
// ClassSettings is the view of a class's vectorizer settings that the
// ConfigValidator depends on.
type ClassSettings interface {
	// VectorizeClassName reports whether the class name takes part in
	// vectorization.
	VectorizeClassName() bool
	// VectorizePropertyName presumably reports whether the name of the given
	// property takes part in vectorization; it is not called in this file.
	VectorizePropertyName(propName string) bool
	// PropertyIndexed reports whether the given property is indexed, i.e. not
	// excluded from vectorization.
	PropertyIndexed(propName string) bool
}
    64  
    65  func NewConfigValidator(logger logrus.FieldLogger) *ConfigValidator {
    66  	return &ConfigValidator{logger: logger}
    67  }
    68  
    69  func (cv *ConfigValidator) Do(ctx context.Context, class *models.Class,
    70  	cfg moduletools.ClassConfig, settings ClassSettings,
    71  ) error {
    72  	// In text2vec-transformers (as opposed to e.g. text2vec-contextionary) the
    73  	// assumption is that the models will be able to deal with any words, even
    74  	// previously unseen ones. Therefore we do not need to validate individual
    75  	// properties, but only the overall "index state"
    76  
    77  	if err := cv.validateIndexState(ctx, class, settings); err != nil {
    78  		return errors.Errorf("invalid combination of properties")
    79  	}
    80  
    81  	cv.checkForPossibilityOfDuplicateVectors(ctx, class, settings)
    82  
    83  	return nil
    84  }
    85  
    86  func (cv *ConfigValidator) validateIndexState(ctx context.Context,
    87  	class *models.Class, settings ClassSettings,
    88  ) error {
    89  	if settings.VectorizeClassName() {
    90  		// if the user chooses to vectorize the classname, vector-building will
    91  		// always be possible, no need to investigate further
    92  
    93  		return nil
    94  	}
    95  
    96  	// search if there is at least one indexed, string/text prop. If found pass
    97  	// validation
    98  	for _, prop := range class.Properties {
    99  		if len(prop.DataType) < 1 {
   100  			return errors.Errorf("property %s must have at least one datatype: "+
   101  				"got %v", prop.Name, prop.DataType)
   102  		}
   103  
   104  		if prop.DataType[0] != string(schema.DataTypeText) {
   105  			// we can only vectorize text-like props
   106  			continue
   107  		}
   108  
   109  		if settings.PropertyIndexed(prop.Name) {
   110  			// found at least one, this is a valid schema
   111  			return nil
   112  		}
   113  	}
   114  
   115  	return fmt.Errorf("invalid properties: didn't find a single property which is " +
   116  		"of type string or text and is not excluded from indexing. In addition the " +
   117  		"class name is excluded from vectorization as well, meaning that it cannot be " +
   118  		"used to determine the vector position. To fix this, set 'vectorizeClassName' " +
   119  		"to true if the class name is contextionary-valid. Alternatively add at least " +
   120  		"contextionary-valid text/string property which is not excluded from " +
   121  		"indexing.")
   122  }
   123  
   124  func (cv *ConfigValidator) checkForPossibilityOfDuplicateVectors(
   125  	ctx context.Context, class *models.Class, settings ClassSettings,
   126  ) {
   127  	if !settings.VectorizeClassName() {
   128  		// if the user choses not to vectorize the class name, this means they must
   129  		// have chosen something else to vectorize, otherwise the validation would
   130  		// have error'd before we ever got here. We can skip further checking.
   131  
   132  		return
   133  	}
   134  
   135  	// search if there is at least one indexed, string/text prop. If found exit
   136  	for _, prop := range class.Properties {
   137  		// length check skipped, because validation has already passed
   138  		if prop.DataType[0] != string(schema.DataTypeText) {
   139  			// we can only vectorize text-like props
   140  			continue
   141  		}
   142  
   143  		if settings.PropertyIndexed(prop.Name) {
   144  			// found at least one
   145  			return
   146  		}
   147  	}
   148  
   149  	cv.logger.WithField("module", "text2vec-transformers").
   150  		WithField("class", class.Class).
   151  		Warnf("text2vec-contextionary: Class %q does not have any properties "+
   152  			"indexed (or only non text-properties indexed) and the vector position is "+
   153  			"only determined by the class name. Each object will end up with the same "+
   154  			"vector which leads to a severe performance penalty on imports. Consider "+
   155  			"setting vectorIndexConfig.skip=true for this property", class.Class)
   156  }