github.com/weaviate/weaviate@v1.24.6/modules/text2vec-contextionary/vectorizer/vectorizer.go (about)

     1  //                           _       _
     2  // __      _____  __ ___   ___  __ _| |_ ___
     3  // \ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \
     4  //  \ V  V /  __/ (_| |\ V /| | (_| | ||  __/
     5  //   \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___|
     6  //
     7  //  Copyright © 2016 - 2024 Weaviate B.V. All rights reserved.
     8  //
     9  //  CONTACT: hello@weaviate.io
    10  //
    11  
    12  package vectorizer
    13  
    14  // TODO: This entire package should be part of the text2vec-contextionary
    15  // module, if methods/objects in here are used from non-modular code, they
    16  // probably shouldn't be in here
    17  
import (
	"context"
	"errors"
	"fmt"
	"strings"

	"github.com/fatih/camelcase"
	"github.com/weaviate/weaviate/entities/models"
	"github.com/weaviate/weaviate/entities/moduletools"
	txt2vecmodels "github.com/weaviate/weaviate/modules/text2vec-contextionary/additional/models"
	objectsvectorizer "github.com/weaviate/weaviate/usecases/modulecomponents/vectorizer"
)
    29  
// Vectorizer turns objects and free-form texts into vectors.
type Vectorizer struct {
	// client performs the actual vectorization; every method in this file
	// that produces a new vector does so through client.VectorForCorpi.
	client client
	// objectVectorizer derives the text corpus for an object (or hands back
	// an already existing vector) via its TextsOrVector method.
	objectVectorizer *objectsvectorizer.ObjectVectorizer
}
    35  
    36  type ErrNoUsableWords struct {
    37  	Err error
    38  }
    39  
    40  func (e ErrNoUsableWords) Error() string {
    41  	return e.Err.Error()
    42  }
    43  
    44  func NewErrNoUsableWordsf(pattern string, args ...interface{}) ErrNoUsableWords {
    45  	return ErrNoUsableWords{Err: fmt.Errorf(pattern, args...)}
    46  }
    47  
// client is the vectorization backend used by Vectorizer.
type client interface {
	// VectorForCorpi returns a single vector for the given corpi together
	// with the interpretation sources that went into it. In this file the
	// overrides argument is either nil or the object's vector weights.
	VectorForCorpi(ctx context.Context, corpi []string,
		overrides map[string]string) ([]float32, []txt2vecmodels.InterpretationSource, error)
}
    52  
// ClassIndexCheck answers per-class vectorization questions: whether a
// property of a class should be indexed, and whether the class name or a
// property name should take part in vectorization.
type ClassIndexCheck interface {
	// PropertyIndexed reports whether the named property should be indexed.
	PropertyIndexed(property string) bool
	// VectorizeClassName reports whether the class name is vectorized.
	VectorizeClassName() bool
	// VectorizePropertyName reports whether the name of the given property
	// is vectorized.
	VectorizePropertyName(propertyName string) bool
}
    59  
    60  // New from c11y client
    61  func New(client client) *Vectorizer {
    62  	return &Vectorizer{
    63  		client:           client,
    64  		objectVectorizer: objectsvectorizer.New(),
    65  	}
    66  }
    67  
    68  func (v *Vectorizer) Texts(ctx context.Context, inputs []string,
    69  	cfg moduletools.ClassConfig,
    70  ) ([]float32, error) {
    71  	return v.Corpi(ctx, inputs)
    72  }
    73  
    74  // Object object to vector
    75  func (v *Vectorizer) Object(ctx context.Context, object *models.Object,
    76  	comp moduletools.VectorizablePropsComparator, cfg moduletools.ClassConfig,
    77  ) ([]float32, models.AdditionalProperties, error) {
    78  	var overrides map[string]string
    79  	if object.VectorWeights != nil {
    80  		overrides = object.VectorWeights.(map[string]string)
    81  	}
    82  
    83  	vec, sources, err := v.object(ctx, object.Class, comp, overrides, cfg)
    84  	if err != nil {
    85  		return nil, nil, err
    86  	}
    87  
    88  	additional := models.AdditionalProperties{}
    89  	additional["interpretation"] = &txt2vecmodels.Interpretation{
    90  		Source: sourceFromInputElements(sources),
    91  	}
    92  
    93  	return vec, additional, nil
    94  }
    95  
    96  func (v *Vectorizer) object(ctx context.Context, className string,
    97  	comp moduletools.VectorizablePropsComparator, overrides map[string]string,
    98  	cfg moduletools.ClassConfig,
    99  ) ([]float32, []txt2vecmodels.InterpretationSource, error) {
   100  	icheck := NewIndexChecker(cfg)
   101  	corpi, vector := v.objectVectorizer.TextsOrVector(ctx, className, comp, icheck, cfg.TargetVector())
   102  	// no property was changed, old vector can be used
   103  	if vector != nil {
   104  		// dont' re-vectorize
   105  		return vector, []txt2vecmodels.InterpretationSource{}, nil
   106  	}
   107  	// vectorize text
   108  	vector, ie, err := v.client.VectorForCorpi(ctx, []string{corpi}, overrides)
   109  	if err != nil {
   110  		switch err.(type) {
   111  		case ErrNoUsableWords:
   112  			return nil, nil, fmt.Errorf("The object is invalid, as weaviate could not extract "+
   113  				"any contextionary-valid words from it. This is the case when you have "+
   114  				"set the options 'vectorizeClassName: false' and 'vectorizePropertyName: false' in this class' schema definition "+
   115  				"and not a single property's value "+
   116  				"contains at least one contextionary-valid word. To fix this, you have several "+
   117  				"options:\n\n1.) Make sure that the schema class name or the set properties are "+
   118  				"a contextionary-valid term and include them in vectorization using the "+
   119  				"'vectorizeClassName' or 'vectorizePropertyName' setting. In this case the vector position "+
   120  				"will be composed of both the class/property names and the values for those fields. "+
   121  				"Even if no property values are contextionary-valid, the overall word corpus is still valid "+
   122  				"due to the contextionary-valid class/property names."+
   123  				"\n\n2.) Alternatively, if you do not want to include schema class/property names "+
   124  				"in vectorization, you must make sure that at least one text/string property contains "+
   125  				"at least one contextionary-valid word."+
   126  				"\n\n3.) If the word corpus weaviate extracted from your object "+
   127  				"(see below) does contain enough meaning to build a vector position, but the contextionary "+
   128  				"did not recognize the words, you can extend the contextionary using the "+
   129  				"REST API. This is the case	when you use mostly industry-specific terms which are "+
   130  				"not known to the common language contextionary. Once extended, simply reimport this object."+
   131  				"\n\nThe following words were extracted from your object: %v"+
   132  				"\n\nTo learn more about the contextionary and how it behaves, check out: https://www.semi.technology/documentation/weaviate/current/contextionary.html"+
   133  				"\n\nOriginal error: %v", corpi, err)
   134  		default:
   135  			return nil, nil, fmt.Errorf("vectorizing object with corpus '%+v': %v", corpi, err)
   136  		}
   137  	}
   138  
   139  	return vector, ie, nil
   140  }
   141  
   142  // Corpi takes any list of strings and builds a common vector for all of them
   143  func (v *Vectorizer) Corpi(ctx context.Context, corpi []string,
   144  ) ([]float32, error) {
   145  	for i, corpus := range corpi {
   146  		corpi[i] = camelCaseToLower(corpus)
   147  	}
   148  
   149  	vector, _, err := v.client.VectorForCorpi(ctx, corpi, nil)
   150  	if err != nil {
   151  		return nil, fmt.Errorf("vectorizing corpus '%+v': %v", corpi, err)
   152  	}
   153  
   154  	return vector, nil
   155  }
   156  
   157  func camelCaseToLower(in string) string {
   158  	parts := camelcase.Split(in)
   159  	var sb strings.Builder
   160  	for i, part := range parts {
   161  		if part == " " {
   162  			continue
   163  		}
   164  
   165  		if i > 0 {
   166  			sb.WriteString(" ")
   167  		}
   168  
   169  		sb.WriteString(strings.ToLower(part))
   170  	}
   171  
   172  	return sb.String()
   173  }
   174  
   175  func sourceFromInputElements(in []txt2vecmodels.InterpretationSource) []*txt2vecmodels.InterpretationSource {
   176  	out := make([]*txt2vecmodels.InterpretationSource, len(in))
   177  	for i, elem := range in {
   178  		out[i] = &txt2vecmodels.InterpretationSource{
   179  			Concept:    elem.Concept,
   180  			Occurrence: elem.Occurrence,
   181  			Weight:     float64(elem.Weight),
   182  		}
   183  	}
   184  
   185  	return out
   186  }