github.com/weaviate/weaviate@v1.24.6/modules/text2vec-contextionary/vectorizer/inspector.go (about)

     1  //                           _       _
     2  // __      _____  __ ___   ___  __ _| |_ ___
     3  // \ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \
     4  //  \ V  V /  __/ (_| |\ V /| | (_| | ||  __/
     5  //   \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___|
     6  //
     7  //  Copyright © 2016 - 2024 Weaviate B.V. All rights reserved.
     8  //
     9  //  CONTACT: hello@weaviate.io
    10  //
    11  
    12  package vectorizer
    13  
    14  import (
    15  	"context"
    16  	"fmt"
    17  	"strings"
    18  	"unicode"
    19  	"unicode/utf8"
    20  
    21  	"github.com/weaviate/weaviate/entities/models"
    22  	txt2vecmodels "github.com/weaviate/weaviate/modules/text2vec-contextionary/additional/models"
    23  )
    24  
    25  type InspectorClient interface {
    26  	VectorForWord(ctx context.Context, word string) ([]float32, error)
    27  	VectorForCorpi(ctx context.Context, words []string,
    28  		overrides map[string]string) ([]float32, []txt2vecmodels.InterpretationSource, error)
    29  	NearestWordsByVector(ctx context.Context, vector []float32, n int, k int) ([]string, []float32, error)
    30  	IsWordPresent(ctx context.Context, word string) (bool, error)
    31  }
    32  
    33  type Inspector struct {
    34  	client InspectorClient
    35  }
    36  
    37  func NewInspector(client InspectorClient) *Inspector {
    38  	return &Inspector{client: client}
    39  }
    40  
    41  func (i *Inspector) GetWords(ctx context.Context, words string) (*models.C11yWordsResponse, error) {
    42  	wordArray, err := i.validateAndSplit(words)
    43  	if err != nil {
    44  		return nil, err
    45  	}
    46  
    47  	concatWord, err := i.concatWord(ctx, words, wordArray)
    48  	if err != nil {
    49  		return nil, err
    50  	}
    51  
    52  	individualWords, err := i.individualWords(ctx, wordArray)
    53  	if err != nil {
    54  		return nil, err
    55  	}
    56  
    57  	return &models.C11yWordsResponse{
    58  		ConcatenatedWord: concatWord,
    59  		IndividualWords:  individualWords,
    60  	}, nil
    61  }
    62  
    63  func (i *Inspector) validateAndSplit(words string) ([]string, error) {
    64  	// set first character to lowercase
    65  	wordChars := []rune(words)
    66  	wordChars[0] = unicode.ToLower(wordChars[0])
    67  	words = string(wordChars)
    68  
    69  	for _, r := range words {
    70  		if !unicode.IsLetter(r) && !unicode.IsNumber(r) {
    71  			return nil, fmt.Errorf("invalid word input: words must only contain unicode letters and digits")
    72  		}
    73  	}
    74  
    75  	return split(words), nil
    76  }
    77  
    78  func (i *Inspector) concatWord(ctx context.Context, words string,
    79  	wordArray []string,
    80  ) (*models.C11yWordsResponseConcatenatedWord, error) {
    81  	if len(wordArray) < 2 {
    82  		// only build a concat response if we have more than a single word
    83  		return nil, nil
    84  	}
    85  
    86  	// join the words into a single corpus. While the contextionary also supports
    87  	// building a centroid from multiple corpi (thus []string for Corpi, an
    88  	// occurrence-based weighing can only happen within a corpus. It is thus - by
    89  	// far - preferable in this case, to concat the words into one corpus, rather
    90  	// than treating each word as its own.
    91  	corpus := strings.Join(wordArray, " ")
    92  	vector, _, err := i.client.VectorForCorpi(ctx, []string{corpus}, nil)
    93  	if err != nil {
    94  		return nil, err
    95  	}
    96  
    97  	nearestNeighbors, err := i.nearestNeighbors(ctx, vector)
    98  	if err != nil {
    99  		return nil, err
   100  	}
   101  
   102  	return &models.C11yWordsResponseConcatenatedWord{
   103  		ConcatenatedWord:             words,
   104  		SingleWords:                  wordArray,
   105  		ConcatenatedVector:           vector,
   106  		ConcatenatedNearestNeighbors: nearestNeighbors,
   107  	}, nil
   108  }
   109  
   110  func (i *Inspector) nearestNeighbors(ctx context.Context,
   111  	vector []float32,
   112  ) ([]*models.C11yNearestNeighborsItems0, error) {
   113  	// relate words of centroid
   114  	words, dists, err := i.client.NearestWordsByVector(ctx, vector, 12, 32)
   115  	if err != nil {
   116  		return nil, err
   117  	}
   118  
   119  	nearestNeighbors := []*models.C11yNearestNeighborsItems0{}
   120  
   121  	// loop over NN Idx' and append to the return object
   122  	for i, word := range words {
   123  		item := models.C11yNearestNeighborsItems0{
   124  			Word:     word,
   125  			Distance: dists[i],
   126  		}
   127  
   128  		nearestNeighbors = append(nearestNeighbors, &item)
   129  	}
   130  
   131  	return nearestNeighbors, nil
   132  }
   133  
   134  func (i *Inspector) individualWords(ctx context.Context,
   135  	wordArray []string,
   136  ) ([]*models.C11yWordsResponseIndividualWordsItems0, error) {
   137  	var res []*models.C11yWordsResponseIndividualWordsItems0
   138  
   139  	for _, word := range wordArray {
   140  		iw, err := i.individualWord(ctx, word)
   141  		if err != nil {
   142  			return nil, fmt.Errorf("word '%s': %v", word, err)
   143  		}
   144  
   145  		res = append(res, iw)
   146  	}
   147  
   148  	return res, nil
   149  }
   150  
   151  func (i *Inspector) individualWord(ctx context.Context,
   152  	word string,
   153  ) (*models.C11yWordsResponseIndividualWordsItems0, error) {
   154  	ok, err := i.client.IsWordPresent(ctx, word)
   155  	if err != nil {
   156  		return nil, fmt.Errorf("could not check word presence:  %v", err)
   157  	}
   158  
   159  	if !ok {
   160  		return i.individualWordNotPresent(word), nil
   161  	}
   162  
   163  	return i.individualWordPresent(ctx, word)
   164  }
   165  
   166  func (i *Inspector) individualWordNotPresent(word string) *models.C11yWordsResponseIndividualWordsItems0 {
   167  	return &models.C11yWordsResponseIndividualWordsItems0{
   168  		Word:    word,
   169  		Present: false,
   170  	}
   171  }
   172  
   173  func (i *Inspector) individualWordPresent(ctx context.Context,
   174  	word string,
   175  ) (*models.C11yWordsResponseIndividualWordsItems0, error) {
   176  	info, err := i.individualWordInfo(ctx, word)
   177  	if err != nil {
   178  		return nil, err
   179  	}
   180  
   181  	return &models.C11yWordsResponseIndividualWordsItems0{
   182  		Word:    word,
   183  		Present: true,
   184  		Info:    info,
   185  	}, nil
   186  }
   187  
   188  func (i *Inspector) individualWordInfo(ctx context.Context,
   189  	word string,
   190  ) (*models.C11yWordsResponseIndividualWordsItems0Info, error) {
   191  	vector, err := i.client.VectorForWord(ctx, word)
   192  	if err != nil {
   193  		return nil, err
   194  	}
   195  
   196  	nns, err := i.nearestNeighbors(ctx, vector)
   197  	if err != nil {
   198  		return nil, err
   199  	}
   200  
   201  	return &models.C11yWordsResponseIndividualWordsItems0Info{
   202  		Vector:           vector,
   203  		NearestNeighbors: nns,
   204  	}, nil
   205  }
   206  
   207  // Splits a CamelCase string to an array
   208  // Based on: https://github.com/fatih/camelcase
   209  func split(src string) (entries []string) {
   210  	// don't split invalid utf8
   211  	if !utf8.ValidString(src) {
   212  		return []string{src}
   213  	}
   214  	entries = []string{}
   215  	var runes [][]rune
   216  	lastClass := 0
   217  	class := 0
   218  	// split into fields based on class of unicode character
   219  	for _, r := range src {
   220  		switch true {
   221  		case unicode.IsLower(r):
   222  			class = 1
   223  		case unicode.IsUpper(r):
   224  			class = 2
   225  		case unicode.IsDigit(r):
   226  			class = 1
   227  		default:
   228  			class = 4
   229  		}
   230  		if class == lastClass {
   231  			runes[len(runes)-1] = append(runes[len(runes)-1], r)
   232  		} else {
   233  			runes = append(runes, []rune{r})
   234  		}
   235  		lastClass = class
   236  	}
   237  	// handle upper case -> lower case sequences, e.g.
   238  	// "PDFL", "oader" -> "PDF", "Loader"
   239  	for i := 0; i < len(runes)-1; i++ {
   240  		if unicode.IsUpper(runes[i][0]) && unicode.IsLower(runes[i+1][0]) {
   241  			runes[i+1] = append([]rune{runes[i][len(runes[i])-1]}, runes[i+1]...)
   242  			runes[i] = runes[i][:len(runes[i])-1]
   243  		}
   244  	}
   245  	// construct []string from results
   246  	for _, s := range runes {
   247  		if len(s) > 0 {
   248  			entries = append(entries, strings.ToLower(string(s)))
   249  		}
   250  	}
   251  	return
   252  }