github.com/weaviate/weaviate@v1.24.6/adapters/repos/db/helpers/tokenizer.go (about)

     1  //                           _       _
     2  // __      _____  __ ___   ___  __ _| |_ ___
     3  // \ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \
     4  //  \ V  V /  __/ (_| |\ V /| | (_| | ||  __/
     5  //   \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___|
     6  //
     7  //  Copyright © 2016 - 2024 Weaviate B.V. All rights reserved.
     8  //
     9  //  CONTACT: hello@weaviate.io
    10  //
    11  
    12  package helpers
    13  
    14  import (
    15  	"os"
    16  	"strings"
    17  	"sync"
    18  	"unicode"
    19  
    20  	"github.com/go-ego/gse"
    21  	"github.com/weaviate/weaviate/entities/models"
    22  )
    23  
var (
	// gseTokenizer is the shared GSE segmenter, created once in init_gse
	// when GSE tokenization is enabled; it stays nil if disabled or if
	// segmenter construction failed.
	gseTokenizer *gse.Segmenter
	// gseTokenizerLock guards creation and use of gseTokenizer.
	gseTokenizerLock = &sync.Mutex{}
	// UseGse reports whether GSE tokenization is enabled; it is switched
	// on by the USE_GSE=true environment variable (see init_gse).
	UseGse = false
)
    29  
// Tokenizations lists every property tokenization scheme supported by
// Tokenize and TokenizeWithWildcards in this package.
var Tokenizations []string = []string{
	models.PropertyTokenizationWord,
	models.PropertyTokenizationLowercase,
	models.PropertyTokenizationWhitespace,
	models.PropertyTokenizationField,
	models.PropertyTokenizationTrigram,
	models.PropertyTokenizationGse,
}
    38  
// init sets up the optional GSE tokenizer at package-load time. Whether
// GSE is actually enabled is decided inside init_gse based on the
// USE_GSE environment variable.
func init() {
	init_gse()
}
    42  
    43  func init_gse() {
    44  	if os.Getenv("USE_GSE") == "true" {
    45  		UseGse = true
    46  	}
    47  	if UseGse {
    48  		gseTokenizerLock.Lock()
    49  		defer gseTokenizerLock.Unlock()
    50  		if gseTokenizer == nil {
    51  			seg, err := gse.New("ja")
    52  			if err != nil {
    53  				return //[]string{}
    54  			}
    55  			gseTokenizer = &seg
    56  		}
    57  	}
    58  }
    59  
    60  func Tokenize(tokenization string, in string) []string {
    61  	switch tokenization {
    62  	case models.PropertyTokenizationWord:
    63  		return tokenizeWord(in)
    64  	case models.PropertyTokenizationLowercase:
    65  		return tokenizeLowercase(in)
    66  	case models.PropertyTokenizationWhitespace:
    67  		return tokenizeWhitespace(in)
    68  	case models.PropertyTokenizationField:
    69  		return tokenizeField(in)
    70  	case models.PropertyTokenizationTrigram:
    71  		return tokenizetrigram(in)
    72  	case models.PropertyTokenizationGse:
    73  		return tokenizeGSE(in)
    74  	default:
    75  		return []string{}
    76  	}
    77  }
    78  
    79  func TokenizeWithWildcards(tokenization string, in string) []string {
    80  	switch tokenization {
    81  	case models.PropertyTokenizationWord:
    82  		return tokenizeWordWithWildcards(in)
    83  	case models.PropertyTokenizationLowercase:
    84  		return tokenizeLowercase(in)
    85  	case models.PropertyTokenizationWhitespace:
    86  		return tokenizeWhitespace(in)
    87  	case models.PropertyTokenizationField:
    88  		return tokenizeField(in)
    89  	case models.PropertyTokenizationTrigram:
    90  		return tokenizetrigramWithWildcards(in)
    91  	case models.PropertyTokenizationGse:
    92  		return tokenizeGSE(in)
    93  	default:
    94  		return []string{}
    95  	}
    96  }
    97  
    98  // tokenizeField trims white spaces
    99  // (former DataTypeString/Field)
   100  func tokenizeField(in string) []string {
   101  	return []string{strings.TrimFunc(in, unicode.IsSpace)}
   102  }
   103  
   104  // tokenizeWhitespace splits on white spaces, does not alter casing
   105  // (former DataTypeString/Word)
   106  func tokenizeWhitespace(in string) []string {
   107  	return strings.FieldsFunc(in, unicode.IsSpace)
   108  }
   109  
   110  // tokenizeLowercase splits on white spaces and lowercases the words
   111  func tokenizeLowercase(in string) []string {
   112  	terms := tokenizeWhitespace(in)
   113  	return lowercase(terms)
   114  }
   115  
   116  // tokenizeWord splits on any non-alphanumerical and lowercases the words
   117  // (former DataTypeText/Word)
   118  func tokenizeWord(in string) []string {
   119  	terms := strings.FieldsFunc(in, func(r rune) bool {
   120  		return !unicode.IsLetter(r) && !unicode.IsNumber(r)
   121  	})
   122  	return lowercase(terms)
   123  }
   124  
   125  // tokenizetrigram splits on any non-alphanumerical and lowercases the words, joins them together, then groups them into trigrams
   126  func tokenizetrigram(in string) []string {
   127  	// Strip whitespace and punctuation from the input string
   128  	inputString := strings.ToLower(strings.Join(strings.FieldsFunc(in, func(r rune) bool {
   129  		return !unicode.IsLetter(r) && !unicode.IsNumber(r)
   130  	}), ""))
   131  	runes := []rune(inputString)
   132  	var trirunes [][]rune
   133  	for i := 0; i < len(runes)-2; i++ {
   134  		trirunes = append(trirunes, runes[i:i+3])
   135  	}
   136  
   137  	var trigrams []string
   138  	for _, trirune := range trirunes {
   139  		trigrams = append(trigrams, string(trirune))
   140  	}
   141  	return trigrams
   142  }
   143  
   144  // tokenizeGSE uses the gse tokenizer to tokenise Chinese and Japanese
   145  func tokenizeGSE(in string) []string {
   146  	if !UseGse {
   147  		return []string{}
   148  	}
   149  	gseTokenizerLock.Lock()
   150  	defer gseTokenizerLock.Unlock()
   151  	terms := gseTokenizer.CutAll(in)
   152  
   153  	// Remove empty strings from terms
   154  	for i := 0; i < len(terms); i++ {
   155  		if terms[i] == "" || terms[i] == " " {
   156  			terms = append(terms[:i], terms[i+1:]...)
   157  			i--
   158  		}
   159  	}
   160  
   161  	alpha := tokenizeWord(in)
   162  	return append(terms, alpha...)
   163  }
   164  
   165  // tokenizeWordWithWildcards splits on any non-alphanumerical except wildcard-symbols and
   166  // lowercases the words
   167  func tokenizeWordWithWildcards(in string) []string {
   168  	terms := strings.FieldsFunc(in, func(r rune) bool {
   169  		return !unicode.IsLetter(r) && !unicode.IsNumber(r) && r != '?' && r != '*'
   170  	})
   171  	return lowercase(terms)
   172  }
   173  
   174  // tokenizetrigramWithWildcards splits on any non-alphanumerical and lowercases the words, applies any wildcards, then joins them together, then groups them into trigrams
   175  // this is unlikely to be useful, but is included for completeness
   176  func tokenizetrigramWithWildcards(in string) []string {
   177  	terms := tokenizeWordWithWildcards(in)
   178  	inputString := strings.Join(terms, "")
   179  	var trigrams []string
   180  	for i := 0; i < len(inputString)-2; i++ {
   181  		trigrams = append(trigrams, inputString[i:i+3])
   182  	}
   183  	return trigrams
   184  }
   185  
   186  func lowercase(terms []string) []string {
   187  	for i := range terms {
   188  		terms[i] = strings.ToLower(terms[i])
   189  	}
   190  	return terms
   191  }
   192  
   193  func TokenizeAndCountDuplicates(tokenization string, in string) ([]string, []int) {
   194  	counts := map[string]int{}
   195  	for _, term := range Tokenize(tokenization, in) {
   196  		counts[term]++
   197  	}
   198  
   199  	unique := make([]string, len(counts))
   200  	boosts := make([]int, len(counts))
   201  
   202  	i := 0
   203  	for term, boost := range counts {
   204  		unique[i] = term
   205  		boosts[i] = boost
   206  		i++
   207  	}
   208  
   209  	return unique, boosts
   210  }