github.com/weaviate/weaviate@v1.24.6/adapters/repos/db/inverted/config.go (about)

     1  //                           _       _
     2  // __      _____  __ ___   ___  __ _| |_ ___
     3  // \ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \
     4  //  \ V  V /  __/ (_| |\ V /| | (_| | ||  __/
     5  //   \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___|
     6  //
     7  //  Copyright © 2016 - 2024 Weaviate B.V. All rights reserved.
     8  //
     9  //  CONTACT: hello@weaviate.io
    10  //
    11  
    12  package inverted
    13  
    14  import (
    15  	"runtime"
    16  	"strings"
    17  
    18  	"github.com/pkg/errors"
    19  	"github.com/weaviate/weaviate/adapters/repos/db/inverted/stopwords"
    20  	"github.com/weaviate/weaviate/entities/models"
    21  	"github.com/weaviate/weaviate/entities/schema"
    22  	"github.com/weaviate/weaviate/usecases/config"
    23  )
    24  
// _NUMCPU caches the result of runtime.NumCPU(), evaluated once when the
// package is initialized. It is not referenced in this file.
// NOTE(review): presumably consumed by other files of this package — confirm.
var _NUMCPU = runtime.NumCPU()
    26  
    27  func ValidateConfig(conf *models.InvertedIndexConfig) error {
    28  	if conf.CleanupIntervalSeconds < 0 {
    29  		return errors.Errorf("cleanup interval seconds must be > 0")
    30  	}
    31  
    32  	err := validateBM25Config(conf.Bm25)
    33  	if err != nil {
    34  		return err
    35  	}
    36  
    37  	err = validateStopwordConfig(conf.Stopwords)
    38  	if err != nil {
    39  		return err
    40  	}
    41  
    42  	return nil
    43  }
    44  
    45  func ConfigFromModel(iicm *models.InvertedIndexConfig) schema.InvertedIndexConfig {
    46  	var conf schema.InvertedIndexConfig
    47  
    48  	conf.IndexTimestamps = iicm.IndexTimestamps
    49  	conf.IndexNullState = iicm.IndexNullState
    50  	conf.IndexPropertyLength = iicm.IndexPropertyLength
    51  
    52  	if iicm.Bm25 == nil {
    53  		conf.BM25.K1 = float64(config.DefaultBM25k1)
    54  		conf.BM25.B = float64(config.DefaultBM25b)
    55  	} else {
    56  		conf.BM25.K1 = float64(iicm.Bm25.K1)
    57  		conf.BM25.B = float64(iicm.Bm25.B)
    58  	}
    59  
    60  	if iicm.Stopwords == nil {
    61  		conf.Stopwords = models.StopwordConfig{
    62  			Preset: stopwords.EnglishPreset,
    63  		}
    64  	} else {
    65  		conf.Stopwords.Preset = iicm.Stopwords.Preset
    66  		conf.Stopwords.Additions = iicm.Stopwords.Additions
    67  		conf.Stopwords.Removals = iicm.Stopwords.Removals
    68  	}
    69  
    70  	return conf
    71  }
    72  
    73  func validateBM25Config(conf *models.BM25Config) error {
    74  	if conf == nil {
    75  		return nil
    76  	}
    77  
    78  	if conf.K1 < 0 {
    79  		return errors.Errorf("BM25.k1 must be >= 0")
    80  	}
    81  	if conf.B < 0 || conf.B > 1 {
    82  		return errors.Errorf("BM25.b must be <= 0 and <= 1")
    83  	}
    84  
    85  	return nil
    86  }
    87  
    88  func validateStopwordConfig(conf *models.StopwordConfig) error {
    89  	if conf == nil {
    90  		conf = &models.StopwordConfig{}
    91  	}
    92  
    93  	if conf.Preset == "" {
    94  		conf.Preset = stopwords.EnglishPreset
    95  	}
    96  
    97  	if _, ok := stopwords.Presets[conf.Preset]; !ok {
    98  		return errors.Errorf("stopwordPreset '%s' does not exist", conf.Preset)
    99  	}
   100  
   101  	err := validateStopwordAdditionsRemovals(conf)
   102  	if err != nil {
   103  		return err
   104  	}
   105  
   106  	return nil
   107  }
   108  
   109  func validateStopwordAdditionsRemovals(conf *models.StopwordConfig) error {
   110  	// the same stopword cannot exist
   111  	// in both additions and removals
   112  	foundAdditions := make(map[string]int)
   113  
   114  	for idx, add := range conf.Additions {
   115  		if strings.TrimSpace(add) == "" {
   116  			return errors.Errorf("cannot use whitespace in stopword.additions")
   117  		}
   118  
   119  		// save the index of the addition since it
   120  		// is readily available here. we will need
   121  		// this below when trimming additions that
   122  		// already exist in the selected preset
   123  		foundAdditions[add] = idx
   124  	}
   125  
   126  	for _, rem := range conf.Removals {
   127  		if strings.TrimSpace(rem) == "" {
   128  			return errors.Errorf("cannot use whitespace in stopword.removals")
   129  		}
   130  
   131  		if _, ok := foundAdditions[rem]; ok {
   132  			return errors.Errorf(
   133  				"found '%s' in both stopwords.additions and stopwords.removals", rem)
   134  		}
   135  	}
   136  
   137  	removeStopwordAdditionsIfInPreset(conf, foundAdditions)
   138  	return nil
   139  }
   140  
   141  func removeStopwordAdditionsIfInPreset(conf *models.StopwordConfig, foundAdditions map[string]int) {
   142  	presets := stopwords.Presets[conf.Preset]
   143  
   144  	// if any of the elements in stopwords.additions
   145  	// already exist in the preset, mark it as to
   146  	// be removed
   147  	indicesToRemove := make(map[int]bool)
   148  	for _, preset := range presets {
   149  		if idx, ok := foundAdditions[preset]; ok {
   150  			indicesToRemove[idx] = true
   151  		}
   152  	}
   153  
   154  	if len(indicesToRemove) == 0 {
   155  		return
   156  	}
   157  
   158  	// take remaining additions, build new list
   159  	var trimmedAdditions []string
   160  	for idx, add := range conf.Additions {
   161  		if _, ok := indicesToRemove[idx]; !ok {
   162  			trimmedAdditions = append(trimmedAdditions, add)
   163  		}
   164  	}
   165  	conf.Additions = trimmedAdditions
   166  }