github.com/weaviate/weaviate@v1.24.6/adapters/repos/db/clusterintegrationtest/helpers_for_test.go (about)

     1  //                           _       _
     2  // __      _____  __ ___   ___  __ _| |_ ___
     3  // \ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \
     4  //  \ V  V /  __/ (_| |\ V /| | (_| | ||  __/
     5  //   \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___|
     6  //
     7  //  Copyright © 2016 - 2024 Weaviate B.V. All rights reserved.
     8  //
     9  //  CONTACT: hello@weaviate.io
    10  //
    11  
    12  //go:build integrationTest
    13  // +build integrationTest
    14  
    15  package clusterintegrationtest
    16  
    17  import (
    18  	"context"
    19  	"encoding/json"
    20  	"fmt"
    21  	"math"
    22  	"math/rand"
    23  	"sort"
    24  	"testing"
    25  	"time"
    26  
    27  	"github.com/go-openapi/strfmt"
    28  	"github.com/google/uuid"
    29  	"github.com/stretchr/testify/require"
    30  	"github.com/weaviate/weaviate/adapters/repos/db"
    31  	"github.com/weaviate/weaviate/adapters/repos/db/vector/hnsw/distancer"
    32  	"github.com/weaviate/weaviate/entities/additional"
    33  	"github.com/weaviate/weaviate/entities/models"
    34  	"github.com/weaviate/weaviate/entities/schema"
    35  	"github.com/weaviate/weaviate/entities/schema/crossref"
    36  	enthnsw "github.com/weaviate/weaviate/entities/vectorindex/hnsw"
    37  	"github.com/weaviate/weaviate/usecases/objects"
    38  	"github.com/weaviate/weaviate/usecases/sharding"
    39  )
    40  
    41  func getRandomSeed() *rand.Rand {
    42  	return rand.New(rand.NewSource(time.Now().UnixNano()))
    43  }
    44  
    45  func setupDirectory(t *testing.T) string {
    46  	dirName := t.TempDir()
    47  	return dirName
    48  }
    49  
    50  func dataAsBatch(data []*models.Object) objects.BatchObjects {
    51  	batchObjs := make(objects.BatchObjects, len(data))
    52  	for i := range data {
    53  		batchObjs[i] = objects.BatchObject{
    54  			OriginalIndex: i,
    55  			Err:           nil,
    56  			Object:        data[i],
    57  			UUID:          data[i].ID,
    58  		}
    59  	}
    60  
    61  	return batchObjs
    62  }
    63  
    64  func dataAsBatchWithProps(data []*models.Object, props []string) objects.BatchObjects {
    65  	batchObjs := make(objects.BatchObjects, len(data))
    66  	for i := range data {
    67  		batchObjs[i] = objects.BatchObject{
    68  			OriginalIndex: i,
    69  			Err:           nil,
    70  			Object:        copyObjectWithProp(data[i], props),
    71  			UUID:          data[i].ID,
    72  		}
    73  	}
    74  
    75  	return batchObjs
    76  }
    77  
    78  // copyObjectWithProp is not a 100% copy. It may still contain the same
    79  // pointers in some properties, it does however guarantee that it does not
    80  // alter the existing input - this guarantee is lost, if you modify the output
    81  func copyObjectWithProp(in *models.Object, propsToCopy []string) *models.Object {
    82  	out := &models.Object{}
    83  
    84  	out.Additional = in.Additional
    85  	out.Class = in.Class
    86  	out.Vector = in.Vector
    87  	out.CreationTimeUnix = in.CreationTimeUnix
    88  	out.LastUpdateTimeUnix = in.LastUpdateTimeUnix
    89  	out.ID = in.ID
    90  	props := map[string]interface{}{}
    91  
    92  	for _, propName := range propsToCopy {
    93  		props[propName] = in.Properties.(map[string]interface{})[propName]
    94  	}
    95  
    96  	out.Properties = props
    97  	return out
    98  }
    99  
   100  func multiShardState(nodeCount int) *sharding.State {
   101  	config, err := sharding.ParseConfig(map[string]interface{}{
   102  		"desiredCount": json.Number(fmt.Sprintf("%d", nodeCount)),
   103  	}, 1)
   104  	if err != nil {
   105  		panic(err)
   106  	}
   107  
   108  	nodeList := make([]string, nodeCount)
   109  	for i := range nodeList {
   110  		nodeList[i] = fmt.Sprintf("node-%d", i)
   111  	}
   112  
   113  	s, err := sharding.InitState("multi-shard-test-index", config,
   114  		fakeNodes{nodeList}, 1, false)
   115  	if err != nil {
   116  		panic(err)
   117  	}
   118  
   119  	return s
   120  }
   121  
   122  func class() *models.Class {
   123  	cfg := enthnsw.NewDefaultUserConfig()
   124  	cfg.EF = 500
   125  	return &models.Class{
   126  		Class:               distributedClass,
   127  		VectorIndexConfig:   cfg,
   128  		InvertedIndexConfig: invertedConfig(),
   129  		Properties: []*models.Property{
   130  			{
   131  				Name:         "description",
   132  				DataType:     schema.DataTypeText.PropString(),
   133  				Tokenization: models.PropertyTokenizationWord,
   134  			},
   135  			{
   136  				Name:         "other_property",
   137  				DataType:     schema.DataTypeText.PropString(),
   138  				Tokenization: models.PropertyTokenizationWord,
   139  			},
   140  			{
   141  				Name:     "date_property",
   142  				DataType: schema.DataTypeDate.PropString(),
   143  			},
   144  			{
   145  				Name:     "date_array_property",
   146  				DataType: schema.DataTypeDateArray.PropString(),
   147  			},
   148  			{
   149  				Name:     "int_property",
   150  				DataType: schema.DataTypeInt.PropString(),
   151  			},
   152  			{
   153  				Name:     "phone_property",
   154  				DataType: schema.DataTypePhoneNumber.PropString(),
   155  			},
   156  		},
   157  	}
   158  }
   159  
   160  func secondClassWithRef() *models.Class {
   161  	cfg := enthnsw.NewDefaultUserConfig()
   162  	cfg.EF = 500
   163  	return &models.Class{
   164  		Class:               "SecondDistributed",
   165  		VectorIndexConfig:   cfg,
   166  		InvertedIndexConfig: invertedConfig(),
   167  		Properties: []*models.Property{
   168  			{
   169  				Name:     "description",
   170  				DataType: []string{string(schema.DataTypeText)},
   171  			},
   172  			{
   173  				Name:     "toFirst",
   174  				DataType: []string{distributedClass},
   175  			},
   176  		},
   177  	}
   178  }
   179  
   180  func invertedConfig() *models.InvertedIndexConfig {
   181  	return &models.InvertedIndexConfig{
   182  		CleanupIntervalSeconds: 60,
   183  	}
   184  }
   185  
   186  func exampleData(size int) []*models.Object {
   187  	out := make([]*models.Object, size)
   188  
   189  	for i := range out {
   190  		vec := make([]float32, vectorDims)
   191  		for i := range vec {
   192  			vec[i] = rand.Float32()
   193  		}
   194  
   195  		timestamp := time.Unix(0, 0).Add(time.Duration(i) * time.Hour)
   196  		phoneNumber := uint64(1000000 + rand.Intn(10000))
   197  
   198  		out[i] = &models.Object{
   199  			Class: distributedClass,
   200  			ID:    strfmt.UUID(uuid.New().String()),
   201  			Properties: map[string]interface{}{
   202  				"description":         fmt.Sprintf("object-%d", i),
   203  				"date_property":       timestamp,
   204  				"date_array_property": []interface{}{timestamp},
   205  				"int_property":        rand.Intn(1000),
   206  				"phone_property": &models.PhoneNumber{
   207  					CountryCode:            49,
   208  					DefaultCountry:         "DE",
   209  					Input:                  fmt.Sprintf("0171 %d", phoneNumber),
   210  					Valid:                  true,
   211  					InternationalFormatted: fmt.Sprintf("+49 171 %d", phoneNumber),
   212  					National:               phoneNumber,
   213  					NationalFormatted:      fmt.Sprintf("0171 %d", phoneNumber),
   214  				},
   215  			},
   216  			Vector: vec,
   217  		}
   218  	}
   219  
   220  	return out
   221  }
   222  
   223  func exampleDataWithRefs(size int, refCount int, targetObjs []*models.Object) []*models.Object {
   224  	out := make([]*models.Object, size)
   225  
   226  	for i := range out {
   227  		vec := make([]float32, vectorDims)
   228  		for i := range vec {
   229  			vec[i] = rand.Float32()
   230  		}
   231  
   232  		refs := make(models.MultipleRef, refCount)
   233  		for i := range refs {
   234  			randomTarget := targetObjs[rand.Intn(len(targetObjs))]
   235  			refs[i] = crossref.New("localhost", distributedClass, randomTarget.ID).SingleRef()
   236  		}
   237  
   238  		out[i] = &models.Object{
   239  			Class: "SecondDistributed",
   240  			ID:    strfmt.UUID(uuid.New().String()),
   241  			Properties: map[string]interface{}{
   242  				"description": fmt.Sprintf("second-object-%d", i),
   243  				"toFirst":     refs,
   244  			},
   245  			Vector: vec,
   246  		}
   247  	}
   248  
   249  	return out
   250  }
   251  
   252  func bruteForceObjectsByQuery(objs []*models.Object,
   253  	query []float32,
   254  ) []*models.Object {
   255  	type distanceAndObj struct {
   256  		distance float32
   257  		obj      *models.Object
   258  	}
   259  
   260  	distProv := distancer.NewCosineDistanceProvider()
   261  	distances := make([]distanceAndObj, len(objs))
   262  
   263  	for i := range objs {
   264  		dist, _, _ := distProv.SingleDist(normalize(query), normalize(objs[i].Vector))
   265  		distances[i] = distanceAndObj{
   266  			distance: dist,
   267  			obj:      objs[i],
   268  		}
   269  	}
   270  
   271  	sort.Slice(distances, func(a, b int) bool {
   272  		return distances[a].distance < distances[b].distance
   273  	})
   274  
   275  	out := make([]*models.Object, len(objs))
   276  	for i := range out {
   277  		out[i] = distances[i].obj
   278  	}
   279  
   280  	return out
   281  }
   282  
   283  func normalize(v []float32) []float32 {
   284  	var norm float32
   285  	for i := range v {
   286  		norm += v[i] * v[i]
   287  	}
   288  
   289  	norm = float32(math.Sqrt(float64(norm)))
   290  	for i := range v {
   291  		v[i] = v[i] / norm
   292  	}
   293  
   294  	return v
   295  }
   296  
   297  func manuallyResolveRef(t *testing.T, obj *models.Object,
   298  	possibleTargets []*models.Object, localPropName,
   299  	referencedPropName string,
   300  	repo *db.DB,
   301  ) []map[string]interface{} {
   302  	beacons := obj.Properties.(map[string]interface{})[localPropName].(models.MultipleRef)
   303  	out := make([]map[string]interface{}, len(beacons))
   304  
   305  	for i, ref := range beacons {
   306  		parsed, err := crossref.Parse(ref.Beacon.String())
   307  		require.Nil(t, err)
   308  		target := findId(possibleTargets, parsed.TargetID)
   309  		require.NotNil(t, target, "target not found")
   310  		if referencedPropName == "vector" {
   311  			// find referenced object to get his actual vector from DB
   312  			require.NotNil(t, repo)
   313  			res, err := repo.Object(context.Background(), parsed.Class, parsed.TargetID,
   314  				nil, additional.Properties{Vector: true}, nil, "")
   315  			require.Nil(t, err)
   316  			require.NotNil(t, res)
   317  			out[i] = map[string]interface{}{
   318  				referencedPropName: res.Vector,
   319  			}
   320  		} else {
   321  			out[i] = map[string]interface{}{
   322  				referencedPropName: target.Properties.(map[string]interface{})[referencedPropName],
   323  			}
   324  		}
   325  	}
   326  
   327  	return out
   328  }
   329  
   330  func findId(list []*models.Object, id strfmt.UUID) *models.Object {
   331  	for _, obj := range list {
   332  		if obj.ID == id {
   333  			return obj
   334  		}
   335  	}
   336  
   337  	return nil
   338  }
   339  
   340  func refsAsBatch(in []*models.Object, propName string) objects.BatchReferences {
   341  	out := objects.BatchReferences{}
   342  
   343  	originalIndex := 0
   344  	for _, obj := range in {
   345  		beacons := obj.Properties.(map[string]interface{})[propName].(models.MultipleRef)
   346  		current := make(objects.BatchReferences, len(beacons))
   347  		for i, beacon := range beacons {
   348  			to, err := crossref.Parse(beacon.Beacon.String())
   349  			if err != nil {
   350  				panic(err)
   351  			}
   352  			current[i] = objects.BatchReference{
   353  				OriginalIndex: originalIndex,
   354  				To:            to,
   355  				From: crossref.NewSource(schema.ClassName(obj.Class),
   356  					schema.PropertyName(propName), obj.ID),
   357  			}
   358  			originalIndex++
   359  		}
   360  		out = append(out, current...)
   361  	}
   362  
   363  	return out
   364  }