github.com/weaviate/weaviate@v1.24.6/adapters/repos/db/vector/hnsw/benchmark_test.go (about)

     1  //                           _       _
     2  // __      _____  __ ___   ___  __ _| |_ ___
     3  // \ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \
     4  //  \ V  V /  __/ (_| |\ V /| | (_| | ||  __/
     5  //   \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___|
     6  //
     7  //  Copyright © 2016 - 2024 Weaviate B.V. All rights reserved.
     8  //
     9  //  CONTACT: hello@weaviate.io
    10  //
    11  
    12  package hnsw
    13  
    14  import (
    15  	"flag"
    16  	"io"
    17  	"net/http"
    18  	"net/url"
    19  	"os"
    20  	"path/filepath"
    21  	"sort"
    22  	"strconv"
    23  	"strings"
    24  	"testing"
    25  	"time"
    26  
    27  	"github.com/pkg/errors"
    28  	"github.com/stretchr/testify/require"
    29  	"github.com/weaviate/weaviate/adapters/repos/db/vector/compressionhelpers"
    30  	"gopkg.in/yaml.v2"
    31  )
    32  
// download controls whether missing benchmark datasets are fetched from the
// public ann-datasets bucket instead of skipping the benchmark.
var download = flag.Bool("download", false, "download datasets if not found locally")

// datasets maps a dataset name to the local path of its base-vector file
// (big-ann-benchmarks binary format; see readBigAnnDataset for the layout).
var datasets = map[string]string{
	"random-xs":              "datasets/big-ann-benchmarks/random10000/data_10000_20",
	"random-xs-clustered":    "datasets/big-ann-benchmarks/random-clustered10000/clu-random.fbin.crop_nb_10000",
	"msturing-1M":            "datasets/big-ann-benchmarks/MSTuringANNS/base1b.fbin.crop_nb_1000000",
	"msturing-10M":           "datasets/big-ann-benchmarks/MSTuringANNS/base1b.fbin.crop_nb_10000000",
	"msspacev-1M":            "datasets/big-ann-benchmarks/MSSPACEV1B/spacev1b_base.i8bin.crop_nb_1000000",
	"msspacev-10M":           "datasets/big-ann-benchmarks/MSSPACEV1B/spacev1b_base.i8bin.crop_nb_10000000",
	"msturing-10M-clustered": "datasets/big-ann-benchmarks/MSTuring-10M-clustered/msturing-10M-clustered.fbin",
}

// queries maps a dataset name to the local path of its query-vector file,
// in the same binary format as the base vectors.
var queries = map[string]string{
	"random-xs":              "datasets/big-ann-benchmarks/random10000/queries_1000_20",
	"random-xs-clustered":    "datasets/big-ann-benchmarks/random-clustered10000/queries_1000_20.fbin",
	"msturing-1M":            "datasets/big-ann-benchmarks/MSTuringANNS/query100K.fbin",
	"msturing-10M":           "datasets/big-ann-benchmarks/MSTuringANNS/query100K.fbin",
	"msspacev-1M":            "datasets/big-ann-benchmarks/MSSPACEV1B/query.i8bin",
	"msspacev-10M":           "datasets/big-ann-benchmarks/MSSPACEV1B/query.i8bin",
	"msturing-10M-clustered": "datasets/big-ann-benchmarks/MSTuring-10M-clustered/testQuery10K.fbin",
}
    54  
// BenchmarkHnswNeurips23 replays the Neurips'23 streaming-ANN runbooks
// (ordered sequences of insert/delete/search operations) against a fresh
// HNSW index, one sub-benchmark per runbook and per dataset step.
func BenchmarkHnswNeurips23(b *testing.B) {
	runbooks := []string{
		"datasets/neurips23/simple_runbook.yaml",
		"datasets/neurips23/clustered_runbook.yaml",
	}

	// Cache key: the same dataset can appear in several steps with a
	// different point budget, so vectors are cached per (dataset, maxPts).
	type datasetPoints struct {
		dataset string
		points  int
	}

	// Shared across runbooks so a dataset file is read from disk at most
	// once per (dataset, maxPts) combination.
	readDatasets := make(map[datasetPoints][][]float32)

	for _, runbookFile := range runbooks {
		b.Run(runbookFile, func(b *testing.B) {
			runbook := readRunbook(b, runbookFile)

			for _, step := range runbook.Steps {
				b.Run(step.Dataset, func(b *testing.B) {
					// Read the dataset if we haven't already
					vectors, ok := readDatasets[datasetPoints{step.Dataset, step.MaxPts}]
					if !ok {
						file, ok := datasets[step.Dataset]
						if !ok {
							// Dataset name not in the known map: skip rather
							// than fail, so new runbook entries don't break CI.
							b.Skipf("Neurips23 dataset %s not found", step.Dataset)
						}

						// File missing on disk: download only when explicitly
						// requested via -download, otherwise skip.
						if _, err := os.Stat(file); err != nil {
							if !*download {
								b.Skipf(`Neurips23 dataset %s not found.
Run test with -download to automatically download the dataset.
Ex: go test -v -benchmem -bench ^BenchmarkHnswNeurips23$ -download`, step.Dataset)
							}
							downloadDataset(b, step.Dataset)
						}

						readDatasets[datasetPoints{step.Dataset, step.MaxPts}] = readBigAnnDataset(b, file, step.MaxPts)
						vectors = readDatasets[datasetPoints{step.Dataset, step.MaxPts}]
					}

					// Query vectors are loaded lazily on the first "search"
					// operation of this step.
					// NOTE(review): that lazy load happens inside the timed
					// region below, so the first iteration's measurement
					// includes the query-file read — consider hoisting the
					// load above ResetTimer.
					var queryVectors [][]float32

					// Exclude dataset reading/downloading from the timing.
					b.ResetTimer()

					for i := 0; i < b.N; i++ {
						// Each iteration replays the entire step on a fresh,
						// empty index sized for the dataset's dimensionality.
						index := createEmptyHnswIndexForTests(b, idVectorSize(len(vectors[0])))

						for _, op := range step.Operations {
							switch op.Operation {
							case "insert":
								// Insert ids [op.Start, op.End) concurrently;
								// the vector id doubles as its slice index.
								compressionhelpers.Concurrently(uint64(op.End-op.Start), func(i uint64) {
									err := index.Add(uint64(op.Start+int(i)), vectors[op.Start+int(i)])
									require.NoError(b, err)
								})
							case "delete":
								// Delete ids [op.Start, op.End) concurrently.
								compressionhelpers.Concurrently(uint64(op.End-op.Start), func(i uint64) {
									err := index.Delete(uint64(op.Start + int(i)))
									require.NoError(b, err)
								})
							case "search":
								if len(queryVectors) == 0 {
									file, ok := queries[step.Dataset]
									if !ok {
										// NOTE(review): b.Errorf does not stop
										// execution; readBigAnnDataset below
										// will then be called with an empty
										// path — consider b.Fatalf here.
										b.Errorf("query file: not found for %s dataset", step.Dataset)
									}

									queryVectors = readBigAnnDataset(b, file, 0)
								}

								// One search per query vector, run concurrently
								// (limit argument 0; semantics of a zero limit
								// are defined by SearchByVector).
								compressionhelpers.Concurrently(uint64(len(queryVectors)), func(i uint64) {
									_, _, err := index.SearchByVector(queryVectors[i], 0, nil)
									require.NoError(b, err)
								})
							default:
								b.Errorf("Unknown operation %s", op.Operation)
							}
						}
					}
				})
			}
		})
	}
}
   138  
   139  func downloadDataset(t testing.TB, name string) {
   140  	t.Helper()
   141  
   142  	ds, ok := datasets[name]
   143  	if !ok {
   144  		t.Fatalf("Dataset %s not found", name)
   145  	}
   146  
   147  	qs, ok := queries[name]
   148  	if !ok {
   149  		t.Fatalf("Query file not found for %s dataset", name)
   150  	}
   151  
   152  	for _, f := range []string{ds, qs} {
   153  		downloadDatasetFile(t, f)
   154  	}
   155  }
   156  
   157  func downloadDatasetFile(t testing.TB, file string) {
   158  	t.Helper()
   159  
   160  	if _, err := os.Stat(file); err == nil {
   161  		return
   162  	}
   163  
   164  	err := os.MkdirAll(filepath.Dir(file), 0o755)
   165  	require.NoError(t, err)
   166  
   167  	path := strings.TrimPrefix(file, "datasets/")
   168  
   169  	u, err := url.JoinPath("https://storage.googleapis.com/ann-datasets/", path)
   170  	require.NoError(t, err)
   171  
   172  	t.Logf("Downloading dataset from %s", u)
   173  
   174  	client := http.Client{
   175  		Timeout: 60 * time.Second,
   176  	}
   177  
   178  	resp, err := client.Get(u)
   179  	require.NoError(t, err)
   180  	defer resp.Body.Close()
   181  
   182  	if resp.StatusCode != http.StatusOK {
   183  		t.Fatalf("Could not download dataset. Status code: %d", resp.StatusCode)
   184  	}
   185  
   186  	f, err := os.Create(file)
   187  	require.NoError(t, err)
   188  	defer f.Close()
   189  
   190  	_, err = io.Copy(f, resp.Body)
   191  	require.NoError(t, err)
   192  
   193  	t.Logf("Downloaded dataset %s", file)
   194  }
   195  
   196  func readBigAnnDataset(t testing.TB, file string, maxObjects int) [][]float32 {
   197  	t.Helper()
   198  
   199  	var vectors [][]float32
   200  
   201  	f, err := os.Open(file)
   202  	if err != nil {
   203  		panic(errors.Wrap(err, "Could not open SIFT file"))
   204  	}
   205  	defer f.Close()
   206  
   207  	fi, err := f.Stat()
   208  	if err != nil {
   209  		panic(errors.Wrap(err, "Could not get SIFT file properties"))
   210  	}
   211  	fileSize := fi.Size()
   212  
   213  	b := make([]byte, 4)
   214  
   215  	// The data is a binary file containing either floating point vectors or int8 vectors
   216  	// It starts with 8 bytes of header data
   217  	// The first 4 bytes are the number of vectors in the file
   218  	// The second 4 bytes are the dimensionality of the vectors in the file
   219  	// If the file is in fbin format, the vector data needs to be converted from bytes to float.
   220  	// If the file is in i8bin format, the vector data needs to be converted from bytes to int8 then to float.
   221  
   222  	// The first 4 bytes are the number of vectors in the file
   223  	_, err = f.Read(b)
   224  	require.NoError(t, err)
   225  	n := int32FromBytes(b)
   226  
   227  	// The second 4 bytes are the dimensionality of the vectors in the file
   228  	_, err = f.Read(b)
   229  	require.NoError(t, err)
   230  	d := int32FromBytes(b)
   231  
   232  	var bytesPerVector int
   233  	switch {
   234  	case strings.Contains(file, "i8bin"):
   235  		bytesPerVector = 1
   236  	case strings.Contains(file, "fbin"):
   237  		fallthrough
   238  	default:
   239  		bytesPerVector = 4
   240  	}
   241  
   242  	require.Equal(t, 8+n*d*bytesPerVector, int(fileSize))
   243  
   244  	vectorBytes := make([]byte, d*bytesPerVector)
   245  	if maxObjects > 0 && maxObjects < n {
   246  		n = maxObjects
   247  	}
   248  
   249  	for i := 0; i < n; i++ {
   250  		_, err = f.Read(vectorBytes)
   251  		if err == io.EOF {
   252  			break
   253  		}
   254  		require.NoError(t, err)
   255  
   256  		vectorFloat := make([]float32, 0, d)
   257  		for j := 0; j < d; j++ {
   258  			start := j * bytesPerVector
   259  			var f float32
   260  			if bytesPerVector == 1 {
   261  				f = float32(vectorBytes[start])
   262  			} else {
   263  				f = float32FromBytes(vectorBytes[start : start+bytesPerVector])
   264  			}
   265  
   266  			vectorFloat = append(vectorFloat, f)
   267  		}
   268  
   269  		vectors = append(vectors, vectorFloat)
   270  	}
   271  
   272  	if maxObjects > 0 {
   273  		require.Equal(t, maxObjects, len(vectors))
   274  	}
   275  
   276  	return vectors
   277  }
   278  
// runbook models a Neurips'23 streaming-benchmark runbook: an ordered list
// of per-dataset steps parsed from YAML (see readRunbook).
type runbook struct {
	Steps []runbookStep
}

// runbookStep holds the operation sequence to replay against one dataset.
type runbookStep struct {
	// Dataset is the dataset name, used as the key into the package-level
	// datasets and queries path maps.
	Dataset    string
	// MaxPts caps how many points are loaded from the dataset file.
	MaxPts     int
	// Operations are executed in runbook order.
	Operations []runbookOperation
}

// runbookOperation is a single action within a step. Operation is one of
// "insert", "delete", or "search"; Start/End give the half-open id range
// [Start, End) for insert and delete, and are unused for search.
type runbookOperation struct {
	Operation string
	Start     int
	End       int
}
   293  
   294  func readRunbook(t testing.TB, file string) *runbook {
   295  	f, err := os.Open(file)
   296  	require.NoError(t, err, "Could not open runbook file")
   297  	defer f.Close()
   298  
   299  	d := yaml.NewDecoder(f)
   300  
   301  	var runbook runbook
   302  
   303  	var m map[string]map[string]any
   304  	err = d.Decode(&m)
   305  	require.NoError(t, err)
   306  
   307  	var datasets []string
   308  	for datasetName := range m {
   309  		datasets = append(datasets, datasetName)
   310  	}
   311  
   312  	sort.Strings(datasets)
   313  
   314  	for _, datasetName := range datasets {
   315  		stepInfo := m[datasetName]
   316  		var step runbookStep
   317  
   318  		step.Dataset = datasetName
   319  		step.MaxPts = stepInfo["max_pts"].(int)
   320  		i := 1
   321  		for {
   322  			s := strconv.Itoa(i)
   323  			if _, ok := stepInfo[s]; !ok {
   324  				break
   325  			}
   326  
   327  			opInfo := stepInfo[s].(map[any]any)
   328  
   329  			var op runbookOperation
   330  			op.Operation = opInfo["operation"].(string)
   331  			if op.Operation == "insert" || op.Operation == "delete" {
   332  				op.Start = opInfo["start"].(int)
   333  				op.End = opInfo["end"].(int)
   334  			}
   335  
   336  			step.Operations = append(step.Operations, op)
   337  
   338  			i++
   339  		}
   340  
   341  		runbook.Steps = append(runbook.Steps, step)
   342  	}
   343  
   344  	return &runbook
   345  }