github.com/weaviate/weaviate@v1.24.6/adapters/repos/db/vector/hnsw/benchmark_test.go (about) 1 // _ _ 2 // __ _____ __ ___ ___ __ _| |_ ___ 3 // \ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \ 4 // \ V V / __/ (_| |\ V /| | (_| | || __/ 5 // \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___| 6 // 7 // Copyright © 2016 - 2024 Weaviate B.V. All rights reserved. 8 // 9 // CONTACT: hello@weaviate.io 10 // 11 12 package hnsw 13 14 import ( 15 "flag" 16 "io" 17 "net/http" 18 "net/url" 19 "os" 20 "path/filepath" 21 "sort" 22 "strconv" 23 "strings" 24 "testing" 25 "time" 26 27 "github.com/pkg/errors" 28 "github.com/stretchr/testify/require" 29 "github.com/weaviate/weaviate/adapters/repos/db/vector/compressionhelpers" 30 "gopkg.in/yaml.v2" 31 ) 32 33 var download = flag.Bool("download", false, "download datasets if not found locally") 34 35 var datasets = map[string]string{ 36 "random-xs": "datasets/big-ann-benchmarks/random10000/data_10000_20", 37 "random-xs-clustered": "datasets/big-ann-benchmarks/random-clustered10000/clu-random.fbin.crop_nb_10000", 38 "msturing-1M": "datasets/big-ann-benchmarks/MSTuringANNS/base1b.fbin.crop_nb_1000000", 39 "msturing-10M": "datasets/big-ann-benchmarks/MSTuringANNS/base1b.fbin.crop_nb_10000000", 40 "msspacev-1M": "datasets/big-ann-benchmarks/MSSPACEV1B/spacev1b_base.i8bin.crop_nb_1000000", 41 "msspacev-10M": "datasets/big-ann-benchmarks/MSSPACEV1B/spacev1b_base.i8bin.crop_nb_10000000", 42 "msturing-10M-clustered": "datasets/big-ann-benchmarks/MSTuring-10M-clustered/msturing-10M-clustered.fbin", 43 } 44 45 var queries = map[string]string{ 46 "random-xs": "datasets/big-ann-benchmarks/random10000/queries_1000_20", 47 "random-xs-clustered": "datasets/big-ann-benchmarks/random-clustered10000/queries_1000_20.fbin", 48 "msturing-1M": "datasets/big-ann-benchmarks/MSTuringANNS/query100K.fbin", 49 "msturing-10M": "datasets/big-ann-benchmarks/MSTuringANNS/query100K.fbin", 50 "msspacev-1M": "datasets/big-ann-benchmarks/MSSPACEV1B/query.i8bin", 51 "msspacev-10M": "datasets/big-ann-benchmarks/MSSPACEV1B/query.i8bin", 52 "msturing-10M-clustered": "datasets/big-ann-benchmarks/MSTuring-10M-clustered/testQuery10K.fbin", 53 } 54 55 func BenchmarkHnswNeurips23(b *testing.B) { 56 runbooks := []string{ 57 "datasets/neurips23/simple_runbook.yaml", 58 "datasets/neurips23/clustered_runbook.yaml", 59 } 60 61 type datasetPoints struct { 62 dataset string 63 points int 64 } 65 66 readDatasets := make(map[datasetPoints][][]float32) 67 68 for _, runbookFile := range runbooks { 69 b.Run(runbookFile, func(b *testing.B) { 70 runbook := readRunbook(b, runbookFile) 71 72 for _, step := range runbook.Steps { 73 b.Run(step.Dataset, func(b *testing.B) { 74 // Read the dataset if we haven't already 75 vectors, ok := readDatasets[datasetPoints{step.Dataset, step.MaxPts}] 76 if !ok { 77 file, ok := datasets[step.Dataset] 78 if !ok { 79 b.Skipf("Neurips23 dataset %s not found", step.Dataset) 80 } 81 82 if _, err := os.Stat(file); err != nil { 83 if !*download { 84 b.Skipf(`Neurips23 dataset %s not found. 85 Run test with -download to automatically download the dataset. 86 Ex: go test -v -benchmem -bench ^BenchmarkHnswNeurips23$ -download`, step.Dataset) 87 } 88 downloadDataset(b, step.Dataset) 89 } 90 91 readDatasets[datasetPoints{step.Dataset, step.MaxPts}] = readBigAnnDataset(b, file, step.MaxPts) 92 vectors = readDatasets[datasetPoints{step.Dataset, step.MaxPts}] 93 } 94 95 var queryVectors [][]float32 96 97 b.ResetTimer() 98 99 for i := 0; i < b.N; i++ { 100 index := createEmptyHnswIndexForTests(b, idVectorSize(len(vectors[0]))) 101 102 for _, op := range step.Operations { 103 switch op.Operation { 104 case "insert": 105 compressionhelpers.Concurrently(uint64(op.End-op.Start), func(i uint64) { 106 err := index.Add(uint64(op.Start+int(i)), vectors[op.Start+int(i)]) 107 require.NoError(b, err) 108 }) 109 case "delete": 110 compressionhelpers.Concurrently(uint64(op.End-op.Start), func(i uint64) { 111 err := index.Delete(uint64(op.Start + int(i))) 112 require.NoError(b, err) 113 }) 114 case "search": 115 if len(queryVectors) == 0 { 116 file, ok := queries[step.Dataset] 117 if !ok { 118 b.Errorf("query file: not found for %s dataset", step.Dataset) 119 } 120 121 queryVectors = readBigAnnDataset(b, file, 0) 122 } 123 124 compressionhelpers.Concurrently(uint64(len(queryVectors)), func(i uint64) { 125 _, _, err := index.SearchByVector(queryVectors[i], 0, nil) 126 require.NoError(b, err) 127 }) 128 default: 129 b.Errorf("Unknown operation %s", op.Operation) 130 } 131 } 132 } 133 }) 134 } 135 }) 136 } 137 } 138 139 func downloadDataset(t testing.TB, name string) { 140 t.Helper() 141 142 ds, ok := datasets[name] 143 if !ok { 144 t.Fatalf("Dataset %s not found", name) 145 } 146 147 qs, ok := queries[name] 148 if !ok { 149 t.Fatalf("Query file not found for %s dataset", name) 150 } 151 152 for _, f := range []string{ds, qs} { 153 downloadDatasetFile(t, f) 154 } 155 } 156 157 func downloadDatasetFile(t testing.TB, file string) { 158 t.Helper() 159 160 if _, err := os.Stat(file); err == nil { 161 return 162 } 163 164 err := os.MkdirAll(filepath.Dir(file), 0o755) 165 require.NoError(t, err) 166 167 path := strings.TrimPrefix(file, "datasets/") 168 169 u, err := url.JoinPath("https://storage.googleapis.com/ann-datasets/", path) 170 require.NoError(t, err) 171 172 t.Logf("Downloading dataset from %s", u) 173 174 client := http.Client{ 175 Timeout: 60 * time.Second, 176 } 177 178 resp, err := client.Get(u) 179 require.NoError(t, err) 180 defer resp.Body.Close() 181 182 if resp.StatusCode != http.StatusOK { 183 t.Fatalf("Could not download dataset. Status code: %d", resp.StatusCode) 184 } 185 186 f, err := os.Create(file) 187 require.NoError(t, err) 188 defer f.Close() 189 190 _, err = io.Copy(f, resp.Body) 191 require.NoError(t, err) 192 193 t.Logf("Downloaded dataset %s", file) 194 } 195 196 func readBigAnnDataset(t testing.TB, file string, maxObjects int) [][]float32 { 197 t.Helper() 198 199 var vectors [][]float32 200 201 f, err := os.Open(file) 202 if err != nil { 203 panic(errors.Wrap(err, "Could not open SIFT file")) 204 } 205 defer f.Close() 206 207 fi, err := f.Stat() 208 if err != nil { 209 panic(errors.Wrap(err, "Could not get SIFT file properties")) 210 } 211 fileSize := fi.Size() 212 213 b := make([]byte, 4) 214 215 // The data is a binary file containing either floating point vectors or int8 vectors 216 // It starts with 8 bytes of header data 217 // The first 4 bytes are the number of vectors in the file 218 // The second 4 bytes are the dimensionality of the vectors in the file 219 // If the file is in fbin format, the vector data needs to be converted from bytes to float. 220 // If the file is in i8bin format, the vector data needs to be converted from bytes to int8 then to float. 221 222 // The first 4 bytes are the number of vectors in the file 223 _, err = f.Read(b) 224 require.NoError(t, err) 225 n := int32FromBytes(b) 226 227 // The second 4 bytes are the dimensionality of the vectors in the file 228 _, err = f.Read(b) 229 require.NoError(t, err) 230 d := int32FromBytes(b) 231 232 var bytesPerVector int 233 switch { 234 case strings.Contains(file, "i8bin"): 235 bytesPerVector = 1 236 case strings.Contains(file, "fbin"): 237 fallthrough 238 default: 239 bytesPerVector = 4 240 } 241 242 require.Equal(t, 8+n*d*bytesPerVector, int(fileSize)) 243 244 vectorBytes := make([]byte, d*bytesPerVector) 245 if maxObjects > 0 && maxObjects < n { 246 n = maxObjects 247 } 248 249 for i := 0; i < n; i++ { 250 _, err = f.Read(vectorBytes) 251 if err == io.EOF { 252 break 253 } 254 require.NoError(t, err) 255 256 vectorFloat := make([]float32, 0, d) 257 for j := 0; j < d; j++ { 258 start := j * bytesPerVector 259 var f float32 260 if bytesPerVector == 1 { 261 f = float32(vectorBytes[start]) 262 } else { 263 f = float32FromBytes(vectorBytes[start : start+bytesPerVector]) 264 } 265 266 vectorFloat = append(vectorFloat, f) 267 } 268 269 vectors = append(vectors, vectorFloat) 270 } 271 272 if maxObjects > 0 { 273 require.Equal(t, maxObjects, len(vectors)) 274 } 275 276 return vectors 277 } 278 279 type runbook struct { 280 Steps []runbookStep 281 } 282 type runbookStep struct { 283 Dataset string 284 MaxPts int 285 Operations []runbookOperation 286 } 287 288 type runbookOperation struct { 289 Operation string 290 Start int 291 End int 292 } 293 294 func readRunbook(t testing.TB, file string) *runbook { 295 f, err := os.Open(file) 296 require.NoError(t, err, "Could not open runbook file") 297 defer f.Close() 298 299 d := yaml.NewDecoder(f) 300 301 var runbook runbook 302 303 var m map[string]map[string]any 304 err = d.Decode(&m) 305 require.NoError(t, err) 306 307 var datasets []string 308 for datasetName := range m { 309 datasets = append(datasets, datasetName) 310 } 311 312 sort.Strings(datasets) 313 314 for _, datasetName := range datasets { 315 stepInfo := m[datasetName] 316 var step runbookStep 317 318 step.Dataset = datasetName 319 step.MaxPts = stepInfo["max_pts"].(int) 320 i := 1 321 for { 322 s := strconv.Itoa(i) 323 if _, ok := stepInfo[s]; !ok { 324 break 325 } 326 327 opInfo := stepInfo[s].(map[any]any) 328 329 var op runbookOperation 330 op.Operation = opInfo["operation"].(string) 331 if op.Operation == "insert" || op.Operation == "delete" { 332 op.Start = opInfo["start"].(int) 333 op.End = opInfo["end"].(int) 334 } 335 336 step.Operations = append(step.Operations, op) 337 338 i++ 339 } 340 341 runbook.Steps = append(runbook.Steps, step) 342 } 343 344 return &runbook 345 }