github.com/weaviate/weaviate@v1.24.6/usecases/schema/add.go (about) 1 // _ _ 2 // __ _____ __ ___ ___ __ _| |_ ___ 3 // \ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \ 4 // \ V V / __/ (_| |\ V /| | (_| | || __/ 5 // \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___| 6 // 7 // Copyright © 2016 - 2024 Weaviate B.V. All rights reserved. 8 // 9 // CONTACT: hello@weaviate.io 10 // 11 12 /* Remark: 13 14 In the current implementation, there is no guarantee of consistent updates to the schema 15 as updating the actual index and the schema itself is not an atomic operation. 16 Resolving this issue is beyond the scope of this PR, 17 but it will be addressed in a separate task specifically dedicated to it. 18 */ 19 20 package schema 21 22 import ( 23 "context" 24 "encoding/json" 25 "fmt" 26 "os" 27 "strings" 28 29 "github.com/pkg/errors" 30 "github.com/prometheus/client_golang/prometheus" 31 "github.com/weaviate/weaviate/adapters/repos/db/inverted/stopwords" 32 "github.com/weaviate/weaviate/entities/backup" 33 "github.com/weaviate/weaviate/entities/models" 34 "github.com/weaviate/weaviate/entities/schema" 35 "github.com/weaviate/weaviate/usecases/config" 36 "github.com/weaviate/weaviate/usecases/monitoring" 37 "github.com/weaviate/weaviate/usecases/replica" 38 "github.com/weaviate/weaviate/usecases/sharding" 39 ) 40 41 // AddClass to the schema 42 func (m *Manager) AddClass(ctx context.Context, principal *models.Principal, 43 class *models.Class, 44 ) error { 45 err := m.Authorizer.Authorize(principal, "create", "schema/objects") 46 if err != nil { 47 return err 48 } 49 50 shardState, err := m.addClass(ctx, class) 51 if err != nil { 52 return err 53 } 54 55 // call to migrator needs to be outside the lock that is set in addClass 56 return m.migrator.AddClass(ctx, class, shardState) 57 // TODO gh-846: Rollback state update if migration fails 58 } 59 60 func (m *Manager) RestoreClass(ctx context.Context, d *backup.ClassDescriptor, nodeMapping map[string]string) error { 61 // get schema and sharding state 62 class := &models.Class{} 63 if err := json.Unmarshal(d.Schema, &class); err != nil { 64 return fmt.Errorf("marshal class schema: %w", err) 65 } 66 var shardingState sharding.State 67 if d.ShardingState != nil { 68 err := json.Unmarshal(d.ShardingState, &shardingState) 69 if err != nil { 70 return fmt.Errorf("marshal sharding state: %w", err) 71 } 72 } 73 74 m.Lock() 75 defer m.Unlock() 76 metric, err := monitoring.GetMetrics().BackupRestoreClassDurations.GetMetricWithLabelValues(class.Class) 77 if err == nil { 78 timer := prometheus.NewTimer(metric) 79 defer timer.ObserveDuration() 80 } 81 82 class.Class = schema.UppercaseClassName(class.Class) 83 class.Properties = schema.LowercaseAllPropertyNames(class.Properties) 84 85 m.setClassDefaults(class) 86 err = m.validateCanAddClass(ctx, class, true) 87 if err != nil { 88 return err 89 } 90 // migrate only after validation in completed 91 m.migrateClassSettings(class) 92 93 err = m.parseShardingConfig(ctx, class) 94 if err != nil { 95 return err 96 } 97 98 err = m.parseVectorIndexConfig(ctx, class) 99 if err != nil { 100 return err 101 } 102 103 err = m.invertedConfigValidator(class.InvertedIndexConfig) 104 if err != nil { 105 return err 106 } 107 108 shardingState.MigrateFromOldFormat() 109 shardingState.ApplyNodeMapping(nodeMapping) 110 111 payload, err := CreateClassPayload(class, &shardingState) 112 if err != nil { 113 return err 114 } 115 shardingState.SetLocalName(m.clusterState.LocalName()) 116 m.schemaCache.addClass(class, &shardingState) 117 118 if err := m.repo.NewClass(ctx, payload); err != nil { 119 return err 120 } 121 m.logger. 122 WithField("action", "schema_restore_class"). 123 Debugf("restore class %q from schema", class.Class) 124 m.triggerSchemaUpdateCallbacks() 125 126 out := m.migrator.AddClass(ctx, class, &shardingState) 127 return out 128 } 129 130 func (m *Manager) addClass(ctx context.Context, class *models.Class, 131 ) (*sharding.State, error) { 132 m.Lock() 133 defer m.Unlock() 134 135 class.Class = schema.UppercaseClassName(class.Class) 136 class.Properties = schema.LowercaseAllPropertyNames(class.Properties) 137 if class.ShardingConfig != nil && schema.MultiTenancyEnabled(class) { 138 return nil, fmt.Errorf("cannot have both shardingConfig and multiTenancyConfig") 139 } else if class.MultiTenancyConfig == nil { 140 class.MultiTenancyConfig = &models.MultiTenancyConfig{} 141 } else if class.MultiTenancyConfig.Enabled { 142 class.ShardingConfig = sharding.Config{DesiredCount: 0} // tenant shards will be created dynamically 143 } 144 145 m.setClassDefaults(class) 146 err := m.validateCanAddClass(ctx, class, false) 147 if err != nil { 148 return nil, err 149 } 150 // migrate only after validation in completed 151 m.migrateClassSettings(class) 152 153 err = m.parseShardingConfig(ctx, class) 154 if err != nil { 155 return nil, err 156 } 157 158 err = m.parseVectorIndexConfig(ctx, class) 159 if err != nil { 160 return nil, err 161 } 162 163 err = m.invertedConfigValidator(class.InvertedIndexConfig) 164 if err != nil { 165 return nil, err 166 } 167 168 shardState, err := sharding.InitState(class.Class, 169 class.ShardingConfig.(sharding.Config), 170 m.clusterState, class.ReplicationConfig.Factor, 171 schema.MultiTenancyEnabled(class)) 172 if err != nil { 173 return nil, errors.Wrap(err, "init sharding state") 174 } 175 176 tx, err := m.cluster.BeginTransaction(ctx, AddClass, 177 AddClassPayload{class, shardState}, DefaultTxTTL) 178 if err != nil { 179 // possible causes for errors could be nodes down (we expect every node to 180 // the up for a schema transaction) or concurrent transactions from other 181 // nodes 182 return nil, errors.Wrap(err, "open cluster-wide transaction") 183 } 184 185 if err := m.cluster.CommitWriteTransaction(ctx, tx); err != nil { 186 // Only log the commit error, but do not abort the changes locally. Once 187 // we've told others to commit, we also need to commit ourselves! 188 // 189 // The idea is that if we abort our changes we are guaranteed to create an 190 // inconsistency as soon as any other node honored the commit. This would 191 // for example be the case in a 3-node cluster where node 1 is the 192 // coordinator, node 2 honored the commit and node 3 died during the commit 193 // phase. 194 // 195 // In this scenario it is far more desirable to make sure that node 1 and 196 // node 2 stay in sync, as node 3 - who may or may not have missed the 197 // update - can use a local WAL from the first TX phase to replay any 198 // missing changes once it's back. 199 m.logger.WithError(err).Errorf("not every node was able to commit") 200 } 201 202 if err := m.addClassApplyChanges(ctx, class, shardState); err != nil { 203 return nil, err 204 } 205 return shardState, nil 206 } 207 208 func (m *Manager) addClassApplyChanges(ctx context.Context, class *models.Class, 209 shardingState *sharding.State, 210 ) error { 211 payload, err := CreateClassPayload(class, shardingState) 212 if err != nil { 213 return err 214 } 215 if err := m.repo.NewClass(ctx, payload); err != nil { 216 return err 217 } 218 219 m.logger. 220 WithField("action", "schema_add_class"). 221 Debugf("add class %q from schema", class.Class) 222 223 m.schemaCache.addClass(class, shardingState) 224 225 m.triggerSchemaUpdateCallbacks() 226 return nil 227 } 228 229 func (m *Manager) setClassDefaults(class *models.Class) { 230 // set only when no target vectors configured 231 if !hasTargetVectors(class) { 232 if class.Vectorizer == "" { 233 class.Vectorizer = m.config.DefaultVectorizerModule 234 } 235 236 if class.VectorIndexType == "" { 237 class.VectorIndexType = "hnsw" 238 } 239 240 if m.config.DefaultVectorDistanceMetric != "" { 241 if class.VectorIndexConfig == nil { 242 class.VectorIndexConfig = map[string]interface{}{"distance": m.config.DefaultVectorDistanceMetric} 243 } else if class.VectorIndexConfig.(map[string]interface{})["distance"] == nil { 244 class.VectorIndexConfig.(map[string]interface{})["distance"] = m.config.DefaultVectorDistanceMetric 245 } 246 } 247 } 248 249 setInvertedConfigDefaults(class) 250 for _, prop := range class.Properties { 251 setPropertyDefaults(prop) 252 } 253 254 m.moduleConfig.SetClassDefaults(class) 255 } 256 257 func setPropertyDefaults(prop *models.Property) { 258 setPropertyDefaultTokenization(prop) 259 setPropertyDefaultIndexing(prop) 260 setNestedPropertiesDefaults(prop.NestedProperties) 261 } 262 263 func setPropertyDefaultTokenization(prop *models.Property) { 264 switch dataType, _ := schema.AsPrimitive(prop.DataType); dataType { 265 case schema.DataTypeString, schema.DataTypeStringArray: 266 // deprecated as of v1.19, default tokenization was word 267 // which will be migrated to text+whitespace 268 if prop.Tokenization == "" { 269 prop.Tokenization = models.PropertyTokenizationWord 270 } 271 case schema.DataTypeText, schema.DataTypeTextArray: 272 if prop.Tokenization == "" { 273 if os.Getenv("DEFAULT_TOKENIZATION") != "" { 274 prop.Tokenization = os.Getenv("DEFAULT_TOKENIZATION") 275 } else { 276 prop.Tokenization = models.PropertyTokenizationWord 277 } 278 } 279 default: 280 // tokenization not supported for other data types 281 } 282 } 283 284 func setPropertyDefaultIndexing(prop *models.Property) { 285 // if IndexInverted is set but IndexFilterable and IndexSearchable are not 286 // migrate IndexInverted later. 287 if prop.IndexInverted != nil && 288 prop.IndexFilterable == nil && 289 prop.IndexSearchable == nil { 290 return 291 } 292 293 vTrue := true 294 vFalse := false 295 296 if prop.IndexFilterable == nil { 297 prop.IndexFilterable = &vTrue 298 299 primitiveDataType, isPrimitive := schema.AsPrimitive(prop.DataType) 300 if isPrimitive && primitiveDataType == schema.DataTypeBlob { 301 prop.IndexFilterable = &vFalse 302 } 303 } 304 305 if prop.IndexSearchable == nil { 306 prop.IndexSearchable = &vFalse 307 308 if dataType, isPrimitive := schema.AsPrimitive(prop.DataType); isPrimitive { 309 switch dataType { 310 case schema.DataTypeString, schema.DataTypeStringArray: 311 // string/string[] are migrated to text/text[] later, 312 // at this point they are still valid data types, therefore should be handled here 313 prop.IndexSearchable = &vTrue 314 case schema.DataTypeText, schema.DataTypeTextArray: 315 prop.IndexSearchable = &vTrue 316 default: 317 // do nothing 318 } 319 } 320 } 321 } 322 323 func setNestedPropertiesDefaults(properties []*models.NestedProperty) { 324 for _, property := range properties { 325 primitiveDataType, isPrimitive := schema.AsPrimitive(property.DataType) 326 nestedDataType, isNested := schema.AsNested(property.DataType) 327 328 setNestedPropertyDefaultTokenization(property, primitiveDataType, nestedDataType, isPrimitive, isNested) 329 setNestedPropertyDefaultIndexing(property, primitiveDataType, nestedDataType, isPrimitive, isNested) 330 331 if isNested { 332 setNestedPropertiesDefaults(property.NestedProperties) 333 } 334 } 335 } 336 337 func setNestedPropertyDefaultTokenization(property *models.NestedProperty, 338 primitiveDataType, nestedDataType schema.DataType, 339 isPrimitive, isNested bool, 340 ) { 341 if property.Tokenization == "" && isPrimitive { 342 switch primitiveDataType { 343 case schema.DataTypeText, schema.DataTypeTextArray: 344 property.Tokenization = models.NestedPropertyTokenizationWord 345 default: 346 // do nothing 347 } 348 } 349 } 350 351 func setNestedPropertyDefaultIndexing(property *models.NestedProperty, 352 primitiveDataType, nestedDataType schema.DataType, 353 isPrimitive, isNested bool, 354 ) { 355 vTrue := true 356 vFalse := false 357 358 if property.IndexFilterable == nil { 359 property.IndexFilterable = &vTrue 360 361 if isPrimitive && primitiveDataType == schema.DataTypeBlob { 362 property.IndexFilterable = &vFalse 363 } 364 } 365 366 if property.IndexSearchable == nil { 367 property.IndexSearchable = &vFalse 368 369 if isPrimitive { 370 switch primitiveDataType { 371 case schema.DataTypeText, schema.DataTypeTextArray: 372 property.IndexSearchable = &vTrue 373 default: 374 // do nothing 375 } 376 } 377 } 378 } 379 380 func (m *Manager) migrateClassSettings(class *models.Class) { 381 for _, prop := range class.Properties { 382 migratePropertySettings(prop) 383 } 384 } 385 386 func migratePropertySettings(prop *models.Property) { 387 migratePropertyDataTypeAndTokenization(prop) 388 migratePropertyIndexInverted(prop) 389 } 390 391 // as of v1.19 DataTypeString and DataTypeStringArray are deprecated 392 // here both are changed to Text/TextArray 393 // and proper, backward compatible tokenization 394 func migratePropertyDataTypeAndTokenization(prop *models.Property) { 395 switch dataType, _ := schema.AsPrimitive(prop.DataType); dataType { 396 case schema.DataTypeString: 397 prop.DataType = schema.DataTypeText.PropString() 398 case schema.DataTypeStringArray: 399 prop.DataType = schema.DataTypeTextArray.PropString() 400 default: 401 // other types need no migration and do not support tokenization 402 return 403 } 404 405 switch prop.Tokenization { 406 case models.PropertyTokenizationWord: 407 prop.Tokenization = models.PropertyTokenizationWhitespace 408 case models.PropertyTokenizationField: 409 // stays field 410 } 411 } 412 413 // as of v1.19 IndexInverted is deprecated and replaced with 414 // IndexFilterable (set inverted index) 415 // and IndexSearchable (map inverted index with term frequencies; 416 // therefore applicable only to text/text[] data types) 417 func migratePropertyIndexInverted(prop *models.Property) { 418 // if none of new options is set, use inverted settings 419 if prop.IndexInverted != nil && 420 prop.IndexFilterable == nil && 421 prop.IndexSearchable == nil { 422 prop.IndexFilterable = prop.IndexInverted 423 switch dataType, _ := schema.AsPrimitive(prop.DataType); dataType { 424 // string/string[] are already migrated into text/text[], can be skipped here 425 case schema.DataTypeText, schema.DataTypeTextArray: 426 prop.IndexSearchable = prop.IndexInverted 427 default: 428 vFalse := false 429 prop.IndexSearchable = &vFalse 430 } 431 } 432 // new options have precedence so inverted can be reset 433 prop.IndexInverted = nil 434 } 435 436 func (m *Manager) validateCanAddClass( 437 ctx context.Context, class *models.Class, 438 relaxCrossRefValidation bool, 439 ) error { 440 if err := m.validateClassNameUniqueness(class.Class); err != nil { 441 return err 442 } 443 444 if err := m.validateClassName(ctx, class.Class); err != nil { 445 return err 446 } 447 448 existingPropertyNames := map[string]bool{} 449 for _, property := range class.Properties { 450 if err := m.validateProperty(property, class, existingPropertyNames, relaxCrossRefValidation); err != nil { 451 return err 452 } 453 existingPropertyNames[strings.ToLower(property.Name)] = true 454 } 455 456 if err := m.validateVectorSettings(class); err != nil { 457 return err 458 } 459 460 if err := m.moduleConfig.ValidateClass(ctx, class); err != nil { 461 return err 462 } 463 464 if err := replica.ValidateConfig(class, m.config.Replication); err != nil { 465 return err 466 } 467 468 // all is fine! 469 return nil 470 } 471 472 func (m *Manager) validateProperty( 473 property *models.Property, class *models.Class, 474 existingPropertyNames map[string]bool, relaxCrossRefValidation bool, 475 ) error { 476 if _, err := schema.ValidatePropertyName(property.Name); err != nil { 477 return err 478 } 479 480 if err := schema.ValidateReservedPropertyName(property.Name); err != nil { 481 return err 482 } 483 484 if existingPropertyNames[strings.ToLower(property.Name)] { 485 return fmt.Errorf("class %q: conflict for property %q: already in use or provided multiple times", 486 class.Class, property.Name) 487 } 488 489 // Validate data type of property. 490 sch := m.getSchema() 491 492 propertyDataType, err := (&sch).FindPropertyDataTypeWithRefs(property.DataType, 493 relaxCrossRefValidation, schema.ClassName(class.Class)) 494 if err != nil { 495 return fmt.Errorf("property '%s': invalid dataType: %v", property.Name, err) 496 } 497 498 if propertyDataType.IsNested() { 499 if err := validateNestedProperties(property.NestedProperties, property.Name); err != nil { 500 return err 501 } 502 } else { 503 if len(property.NestedProperties) > 0 { 504 return fmt.Errorf("property '%s': nestedProperties not allowed for data types other than object/object[]", 505 property.Name) 506 } 507 } 508 509 if err := m.validatePropertyTokenization(property.Tokenization, propertyDataType); err != nil { 510 return err 511 } 512 513 if err := m.validatePropertyIndexing(property); err != nil { 514 return err 515 } 516 517 if err := m.validatePropModuleConfig(class, property); err != nil { 518 return err 519 } 520 521 // all is fine! 522 return nil 523 } 524 525 func (m *Manager) parseVectorIndexConfig(ctx context.Context, 526 class *models.Class, 527 ) error { 528 if !hasTargetVectors(class) { 529 parsed, err := m.parseGivenVectorIndexConfig(class.VectorIndexType, class.VectorIndexConfig) 530 if err != nil { 531 return err 532 } 533 class.VectorIndexConfig = parsed 534 return nil 535 } 536 537 if class.VectorIndexConfig != nil { 538 return fmt.Errorf("class.vectorIndexConfig can not be set if class.vectorConfig is configured") 539 } 540 541 if err := m.parseTargetVectorsVectorIndexConfig(class); err != nil { 542 return err 543 } 544 return nil 545 } 546 547 func (m *Manager) parseTargetVectorsVectorIndexConfig(class *models.Class) error { 548 for targetVector, vectorConfig := range class.VectorConfig { 549 parsed, err := m.parseGivenVectorIndexConfig(vectorConfig.VectorIndexType, vectorConfig.VectorIndexConfig) 550 if err != nil { 551 return fmt.Errorf("parse vector config for %s: %w", targetVector, err) 552 } 553 vectorConfig.VectorIndexConfig = parsed 554 class.VectorConfig[targetVector] = vectorConfig 555 } 556 return nil 557 } 558 559 func (m *Manager) parseGivenVectorIndexConfig(vectorIndexType string, 560 vectorIndexConfig interface{}, 561 ) (schema.VectorIndexConfig, error) { 562 if vectorIndexType != "hnsw" && vectorIndexType != "flat" { 563 return nil, errors.Errorf( 564 "parse vector index config: unsupported vector index type: %q", 565 vectorIndexType) 566 } 567 568 parsed, err := m.configParser(vectorIndexConfig, vectorIndexType) 569 if err != nil { 570 return nil, errors.Wrap(err, "parse vector index config") 571 } 572 return parsed, nil 573 } 574 575 func (m *Manager) parseShardingConfig(ctx context.Context, class *models.Class) (err error) { 576 // multiTenancyConfig and shardingConfig are mutually exclusive 577 cfg := sharding.Config{} // cfg is empty in case of MT 578 if !schema.MultiTenancyEnabled(class) { 579 cfg, err = sharding.ParseConfig(class.ShardingConfig, 580 m.clusterState.NodeCount()) 581 if err != nil { 582 return fmt.Errorf("parse sharding config: %w", err) 583 } 584 585 } 586 class.ShardingConfig = cfg 587 return nil 588 } 589 590 func setInvertedConfigDefaults(class *models.Class) { 591 if class.InvertedIndexConfig == nil { 592 class.InvertedIndexConfig = &models.InvertedIndexConfig{} 593 } 594 595 if class.InvertedIndexConfig.CleanupIntervalSeconds == 0 { 596 class.InvertedIndexConfig.CleanupIntervalSeconds = config.DefaultCleanupIntervalSeconds 597 } 598 599 if class.InvertedIndexConfig.Bm25 == nil { 600 class.InvertedIndexConfig.Bm25 = &models.BM25Config{ 601 K1: config.DefaultBM25k1, 602 B: config.DefaultBM25b, 603 } 604 } 605 606 if class.InvertedIndexConfig.Stopwords == nil { 607 class.InvertedIndexConfig.Stopwords = &models.StopwordConfig{ 608 Preset: stopwords.EnglishPreset, 609 } 610 } 611 } 612 613 func CreateClassPayload(class *models.Class, 614 shardingState *sharding.State, 615 ) (pl ClassPayload, err error) { 616 pl.Name = class.Class 617 if pl.Metadata, err = json.Marshal(class); err != nil { 618 return pl, fmt.Errorf("marshal class %q metadata: %w", pl.Name, err) 619 } 620 if shardingState != nil { 621 ss := *shardingState 622 pl.Shards = make([]KeyValuePair, len(ss.Physical)) 623 i := 0 624 for name, shard := range ss.Physical { 625 data, err := json.Marshal(shard) 626 if err != nil { 627 return pl, fmt.Errorf("marshal shard %q metadata: %w", name, err) 628 } 629 pl.Shards[i] = KeyValuePair{Key: name, Value: data} 630 i++ 631 } 632 ss.Physical = nil 633 if pl.ShardingState, err = json.Marshal(&ss); err != nil { 634 return pl, fmt.Errorf("marshal class %q sharding state: %w", pl.Name, err) 635 } 636 } 637 return pl, nil 638 } 639 640 func hasTargetVectors(class *models.Class) bool { 641 return len(class.VectorConfig) > 0 642 }