github.com/weaviate/weaviate@v1.24.6/usecases/objects/auto_schema.go (about)

     1  //                           _       _
     2  // __      _____  __ ___   ___  __ _| |_ ___
     3  // \ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \
     4  //  \ V  V /  __/ (_| |\ V /| | (_| | ||  __/
     5  //   \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___|
     6  //
     7  //  Copyright © 2016 - 2024 Weaviate B.V. All rights reserved.
     8  //
     9  //  CONTACT: hello@weaviate.io
    10  //
    11  
    12  package objects
    13  
    14  import (
    15  	"context"
    16  	"encoding/json"
    17  	"fmt"
    18  	"strings"
    19  	"sync"
    20  	"time"
    21  
    22  	"github.com/google/uuid"
    23  	"github.com/sirupsen/logrus"
    24  	"github.com/weaviate/weaviate/entities/additional"
    25  	"github.com/weaviate/weaviate/entities/models"
    26  	"github.com/weaviate/weaviate/entities/schema"
    27  	"github.com/weaviate/weaviate/entities/schema/crossref"
    28  	"github.com/weaviate/weaviate/entities/search"
    29  	"github.com/weaviate/weaviate/usecases/config"
    30  	"github.com/weaviate/weaviate/usecases/objects/validation"
    31  )
    32  
    33  type autoSchemaManager struct {
    34  	mutex         sync.RWMutex
    35  	schemaManager schemaManager
    36  	vectorRepo    VectorRepo
    37  	config        config.AutoSchema
    38  	logger        logrus.FieldLogger
    39  }
    40  
    41  func newAutoSchemaManager(schemaManager schemaManager, vectorRepo VectorRepo,
    42  	config *config.WeaviateConfig, logger logrus.FieldLogger,
    43  ) *autoSchemaManager {
    44  	return &autoSchemaManager{
    45  		schemaManager: schemaManager,
    46  		vectorRepo:    vectorRepo,
    47  		config:        config.Config.AutoSchema,
    48  		logger:        logger,
    49  	}
    50  }
    51  
    52  func (m *autoSchemaManager) autoSchema(ctx context.Context, principal *models.Principal,
    53  	object *models.Object, allowCreateClass bool,
    54  ) error {
    55  	if m.config.Enabled {
    56  		return m.performAutoSchema(ctx, principal, object, allowCreateClass)
    57  	}
    58  	return nil
    59  }
    60  
    61  func (m *autoSchemaManager) performAutoSchema(ctx context.Context, principal *models.Principal,
    62  	object *models.Object, allowCreateClass bool,
    63  ) error {
    64  	m.mutex.Lock()
    65  	defer m.mutex.Unlock()
    66  	if object == nil {
    67  		return fmt.Errorf(validation.ErrorMissingObject)
    68  	}
    69  
    70  	if len(object.Class) == 0 {
    71  		// stop performing auto schema
    72  		return fmt.Errorf(validation.ErrorMissingClass)
    73  	}
    74  
    75  	object.Class = schema.UppercaseClassName(object.Class)
    76  
    77  	schemaClass, err := m.getClass(principal, object)
    78  	if err != nil {
    79  		return err
    80  	}
    81  	if schemaClass == nil && !allowCreateClass {
    82  		return fmt.Errorf("given class does not exist")
    83  	}
    84  	properties, err := m.getProperties(object)
    85  	if err != nil {
    86  		return err
    87  	}
    88  	if schemaClass == nil {
    89  		return m.createClass(ctx, principal, object.Class, properties)
    90  	}
    91  	return m.updateClass(ctx, principal, object.Class, properties, schemaClass.Properties)
    92  }
    93  
    94  func (m *autoSchemaManager) getClass(principal *models.Principal,
    95  	object *models.Object,
    96  ) (*models.Class, error) {
    97  	s, err := m.schemaManager.GetSchema(principal)
    98  	if err != nil {
    99  		return nil, err
   100  	}
   101  	schemaClass := s.GetClass(schema.ClassName(object.Class))
   102  	return schemaClass, nil
   103  }
   104  
   105  func (m *autoSchemaManager) createClass(ctx context.Context, principal *models.Principal,
   106  	className string, properties []*models.Property,
   107  ) error {
   108  	now := time.Now()
   109  	class := &models.Class{
   110  		Class:       className,
   111  		Properties:  properties,
   112  		Description: "This property was generated by Weaviate's auto-schema feature on " + now.Format(time.ANSIC),
   113  	}
   114  	m.logger.
   115  		WithField("auto_schema", "createClass").
   116  		Debugf("create class %s", className)
   117  	return m.schemaManager.AddClass(ctx, principal, class)
   118  }
   119  
   120  func (m *autoSchemaManager) updateClass(ctx context.Context, principal *models.Principal,
   121  	className string, properties []*models.Property, existingProperties []*models.Property,
   122  ) error {
   123  	existingPropertiesIndexMap := map[string]int{}
   124  	for index := range existingProperties {
   125  		existingPropertiesIndexMap[existingProperties[index].Name] = index
   126  	}
   127  
   128  	propertiesToAdd := []*models.Property{}
   129  	propertiesToUpdate := []*models.Property{}
   130  	for _, prop := range properties {
   131  		index, exists := existingPropertiesIndexMap[schema.LowercaseFirstLetter(prop.Name)]
   132  		if !exists {
   133  			propertiesToAdd = append(propertiesToAdd, prop)
   134  		} else if _, isNested := schema.AsNested(existingProperties[index].DataType); isNested {
   135  			mergedNestedProperties, merged := schema.MergeRecursivelyNestedProperties(existingProperties[index].NestedProperties,
   136  				prop.NestedProperties)
   137  			if merged {
   138  				prop.NestedProperties = mergedNestedProperties
   139  				propertiesToUpdate = append(propertiesToUpdate, prop)
   140  			}
   141  		}
   142  	}
   143  	for _, newProp := range propertiesToAdd {
   144  		m.logger.
   145  			WithField("auto_schema", "updateClass").
   146  			Debugf("update class %s add property %s", className, newProp.Name)
   147  		err := m.schemaManager.AddClassProperty(ctx, principal, className, newProp)
   148  		if err != nil {
   149  			return err
   150  		}
   151  	}
   152  	for _, updatedProp := range propertiesToUpdate {
   153  		m.logger.
   154  			WithField("auto_schema", "updateClass").
   155  			Debugf("update class %s merge object property %s", className, updatedProp.Name)
   156  		err := m.schemaManager.MergeClassObjectProperty(ctx, principal, className, updatedProp)
   157  		if err != nil {
   158  			return err
   159  		}
   160  	}
   161  	return nil
   162  }
   163  
   164  func (m *autoSchemaManager) getProperties(object *models.Object) ([]*models.Property, error) {
   165  	properties := []*models.Property{}
   166  	if props, ok := object.Properties.(map[string]interface{}); ok {
   167  		for name, value := range props {
   168  			now := time.Now()
   169  			dt, err := m.determineType(value, false)
   170  			if err != nil {
   171  				return nil, fmt.Errorf("property '%s' on class '%s': %w", name, object.Class, err)
   172  			}
   173  
   174  			var nestedProperties []*models.NestedProperty
   175  			if len(dt) == 1 {
   176  				switch dt[0] {
   177  				case schema.DataTypeObject:
   178  					nestedProperties, err = m.determineNestedProperties(value.(map[string]interface{}), now)
   179  				case schema.DataTypeObjectArray:
   180  					nestedProperties, err = m.determineNestedPropertiesOfArray(value.([]interface{}), now)
   181  				default:
   182  					// do nothing
   183  				}
   184  			}
   185  			if err != nil {
   186  				return nil, fmt.Errorf("property '%s' on class '%s': %w", name, object.Class, err)
   187  			}
   188  
   189  			property := &models.Property{
   190  				Name:             name,
   191  				DataType:         m.getDataTypes(dt),
   192  				Description:      "This property was generated by Weaviate's auto-schema feature on " + now.Format(time.ANSIC),
   193  				NestedProperties: nestedProperties,
   194  			}
   195  			properties = append(properties, property)
   196  		}
   197  	}
   198  	return properties, nil
   199  }
   200  
   201  func (m *autoSchemaManager) getDataTypes(dataTypes []schema.DataType) []string {
   202  	dtypes := make([]string, len(dataTypes))
   203  	for i := range dataTypes {
   204  		dtypes[i] = string(dataTypes[i])
   205  	}
   206  	return dtypes
   207  }
   208  
   209  func (m *autoSchemaManager) determineType(value interface{}, ofNestedProp bool) ([]schema.DataType, error) {
   210  	fallbackDataType := []schema.DataType{schema.DataTypeText}
   211  	fallbackArrayDataType := []schema.DataType{schema.DataTypeTextArray}
   212  
   213  	switch typedValue := value.(type) {
   214  	case string:
   215  		if _, err := time.Parse(time.RFC3339, typedValue); err == nil {
   216  			return []schema.DataType{schema.DataType(m.config.DefaultDate)}, nil
   217  		}
   218  		if _, err := uuid.Parse(typedValue); err == nil {
   219  			return []schema.DataType{schema.DataTypeUUID}, nil
   220  		}
   221  		if m.config.DefaultString != "" {
   222  			return []schema.DataType{schema.DataType(m.config.DefaultString)}, nil
   223  		}
   224  		return []schema.DataType{schema.DataTypeText}, nil
   225  	case json.Number:
   226  		return []schema.DataType{schema.DataType(m.config.DefaultNumber)}, nil
   227  	case float64:
   228  		return []schema.DataType{schema.DataTypeNumber}, nil
   229  	case int64:
   230  		return []schema.DataType{schema.DataTypeInt}, nil
   231  	case bool:
   232  		return []schema.DataType{schema.DataTypeBoolean}, nil
   233  	case map[string]interface{}:
   234  		// nested properties does not support phone and geo data types
   235  		if !ofNestedProp {
   236  			if dt, ok := m.asGeoCoordinatesType(typedValue); ok {
   237  				return dt, nil
   238  			}
   239  			if dt, ok := m.asPhoneNumber(typedValue); ok {
   240  				return dt, nil
   241  			}
   242  		}
   243  		return []schema.DataType{schema.DataTypeObject}, nil
   244  	case []interface{}:
   245  		if len(typedValue) == 0 {
   246  			return fallbackArrayDataType, nil
   247  		}
   248  
   249  		refDataTypes := []schema.DataType{}
   250  		var isRef bool
   251  		var determinedDataType schema.DataType
   252  
   253  		for i := range typedValue {
   254  			dataType, refDataType, err := m.determineArrayType(typedValue[i], ofNestedProp)
   255  			if err != nil {
   256  				return nil, fmt.Errorf("element [%d]: %w", i, err)
   257  			}
   258  			if i == 0 {
   259  				isRef = refDataType != ""
   260  				determinedDataType = dataType
   261  			}
   262  			if dataType != "" {
   263  				if isRef {
   264  					return nil, fmt.Errorf("element [%d]: mismatched data type - reference expected, got '%s'",
   265  						i, asSingleDataType(dataType))
   266  				}
   267  				if dataType != determinedDataType {
   268  					return nil, fmt.Errorf("element [%d]: mismatched data type - '%s' expected, got '%s'",
   269  						i, asSingleDataType(determinedDataType), asSingleDataType(dataType))
   270  				}
   271  			} else {
   272  				if !isRef {
   273  					return nil, fmt.Errorf("element [%d]: mismatched data type - '%s' expected, got reference",
   274  						i, asSingleDataType(determinedDataType))
   275  				}
   276  				refDataTypes = append(refDataTypes, refDataType)
   277  			}
   278  		}
   279  		if len(refDataTypes) > 0 {
   280  			return refDataTypes, nil
   281  		}
   282  		return []schema.DataType{determinedDataType}, nil
   283  	case nil:
   284  		return fallbackDataType, nil
   285  	default:
   286  		allowed := []string{
   287  			schema.DataTypeText.String(),
   288  			schema.DataTypeNumber.String(),
   289  			schema.DataTypeInt.String(),
   290  			schema.DataTypeBoolean.String(),
   291  			schema.DataTypeDate.String(),
   292  			schema.DataTypeUUID.String(),
   293  			schema.DataTypeObject.String(),
   294  		}
   295  		if !ofNestedProp {
   296  			allowed = append(allowed, schema.DataTypePhoneNumber.String(), schema.DataTypeGeoCoordinates.String())
   297  		}
   298  		return nil, fmt.Errorf("unrecognized data type of value '%v' - one of '%s' expected",
   299  			typedValue, strings.Join(allowed, "', '"))
   300  	}
   301  }
   302  
   303  func asSingleDataType(arrayDataType schema.DataType) schema.DataType {
   304  	if dt, isArray := schema.IsArrayType(arrayDataType); isArray {
   305  		return dt
   306  	}
   307  	return arrayDataType
   308  }
   309  
   310  func (m *autoSchemaManager) determineArrayType(value interface{}, ofNestedProp bool,
   311  ) (schema.DataType, schema.DataType, error) {
   312  	switch typedValue := value.(type) {
   313  	case string:
   314  		if _, err := time.Parse(time.RFC3339, typedValue); err == nil {
   315  			return schema.DataTypeDateArray, "", nil
   316  		}
   317  		if _, err := uuid.Parse(typedValue); err == nil {
   318  			return schema.DataTypeUUIDArray, "", nil
   319  		}
   320  		if schema.DataType(m.config.DefaultString) == schema.DataTypeString {
   321  			return schema.DataTypeStringArray, "", nil
   322  		}
   323  		return schema.DataTypeTextArray, "", nil
   324  	case json.Number:
   325  		if schema.DataType(m.config.DefaultNumber) == schema.DataTypeInt {
   326  			return schema.DataTypeIntArray, "", nil
   327  		}
   328  		return schema.DataTypeNumberArray, "", nil
   329  	case float64:
   330  		return schema.DataTypeNumberArray, "", nil
   331  	case int64:
   332  		return schema.DataTypeIntArray, "", nil
   333  	case bool:
   334  		return schema.DataTypeBooleanArray, "", nil
   335  	case map[string]interface{}:
   336  		if ofNestedProp {
   337  			return schema.DataTypeObjectArray, "", nil
   338  		}
   339  		if refDataType, ok := m.asRef(typedValue); ok {
   340  			return "", refDataType, nil
   341  		}
   342  		return schema.DataTypeObjectArray, "", nil
   343  	default:
   344  		allowed := []string{
   345  			schema.DataTypeText.String(),
   346  			schema.DataTypeNumber.String(),
   347  			schema.DataTypeInt.String(),
   348  			schema.DataTypeBoolean.String(),
   349  			schema.DataTypeDate.String(),
   350  			schema.DataTypeUUID.String(),
   351  			schema.DataTypeObject.String(),
   352  		}
   353  		if !ofNestedProp {
   354  			allowed = append(allowed, schema.DataTypeCRef.String())
   355  		}
   356  		return "", "", fmt.Errorf("unrecognized data type of value '%v' - one of '%s' expected",
   357  			typedValue, strings.Join(allowed, "', '"))
   358  	}
   359  }
   360  
   361  func (m *autoSchemaManager) asGeoCoordinatesType(val map[string]interface{}) ([]schema.DataType, bool) {
   362  	if len(val) == 2 {
   363  		if val["latitude"] != nil && val["longitude"] != nil {
   364  			return []schema.DataType{schema.DataTypeGeoCoordinates}, true
   365  		}
   366  	}
   367  	return nil, false
   368  }
   369  
   370  func (m *autoSchemaManager) asPhoneNumber(val map[string]interface{}) ([]schema.DataType, bool) {
   371  	if val["input"] != nil {
   372  		if len(val) == 1 {
   373  			return []schema.DataType{schema.DataTypePhoneNumber}, true
   374  		}
   375  		if len(val) == 2 {
   376  			if _, ok := val["defaultCountry"]; ok {
   377  				return []schema.DataType{schema.DataTypePhoneNumber}, true
   378  			}
   379  		}
   380  	}
   381  
   382  	return nil, false
   383  }
   384  
   385  func (m *autoSchemaManager) asRef(val map[string]interface{}) (schema.DataType, bool) {
   386  	if v, ok := val["beacon"]; ok {
   387  		if beacon, ok := v.(string); ok {
   388  			ref, err := crossref.Parse(beacon)
   389  			if err == nil {
   390  				if ref.Class == "" {
   391  					res, err := m.vectorRepo.ObjectByID(context.Background(), ref.TargetID, search.SelectProperties{}, additional.Properties{}, "")
   392  					if err == nil && res != nil {
   393  						return schema.DataType(res.ClassName), true
   394  					}
   395  				} else {
   396  					return schema.DataType(ref.Class), true
   397  				}
   398  			}
   399  		}
   400  	}
   401  	return "", false
   402  }
   403  
   404  func (m *autoSchemaManager) determineNestedProperties(values map[string]interface{}, now time.Time,
   405  ) ([]*models.NestedProperty, error) {
   406  	i := 0
   407  	nestedProperties := make([]*models.NestedProperty, len(values))
   408  	for name, value := range values {
   409  		np, err := m.determineNestedProperty(name, value, now)
   410  		if err != nil {
   411  			return nil, fmt.Errorf("nested property '%s': %w", name, err)
   412  		}
   413  		nestedProperties[i] = np
   414  		i++
   415  	}
   416  	return nestedProperties, nil
   417  }
   418  
   419  func (m *autoSchemaManager) determineNestedProperty(name string, value interface{}, now time.Time,
   420  ) (*models.NestedProperty, error) {
   421  	dt, err := m.determineType(value, true)
   422  	if err != nil {
   423  		return nil, err
   424  	}
   425  
   426  	var np []*models.NestedProperty
   427  	if len(dt) == 1 {
   428  		switch dt[0] {
   429  		case schema.DataTypeObject:
   430  			np, err = m.determineNestedProperties(value.(map[string]interface{}), now)
   431  		case schema.DataTypeObjectArray:
   432  			np, err = m.determineNestedPropertiesOfArray(value.([]interface{}), now)
   433  		default:
   434  			// do nothing
   435  		}
   436  	}
   437  	if err != nil {
   438  		return nil, err
   439  	}
   440  
   441  	return &models.NestedProperty{
   442  		Name:     name,
   443  		DataType: m.getDataTypes(dt),
   444  		Description: "This nested property was generated by Weaviate's auto-schema feature on " +
   445  			now.Format(time.ANSIC),
   446  		NestedProperties: np,
   447  	}, nil
   448  }
   449  
   450  func (m *autoSchemaManager) determineNestedPropertiesOfArray(valArray []interface{}, now time.Time,
   451  ) ([]*models.NestedProperty, error) {
   452  	if len(valArray) == 0 {
   453  		return []*models.NestedProperty{}, nil
   454  	}
   455  	nestedProperties, err := m.determineNestedProperties(valArray[0].(map[string]interface{}), now)
   456  	if err != nil {
   457  		return nil, err
   458  	}
   459  	if len(valArray) == 1 {
   460  		return nestedProperties, nil
   461  	}
   462  
   463  	nestedPropertiesIndexMap := map[string]int{}
   464  	for index := range nestedProperties {
   465  		nestedPropertiesIndexMap[nestedProperties[index].Name] = index
   466  	}
   467  
   468  	for i := 1; i < len(valArray); i++ {
   469  		values := valArray[i].(map[string]interface{})
   470  		for name, value := range values {
   471  			index, ok := nestedPropertiesIndexMap[name]
   472  			if !ok {
   473  				np, err := m.determineNestedProperty(name, value, now)
   474  				if err != nil {
   475  					return nil, err
   476  				}
   477  				nestedPropertiesIndexMap[name] = len(nestedProperties)
   478  				nestedProperties = append(nestedProperties, np)
   479  			} else if _, isNested := schema.AsNested(nestedProperties[index].DataType); isNested {
   480  				np, err := m.determineNestedProperty(name, value, now)
   481  				if err != nil {
   482  					return nil, err
   483  				}
   484  				if mergedNestedProperties, merged := schema.MergeRecursivelyNestedProperties(
   485  					nestedProperties[index].NestedProperties, np.NestedProperties,
   486  				); merged {
   487  					nestedProperties[index].NestedProperties = mergedNestedProperties
   488  				}
   489  			}
   490  		}
   491  	}
   492  
   493  	return nestedProperties, nil
   494  }