github.com/weaviate/weaviate@v1.24.6/adapters/repos/db/file_structure_migration_test.go (about)

     1  //                           _       _
     2  // __      _____  __ ___   ___  __ _| |_ ___
     3  // \ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \
     4  //  \ V  V /  __/ (_| |\ V /| | (_| | ||  __/
     5  //   \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___|
     6  //
     7  //  Copyright © 2016 - 2024 Weaviate B.V. All rights reserved.
     8  //
     9  //  CONTACT: hello@weaviate.io
    10  //
    11  
    12  package db
    13  
    14  import (
    15  	"fmt"
    16  	"math/rand"
    17  	"os"
    18  	"path"
    19  	"strings"
    20  	"testing"
    21  
    22  	"github.com/sirupsen/logrus/hooks/test"
    23  	"github.com/stretchr/testify/assert"
    24  	"github.com/stretchr/testify/require"
    25  	"github.com/weaviate/weaviate/adapters/repos/db/helpers"
    26  	"github.com/weaviate/weaviate/entities/models"
    27  	"github.com/weaviate/weaviate/entities/schema"
    28  	"github.com/weaviate/weaviate/usecases/sharding"
    29  )
    30  
    31  const (
    32  	numClasses = 100
    33  	numShards  = 10
    34  	uppercase  = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    35  	lowercase  = "abcdefghijklmnopqrstuvwxyz"
    36  	digits     = "0123456789"
    37  	chars      = uppercase + lowercase + digits
    38  	localNode  = "node1"
    39  )
    40  
    41  var (
    42  	rootFiles = []string{
    43  		"classifications.db",
    44  		"modules.db",
    45  		"schema.db",
    46  	}
    47  	indexDirExts = []string{
    48  		".hnsw.commitlog.d",
    49  		"_someGeoProp.hnsw.commitlog.d",
    50  		"_lsm",
    51  	}
    52  	indexFileExts = []string{
    53  		".indexcount",
    54  		".proplengths",
    55  		".version",
    56  	}
    57  	migratedRootFiles = append(rootFiles,
    58  		"migration1.22.fs.hierarchy")
    59  )
    60  
    61  func TestFileStructureMigration(t *testing.T) {
    62  	shardsByClass := make(map[string][]string, numClasses)
    63  
    64  	t.Run("generate index and shard names", func(t *testing.T) {
    65  		for i := 0; i < numClasses; i++ {
    66  			c := randClassName()
    67  			shardsByClass[c] = make([]string, numShards)
    68  			for j := 0; j < numShards; j++ {
    69  				s := randShardName()
    70  				shardsByClass[c][j] = s
    71  			}
    72  		}
    73  	})
    74  
    75  	root := t.TempDir()
    76  
    77  	t.Run("write test db files", func(t *testing.T) {
    78  		for _, f := range rootFiles {
    79  			require.Nil(t, os.WriteFile(path.Join(root, f), nil, os.ModePerm))
    80  		}
    81  
    82  		for class, shards := range shardsByClass {
    83  			for _, shard := range shards {
    84  				idx := path.Join(root, fmt.Sprintf("%s_%s", strings.ToLower(class), shard))
    85  				for _, ext := range indexDirExts {
    86  					require.Nil(t, os.MkdirAll(idx+ext, os.ModePerm))
    87  				}
    88  				for _, ext := range indexFileExts {
    89  					require.Nil(t, os.WriteFile(idx+ext, nil, os.ModePerm))
    90  				}
    91  
    92  				pqDir := path.Join(root, class, shard, "compressed_objects")
    93  				require.Nil(t, os.MkdirAll(pqDir, os.ModePerm))
    94  			}
    95  		}
    96  	})
    97  
    98  	files, err := os.ReadDir(root)
    99  	require.Nil(t, err)
   100  
   101  	t.Run("assert expected flat contents length", func(t *testing.T) {
   102  		// Flat structure root contains:
   103  		//  - (3 dirs + 3 files) per shard per index
   104  		//    - dirs: main commilog, geo prop commitlog, lsm store
   105  		//    - files: indexcount, proplengths, version
   106  		//  - 1 dir per index; shards dirs are nested
   107  		//    - pq store
   108  		//  - 3 root db files
   109  		expectedLen := numClasses*(numShards*(len(indexDirExts)+len(indexFileExts))+1) + len(rootFiles)
   110  		require.Len(t, files, expectedLen)
   111  	})
   112  
   113  	t.Run("migrate the db", func(t *testing.T) {
   114  		classes := make([]*models.Class, numClasses)
   115  		states := make(map[string]*sharding.State, numClasses)
   116  
   117  		i := 0
   118  		for class, shards := range shardsByClass {
   119  			classes[i] = &models.Class{
   120  				Class: class,
   121  				Properties: []*models.Property{{
   122  					Name:     "someGeoProp",
   123  					DataType: schema.DataTypeGeoCoordinates.PropString(),
   124  				}},
   125  			}
   126  			states[class] = &sharding.State{
   127  				Physical: make(map[string]sharding.Physical),
   128  			}
   129  			states[class].SetLocalName(localNode)
   130  
   131  			for _, shard := range shards {
   132  				states[class].Physical[shard] = sharding.Physical{
   133  					Name:           shard,
   134  					BelongsToNodes: []string{localNode},
   135  				}
   136  			}
   137  
   138  			i++
   139  		}
   140  
   141  		db := testDB(root, classes, states)
   142  		require.Nil(t, db.migrateFileStructureIfNecessary())
   143  	})
   144  
   145  	files, err = os.ReadDir(root)
   146  	require.Nil(t, err)
   147  
   148  	t.Run("assert expected hierarchical contents length", func(t *testing.T) {
   149  		// After migration, the hierarchical structure root contains:
   150  		//  - one dir per index
   151  		//  - 3 original root db files, and one additional which is the FS migration indicator
   152  		expectedLen := numClasses + len(migratedRootFiles)
   153  		require.Len(t, files, expectedLen)
   154  	})
   155  
   156  	t.Run("assert all db files were migrated", func(t *testing.T) {
   157  		var foundRootFiles []string
   158  		for _, f := range files {
   159  			if f.IsDir() {
   160  				idx := f
   161  				shardsRoot, err := os.ReadDir(path.Join(root, idx.Name()))
   162  				require.Nil(t, err)
   163  				for _, shard := range shardsRoot {
   164  					assertShardRootContents(t, shardsByClass, root, idx, shard)
   165  				}
   166  			} else {
   167  				foundRootFiles = append(foundRootFiles, f.Name())
   168  			}
   169  		}
   170  
   171  		assert.ElementsMatch(t, migratedRootFiles, foundRootFiles)
   172  	})
   173  }
   174  
   175  func assertShardRootContents(t *testing.T, shardsByClass map[string][]string, root string, idx, shard os.DirEntry) {
   176  	assert.True(t, shard.IsDir())
   177  
   178  	// Whatever we find in this shard directory, it should be able to
   179  	// be mapped back to the original flat structure root contents
   180  	lowercasedClasses := make(map[string]string, len(shardsByClass))
   181  	for class := range shardsByClass {
   182  		lowercasedClasses[strings.ToLower(class)] = class
   183  	}
   184  	require.Contains(t, lowercasedClasses, idx.Name())
   185  	assert.Contains(t, shardsByClass[lowercasedClasses[idx.Name()]], shard.Name())
   186  
   187  	// Now we will get a set of all expected files within the shard dir.
   188  	// Check to see if all of these files are found.
   189  	expected := expectedShardContents()
   190  	shardFiles, err := os.ReadDir(path.Join(root, idx.Name(), shard.Name()))
   191  	require.Nil(t, err)
   192  	for _, sf := range shardFiles {
   193  		expected[sf.Name()] = true
   194  	}
   195  	expected.assert(t)
   196  
   197  	// Check if pq store was migrated to main store as "vectors_compressed" subdir
   198  	pqDir := path.Join(root, idx.Name(), shard.Name(), "lsm", helpers.VectorsCompressedBucketLSM)
   199  	info, err := os.Stat(pqDir)
   200  	require.NoError(t, err)
   201  	assert.True(t, info.IsDir())
   202  }
   203  
   204  func testDB(root string, classes []*models.Class, states map[string]*sharding.State) *DB {
   205  	logger, _ := test.NewNullLogger()
   206  	return &DB{
   207  		config: Config{RootPath: root},
   208  		logger: logger,
   209  		schemaGetter: &fakeMigrationSchemaGetter{
   210  			sch:    schema.Schema{Objects: &models.Schema{Classes: classes}},
   211  			states: states,
   212  		},
   213  	}
   214  }
   215  
   216  func randClassName() string {
   217  	return randStringBytes(16)
   218  }
   219  
   220  func randShardName() string {
   221  	return randStringBytes(8)
   222  }
   223  
   224  func randStringBytes(n int) string {
   225  	b := make([]byte, n)
   226  	for i := range b {
   227  		switch {
   228  		case i == 0:
   229  			b[i] = randChar(uppercase)
   230  		case i == n/2:
   231  			b[i] = []byte("_")[0]
   232  		default:
   233  			b[i] = randChar(chars)
   234  		}
   235  	}
   236  	return string(b)
   237  }
   238  
   239  func randChar(str string) byte {
   240  	return str[rand.Intn(len(str))]
   241  }
   242  
   243  type shardContents map[string]bool
   244  
   245  func expectedShardContents() shardContents {
   246  	return shardContents{
   247  		"main.hnsw.commitlog.d":            false,
   248  		"geo.someGeoProp.hnsw.commitlog.d": false,
   249  		"lsm":                              false,
   250  		"indexcount":                       false,
   251  		"proplengths":                      false,
   252  		"version":                          false,
   253  	}
   254  }
   255  
   256  func (c shardContents) assert(t *testing.T) {
   257  	for name, found := range c {
   258  		assert.True(t, found, "didn't find %q in shard contents", name)
   259  	}
   260  }
   261  
   262  type fakeMigrationSchemaGetter struct {
   263  	sch    schema.Schema
   264  	states map[string]*sharding.State
   265  }
   266  
   267  func (sg *fakeMigrationSchemaGetter) GetSchemaSkipAuth() schema.Schema {
   268  	return sg.sch
   269  }
   270  
   271  func (sg *fakeMigrationSchemaGetter) Nodes() []string {
   272  	return nil
   273  }
   274  
   275  func (sg *fakeMigrationSchemaGetter) NodeName() string {
   276  	return ""
   277  }
   278  
   279  func (sg *fakeMigrationSchemaGetter) ClusterHealthScore() int {
   280  	return 0
   281  }
   282  
   283  func (sg *fakeMigrationSchemaGetter) ResolveParentNodes(string, string) (map[string]string, error) {
   284  	return nil, nil
   285  }
   286  
   287  func (sg *fakeMigrationSchemaGetter) CopyShardingState(class string) *sharding.State {
   288  	return sg.states[class]
   289  }
   290  
   291  func (sg *fakeMigrationSchemaGetter) ShardOwner(class, shard string) (string, error) {
   292  	return "", nil
   293  }
   294  
   295  func (sg *fakeMigrationSchemaGetter) TenantShard(class, tenant string) (string, string) {
   296  	return "", ""
   297  }
   298  
   299  func (sg *fakeMigrationSchemaGetter) ShardFromUUID(class string, uuid []byte) string {
   300  	return ""
   301  }
   302  
   303  func (sg *fakeMigrationSchemaGetter) ShardReplicas(class, shard string) ([]string, error) {
   304  	return nil, nil
   305  }