github.com/weaviate/weaviate@v1.24.6/usecases/backup/backend.go

//                           _       _
// __      _____  __ ___   ___  __ _| |_ ___
// \ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \
//  \ V  V /  __/ (_| |\ V /| | (_| | ||  __/
//   \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___|
//
//  Copyright © 2016 - 2024 Weaviate B.V. All rights reserved.
//
//  CONTACT: hello@weaviate.io
//

package backup

import (
	"context"
	"encoding/json"
	"fmt"
	"io"
	"os"
	"path"
	"runtime"
	"sync/atomic"
	"time"

	enterrors "github.com/weaviate/weaviate/entities/errors"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/sirupsen/logrus"
	"github.com/weaviate/weaviate/entities/backup"
	"github.com/weaviate/weaviate/entities/modulecapabilities"
	"github.com/weaviate/weaviate/usecases/monitoring"
)

// TODO adjust or make configurable
const (
	storeTimeout = 24 * time.Hour
	metaTimeout  = 20 * time.Minute

	// DefaultChunkSize is used if no chunk size is specified
	DefaultChunkSize = 1 << 27 // 128MB

	// maxChunkSize is the upper bound on the chunk size
	maxChunkSize = 1 << 29 // 512MB

	// minChunkSize is the lower bound on the chunk size
	minChunkSize = 1 << 21 // 2MB

	// maxCPUPercentage is the maximum percentage of CPUs the file writer may consume
	maxCPUPercentage = 80

	// DefaultCPUPercentage is the default percentage of CPUs the file writer may consume
	DefaultCPUPercentage = 50
)
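
// Illustrative only, not part of the original file: a minimal sketch of how a
// requested chunk size could be clamped to the bounds above. The real handling
// lives in the zip configuration (newZipConfig, defined elsewhere) and may differ.
func clampChunkSizeSketch(requested int64) int64 {
	switch {
	case requested == 0:
		return DefaultChunkSize // fall back to the default when unspecified
	case requested < minChunkSize:
		return minChunkSize
	case requested > maxChunkSize:
		return maxChunkSize
	default:
		return requested
	}
}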

const (
	// BackupFile is used by a node to store its metadata
	BackupFile = "backup.json"
	// GlobalBackupFile is used by the coordinator to store its metadata
	GlobalBackupFile  = "backup_config.json"
	GlobalRestoreFile = "restore_config.json"
	_TempDirectory    = ".backup.tmp"
)
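
// Illustrative only, inferred from the key names in this file: relative to an
// objStore's BasePath, the backend typically ends up holding objects such as
//
//	backup_config.json    // coordinator metadata (GlobalBackupFile)
//	restore_config.json   // coordinator restore metadata (GlobalRestoreFile)
//	backup.json           // per-node metadata (BackupFile)
//	<class>/chunk-<n>     // compressed shard chunks (see chunkKey below)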

var _NUMCPU = runtime.NumCPU()

type objStore struct {
	b        modulecapabilities.BackupBackend
	BasePath string
}

func (s *objStore) HomeDir() string {
	return s.b.HomeDir(s.BasePath)
}

func (s *objStore) WriteToFile(ctx context.Context, key, destPath string) error {
	return s.b.WriteToFile(ctx, s.BasePath, key, destPath)
}

// SourceDataPath is the data path of all source files
func (s *objStore) SourceDataPath() string {
	return s.b.SourceDataPath()
}

func (s *objStore) Write(ctx context.Context, key string, r io.ReadCloser) (int64, error) {
	return s.b.Write(ctx, s.BasePath, key, r)
}

func (s *objStore) Read(ctx context.Context, key string, w io.WriteCloser) (int64, error) {
	return s.b.Read(ctx, s.BasePath, key, w)
}

func (s *objStore) Initialize(ctx context.Context) error {
	return s.b.Initialize(ctx, s.BasePath)
}

// putMeta marshals and uploads metadata
func (s *objStore) putMeta(ctx context.Context, key string, desc interface{}) error {
	bytes, err := json.Marshal(desc)
	if err != nil {
		return fmt.Errorf("marshal meta file %q: %w", key, err)
	}
	ctx, cancel := context.WithTimeout(ctx, metaTimeout)
	defer cancel()
	if err := s.b.PutObject(ctx, s.BasePath, key, bytes); err != nil {
		return fmt.Errorf("upload meta file %q: %w", key, err)
	}
	return nil
}

// meta downloads and unmarshals metadata
func (s *objStore) meta(ctx context.Context, key string, dest interface{}) error {
	bytes, err := s.b.GetObject(ctx, s.BasePath, key)
	if err != nil {
		return err
	}
	err = json.Unmarshal(bytes, dest)
	if err != nil {
		return fmt.Errorf("unmarshal meta file %q: %w", key, err)
	}
	return nil
}
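
// Illustrative only, not part of the original file: a hedged sketch of the
// metadata round trip through putMeta and meta, assuming `store` points at an
// initialized objStore.
func metaRoundTripSketch(ctx context.Context, store *objStore) (*backup.BackupDescriptor, error) {
	src := backup.BackupDescriptor{ID: "my-backup"} // hypothetical backup ID
	if err := store.putMeta(ctx, BackupFile, &src); err != nil {
		return nil, err
	}
	var dst backup.BackupDescriptor
	if err := store.meta(ctx, BackupFile, &dst); err != nil {
		return nil, err
	}
	return &dst, nil
}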

type nodeStore struct {
	objStore
}

// Meta fetches a node's metadata using the standard path or, for backups
// created prior to v1.17, the deprecated old path.
//
// adjustBasePath: if true, the store's base path is switched to the old path
// when the backup is found there.
func (s *nodeStore) Meta(ctx context.Context, backupID string, adjustBasePath bool) (*backup.BackupDescriptor, error) {
	var result backup.BackupDescriptor
	err := s.meta(ctx, BackupFile, &result)
	if err != nil {
		cs := &objStore{s.b, backupID} // for backward compatibility
		if err := cs.meta(ctx, BackupFile, &result); err == nil {
			if adjustBasePath {
				s.objStore.BasePath = backupID
			}
			return &result, nil
		}
	}

	return &result, err
}

// PutMeta marshals and uploads a node's metadata
func (s *nodeStore) PutMeta(ctx context.Context, desc *backup.BackupDescriptor) error {
	return s.putMeta(ctx, BackupFile, desc)
}

type coordStore struct {
	objStore
}

// PutMeta puts the coordinator's global metadata into the object store
func (s *coordStore) PutMeta(ctx context.Context, filename string, desc *backup.DistributedBackupDescriptor) error {
	return s.putMeta(ctx, filename, desc)
}

// Meta gets the coordinator's global metadata from the object store
func (s *coordStore) Meta(ctx context.Context, filename string) (*backup.DistributedBackupDescriptor, error) {
	var result backup.DistributedBackupDescriptor
	err := s.meta(ctx, filename, &result)
	if err != nil && filename == GlobalBackupFile {
		var oldBackup backup.BackupDescriptor
		if err := s.meta(ctx, BackupFile, &oldBackup); err == nil {
			return oldBackup.ToDistributed(), nil
		}
	}
	return &result, err
}

// uploader uploads backup artifacts. This includes db files and metadata
type uploader struct {
	sourcer  Sourcer
	backend  nodeStore
	backupID string
	zipConfig
	setStatus func(st backup.Status)
	log       logrus.FieldLogger
}

func newUploader(sourcer Sourcer, backend nodeStore,
	backupID string, setstatus func(st backup.Status), l logrus.FieldLogger,
) *uploader {
	return &uploader{
		sourcer, backend,
		backupID,
		newZipConfig(Compression{
			Level:         DefaultCompression,
			CPUPercentage: DefaultCPUPercentage,
			ChunkSize:     DefaultChunkSize,
		}),
		setstatus,
		l,
	}
}

func (u *uploader) withCompression(cfg zipConfig) *uploader {
	u.zipConfig = cfg
	return u
}
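
// Illustrative only, not part of the original file: a hedged sketch of wiring
// up an uploader with non-default compression. `sourcer`, `store`, `setStatus`,
// and `log` are assumed to come from the surrounding backup flow.
func newUploaderSketch(sourcer Sourcer, store nodeStore,
	setStatus func(backup.Status), log logrus.FieldLogger,
) *uploader {
	return newUploader(sourcer, store, "my-backup", setStatus, log).
		withCompression(newZipConfig(Compression{
			Level:         DefaultCompression,
			CPUPercentage: 30,               // use fewer CPUs than the default
			ChunkSize:     minChunkSize * 4, // smaller chunks than DefaultChunkSize
		}))
}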

// all uploads all files in addition to the metadata file
func (u *uploader) all(ctx context.Context, classes []string, desc *backup.BackupDescriptor) (err error) {
	u.setStatus(backup.Transferring)
	desc.Status = string(backup.Transferring)
	ch := u.sourcer.BackupDescriptors(ctx, desc.ID, classes)
	defer func() {
		// make sure context is not cancelled when uploading metadata
		ctx := context.Background()
		if err != nil {
			desc.Error = err.Error()
			err = fmt.Errorf("upload %w: %v", err, u.backend.PutMeta(ctx, desc))
		} else {
			u.log.Info("start uploading meta data")
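			// if the meta upload fails, record Transferred rather than Success:
			// the data reached the backend but the backup lacks its metadata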
			if err = u.backend.PutMeta(ctx, desc); err != nil {
				desc.Status = string(backup.Transferred)
			}
			u.setStatus(backup.Success)
			u.log.Info("finish uploading meta data")
		}
	}()
Loop:
	for {
		select {
		case cdesc, ok := <-ch:
			if !ok {
				break Loop // we are done
			}
			if cdesc.Error != nil {
				return cdesc.Error
			}
			u.log.WithField("class", cdesc.Name).Info("start uploading files")
			if err := u.class(ctx, desc.ID, &cdesc); err != nil {
				return err
			}
			desc.Classes = append(desc.Classes, cdesc)
			u.log.WithField("class", cdesc.Name).Info("finish uploading files")

		case <-ctx.Done():
			return ctx.Err()
		}
	}
	u.setStatus(backup.Transferred)
	desc.Status = string(backup.Success)
	return nil
}

// class uploads one class
func (u *uploader) class(ctx context.Context, id string, desc *backup.ClassDescriptor) (err error) {
	classLabel := desc.Name
	if monitoring.GetMetrics().Group {
		classLabel = "n/a"
	}
	metric, err := monitoring.GetMetrics().BackupStoreDurations.GetMetricWithLabelValues(getType(u.backend.b), classLabel)
	if err == nil {
		timer := prometheus.NewTimer(metric)
		defer timer.ObserveDuration()
	}
	defer func() {
		// backups need to be released anyway
		enterrors.GoWrapper(func() { u.sourcer.ReleaseBackup(context.Background(), id, desc.Name) }, u.log)
	}()
	ctx, cancel := context.WithTimeout(ctx, storeTimeout)
	defer cancel()
	nShards := len(desc.Shards)
	if nShards == 0 {
		return nil
	}

	desc.Chunks = make(map[int32][]string, 1+nShards/2)
	var (
		hasJobs   atomic.Bool
		lastChunk = int32(0)
		nWorker   = u.GoPoolSize
	)
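	// Each worker claims a fresh chunk id via atomic.AddInt32 on lastChunk and
	// keeps packing shards from the jobs channel into that chunk until the size
	// cap is reached; hasJobs lets idle workers exit once the producer is done.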
	if nWorker > nShards {
		nWorker = nShards
	}
	hasJobs.Store(nShards > 0)

	// jobs produces work for the processor
	jobs := func(xs []*backup.ShardDescriptor) <-chan *backup.ShardDescriptor {
		sendCh := make(chan *backup.ShardDescriptor)
		f := func() {
			defer close(sendCh)
			defer hasJobs.Store(false)

			for _, shard := range xs {
				select {
				case sendCh <- shard:
				// cancellation happens for one of two reasons:
				//  1. the whole operation has been aborted, or
				//  2. the processor routine has returned an error
				case <-ctx.Done():
					return
				}
			}
		}
		enterrors.GoWrapper(f, u.log)
		return sendCh
	}

	// processor
	processor := func(nWorker int, sender <-chan *backup.ShardDescriptor) <-chan chunkShards {
		eg, ctx := enterrors.NewErrorGroupWithContextWrapper(u.log, ctx)
		eg.SetLimit(nWorker)
		recvCh := make(chan chunkShards, nWorker)
		f := func() {
			defer close(recvCh)
			for i := 0; i < nWorker; i++ {
				eg.Go(func() error {
					// operation might have been aborted, see comment above
					if err := ctx.Err(); err != nil {
						return err
					}
					for hasJobs.Load() {
						chunk := atomic.AddInt32(&lastChunk, 1)
						shards, err := u.compress(ctx, desc.Name, chunk, sender)
						if err != nil {
							return err
						}
						if m := int32(len(shards)); m > 0 {
							recvCh <- chunkShards{chunk, shards}
						}
					}
					return err
				})
			}
			err = eg.Wait()
		}
		enterrors.GoWrapper(f, u.log)
		return recvCh
	}

	for x := range processor(nWorker, jobs(desc.Shards)) {
		desc.Chunks[x.chunk] = x.shards
	}
	return
}

type chunkShards struct {
	chunk  int32
	shards []string
}

func (u *uploader) compress(ctx context.Context,
	class string, // class name
	chunk int32, // chunk index
	ch <-chan *backup.ShardDescriptor, // chan of shards
) ([]string, error) {
	var (
		chunkKey = chunkKey(class, chunk)
		shards   = make([]string, 0, 10)
		// add tolerance to enable better optimization of the chunk size
		maxSize = int64(u.ChunkSize + u.ChunkSize/20) // size + 5%
	)
	zip, reader := NewZip(u.backend.SourceDataPath(), u.Level)
	producer := func() error {
		defer zip.Close()
		lastShardSize := int64(0)
		for shard := range ch {
			if _, err := zip.WriteShard(ctx, shard); err != nil {
				return err
			}
			shard.Chunk = chunk
			shards = append(shards, shard.Name)
			shard.ClearTemporary()

			zip.gzw.Flush() // flush new shard
			lastShardSize = zip.lastWritten() - lastShardSize
			if zip.lastWritten()+lastShardSize > maxSize {
				break
			}
		}
		return nil
	}

	// consumer
	eg := enterrors.NewErrorGroupWrapper(u.log)
	eg.Go(func() error {
		if _, err := u.backend.Write(ctx, chunkKey, reader); err != nil {
			return err
		}
		return nil
	})

	if err := producer(); err != nil {
		return shards, err
	}
	// wait for the consumer to finish
	return shards, eg.Wait()
}
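
// Illustrative only, not part of the original file: the 5% tolerance arithmetic
// used by compress, worked through with the default chunk size.
func chunkSizeToleranceSketch() int64 {
	size := int64(DefaultChunkSize) // 1 << 27 = 134,217,728 bytes (128 MiB)
	// size/20 = 6,710,886, so the cap is 140,928,614 bytes (~134.4 MiB):
	// a chunk may overshoot ChunkSize by up to 5% before the producer stops.
	return size + size/20
}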

// fileWriter downloads files from the object store and writes them to the destination folder destDir
type fileWriter struct {
	sourcer    Sourcer
	backend    nodeStore
	tempDir    string
	destDir    string
	movedFiles []string // files successfully moved to destination folder
	compressed bool
	GoPoolSize int
	migrator   func(classPath string) error
	logger     logrus.FieldLogger
}

func newFileWriter(sourcer Sourcer, backend nodeStore,
	compressed bool, logger logrus.FieldLogger,
) *fileWriter {
	destDir := backend.SourceDataPath()
	return &fileWriter{
		sourcer:    sourcer,
		backend:    backend,
		destDir:    destDir,
		tempDir:    path.Join(destDir, _TempDirectory),
		movedFiles: make([]string, 0, 64),
		compressed: compressed,
		GoPoolSize: routinePoolSize(50),
		logger:     logger,
	}
}

func (fw *fileWriter) WithPoolPercentage(p int) *fileWriter {
	fw.GoPoolSize = routinePoolSize(p)
	return fw
}
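
// Illustrative only, not part of the original file: a hedged sketch of restoring
// one class with a fileWriter. `sourcer`, `store`, and `desc` are assumed to be
// provided by the surrounding restore flow.
func restoreClassSketch(ctx context.Context, sourcer Sourcer, store nodeStore,
	desc *backup.ClassDescriptor, log logrus.FieldLogger,
) error {
	fw := newFileWriter(sourcer, store, true /* compressed */, log).
		WithPoolPercentage(DefaultCPUPercentage)
	rollback, err := fw.Write(ctx, desc)
	if err != nil {
		return err // Write has already rolled back any moved files
	}
	_ = rollback // the caller may invoke rollback() if a later step fails
	return nil
}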

func (fw *fileWriter) setMigrator(m func(classPath string) error) { fw.migrator = m }

// Write downloads files and puts them in the destination directory
func (fw *fileWriter) Write(ctx context.Context, desc *backup.ClassDescriptor) (rollback func() error, err error) {
	if len(desc.Shards) == 0 { // nothing to copy
		return func() error { return nil }, nil
	}
	classTempDir := path.Join(fw.tempDir, desc.Name)
	defer func() {
		if err != nil {
			if rerr := fw.rollBack(); rerr != nil {
				err = fmt.Errorf("%w: %v", err, rerr)
			}
		}
		os.RemoveAll(classTempDir)
	}()

	if err := fw.writeTempFiles(ctx, classTempDir, desc); err != nil {
		return nil, fmt.Errorf("get files: %w", err)
	}

	if fw.migrator != nil {
		if err := fw.migrator(classTempDir); err != nil {
			return nil, fmt.Errorf("migrate from pre 1.23: %w", err)
		}
	}

	if err := fw.moveAll(classTempDir); err != nil {
		return nil, fmt.Errorf("move files to destination: %w", err)
	}

	return func() error { return fw.rollBack() }, nil
}

// writeTempFiles writes class files into a temporary directory
// (temporary directory path = fw.tempDir/className) and makes sure
// that created files are removed in case of an error
func (fw *fileWriter) writeTempFiles(ctx context.Context, classTempDir string, desc *backup.ClassDescriptor) (err error) {
	if err := os.RemoveAll(classTempDir); err != nil {
		return fmt.Errorf("remove %s: %w", classTempDir, err)
	}
	if err := os.MkdirAll(classTempDir, os.ModePerm); err != nil {
		return fmt.Errorf("create temp class folder %s: %w", classTempDir, err)
	}
	ctx, cancel := context.WithCancel(ctx)
	defer cancel()

	eg, ctx := enterrors.NewErrorGroupWithContextWrapper(fw.logger, ctx)
	// source files are not compressed: download them one by one, as before
	if !fw.compressed {
		eg.SetLimit(2 * _NUMCPU)
		for _, shard := range desc.Shards {
			shard := shard
			eg.Go(func() error { return fw.writeTempShard(ctx, shard, classTempDir) }, shard.Name)
		}
		return eg.Wait()
	}

	// source files are compressed: download whole chunks and unzip them
	eg.SetLimit(fw.GoPoolSize)
	for k := range desc.Chunks {
		chunk := chunkKey(desc.Name, k)
		eg.Go(func() error {
			uz, w := NewUnzip(classTempDir)
			enterrors.GoWrapper(func() {
				fw.backend.Read(ctx, chunk, w)
			}, fw.logger)
			_, err := uz.ReadChunk()
			return err
		})
	}
	return eg.Wait()
}

func (fw *fileWriter) writeTempShard(ctx context.Context, sd *backup.ShardDescriptor, classTempDir string) error {
	for _, key := range sd.Files {
		destPath := path.Join(classTempDir, key)
		destDir := path.Dir(destPath)
		if err := os.MkdirAll(destDir, os.ModePerm); err != nil {
			return fmt.Errorf("create folder %s: %w", destDir, err)
		}
		if err := fw.backend.WriteToFile(ctx, key, destPath); err != nil {
			return fmt.Errorf("write file %s: %w", destPath, err)
		}
	}
	destPath := path.Join(classTempDir, sd.DocIDCounterPath)
	if err := os.WriteFile(destPath, sd.DocIDCounter, os.ModePerm); err != nil {
		return fmt.Errorf("write counter file %s: %w", destPath, err)
	}
	destPath = path.Join(classTempDir, sd.PropLengthTrackerPath)
	if err := os.WriteFile(destPath, sd.PropLengthTracker, os.ModePerm); err != nil {
		return fmt.Errorf("write prop file %s: %w", destPath, err)
	}
	destPath = path.Join(classTempDir, sd.ShardVersionPath)
	if err := os.WriteFile(destPath, sd.Version, os.ModePerm); err != nil {
		return fmt.Errorf("write version file %s: %w", destPath, err)
	}
	return nil
}

// moveAll moves all files to the destination
func (fw *fileWriter) moveAll(classTempDir string) (err error) {
	files, err := os.ReadDir(classTempDir)
	if err != nil {
		return fmt.Errorf("read %s: %w", classTempDir, err)
	}
	destDir := fw.destDir
	for _, key := range files {
		from := path.Join(classTempDir, key.Name())
		to := path.Join(destDir, key.Name())
		if err := os.Rename(from, to); err != nil {
			return fmt.Errorf("move %s %s: %w", from, to, err)
		}
		fw.movedFiles = append(fw.movedFiles, to)
	}

	return nil
}

// rollBack removes files that were successfully moved to the destination folder
func (fw *fileWriter) rollBack() (err error) {
	for _, fpath := range fw.movedFiles {
		if rerr := os.RemoveAll(fpath); rerr != nil && err == nil {
			err = fmt.Errorf("rollback %s: %w", fpath, rerr)
		}
	}
	return err
}

func chunkKey(class string, id int32) string {
	return fmt.Sprintf("%s/chunk-%d", class, id)
}

func routinePoolSize(percentage int) int {
	if percentage == 0 { // default value
		percentage = DefaultCPUPercentage
	} else if percentage > maxCPUPercentage {
		percentage = maxCPUPercentage
	}
	if x := (_NUMCPU * percentage) / 100; x > 0 {
		return x
	}
	return 1
}
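
// Illustrative only, not part of the original file: worked examples for the two
// helpers above, assuming an 8-core machine (_NUMCPU == 8).
//
//	chunkKey("Article", 3) // "Article/chunk-3"
//	routinePoolSize(0)     // (8*50)/100 = 4 (0 falls back to DefaultCPUPercentage)
//	routinePoolSize(90)    // capped at maxCPUPercentage: (8*80)/100 = 6
//	routinePoolSize(5)     // (8*5)/100 = 0, clamped up to 1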