github.com/weaviate/weaviate@v1.24.6/adapters/repos/db/vector/hnsw/commit_log_combiner.go (about)

     1  //                           _       _
     2  // __      _____  __ ___   ___  __ _| |_ ___
     3  // \ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \
     4  //  \ V  V /  __/ (_| |\ V /| | (_| | ||  __/
     5  //   \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___|
     6  //
     7  //  Copyright © 2016 - 2024 Weaviate B.V. All rights reserved.
     8  //
     9  //  CONTACT: hello@weaviate.io
    10  //
    11  
    12  package hnsw
    13  
    14  import (
    15  	"io"
    16  	"os"
    17  	"strings"
    18  
    19  	"github.com/pkg/errors"
    20  	"github.com/sirupsen/logrus"
    21  )
    22  
    23  type CommitLogCombiner struct {
    24  	rootPath  string
    25  	id        string
    26  	threshold int64
    27  	logger    logrus.FieldLogger
    28  }
    29  
    30  func NewCommitLogCombiner(rootPath, id string, threshold int64,
    31  	logger logrus.FieldLogger,
    32  ) *CommitLogCombiner {
    33  	return &CommitLogCombiner{
    34  		rootPath:  rootPath,
    35  		id:        id,
    36  		threshold: threshold,
    37  		logger:    logger,
    38  	}
    39  }
    40  
    41  func (c *CommitLogCombiner) Do() (bool, error) {
    42  	executed := false
    43  	for {
    44  		// fileNames will already be in order
    45  		fileNames, err := getCommitFileNames(c.rootPath, c.id)
    46  		if err != nil {
    47  			return executed, errors.Wrap(err, "obtain files names")
    48  		}
    49  
    50  		ok, err := c.combineFirstMatch(fileNames)
    51  		if err != nil {
    52  			return executed, err
    53  		}
    54  
    55  		if ok {
    56  			executed = true
    57  			continue
    58  		}
    59  
    60  		break
    61  	}
    62  	return executed, nil
    63  }
    64  
    65  func (c *CommitLogCombiner) combineFirstMatch(fileNames []string) (bool, error) {
    66  	for i, fileName := range fileNames {
    67  		if !strings.HasSuffix(fileName, ".condensed") {
    68  			// not an already condensed file, so no candidate for combining
    69  			continue
    70  		}
    71  
    72  		if i == len(fileNames)-1 {
    73  			// this is the last file, so there is nothing to combine it with
    74  			return false, nil
    75  		}
    76  
    77  		if !strings.HasSuffix(fileNames[i+1], ".condensed") {
    78  			// the next file is not a condensed file, so this file is not candidate
    79  			// for merging with the next
    80  			continue
    81  		}
    82  
    83  		currentStat, err := os.Stat(fileName)
    84  		if err != nil {
    85  			return false, errors.Wrapf(err, "stat file %q", fileName)
    86  		}
    87  
    88  		if currentStat.Size() > c.threshold {
    89  			// already too big, can't combine further
    90  			continue
    91  		}
    92  
    93  		nextStat, err := os.Stat(fileNames[i+1])
    94  		if err != nil {
    95  			return false, errors.Wrapf(err, "stat file %q", fileNames[i+1])
    96  		}
    97  
    98  		if currentStat.Size()+nextStat.Size() > c.threshold {
    99  			// combining those two would exceed threshold
   100  			continue
   101  		}
   102  
   103  		if err := c.combine(fileName, fileNames[i+1]); err != nil {
   104  			return false, errors.Wrapf(err, "combine %q and %q", fileName, fileNames[i+1])
   105  		}
   106  
   107  		return true, nil
   108  	}
   109  
   110  	return false, nil
   111  }
   112  
   113  func (c *CommitLogCombiner) combine(first, second string) error {
   114  	// all names are based on the first file, so that once file1 + file2 are
   115  	// combined it is as if file2 had never existed and file 1 was just always
   116  	// big enough to hold the contents of both
   117  
   118  	// clearly indicate that the file is "in progress", in case we crash while
   119  	// combining and the after restart there are multiple alternatives
   120  	tmpName := strings.TrimSuffix(first, ".condensed") + (".combined.tmp")
   121  
   122  	// finalName will look like an uncondensed original commit log, so the
   123  	// condensor will pick it up without even knowing that it's a combined file
   124  	finalName := strings.TrimSuffix(first, ".condensed")
   125  
   126  	if err := c.mergeFiles(tmpName, first, second); err != nil {
   127  		return errors.Wrap(err, "merge files")
   128  	}
   129  
   130  	if err := c.renameAndCleanUp(tmpName, finalName, first, second); err != nil {
   131  		return errors.Wrap(err, "rename and clean up files")
   132  	}
   133  
   134  	c.logger.WithFields(logrus.Fields{
   135  		"action":       "hnsw_commit_logger_combine_condensed_logs",
   136  		"id":           c.id,
   137  		"input_first":  first,
   138  		"input_second": second,
   139  		"output":       finalName,
   140  	}).Info("successfully combined previously condensed commit log files")
   141  
   142  	return nil
   143  }
   144  
   145  func (c *CommitLogCombiner) mergeFiles(outName, first, second string) error {
   146  	out, err := os.Create(outName)
   147  	if err != nil {
   148  		return errors.Wrapf(err, "open target file %q", outName)
   149  	}
   150  
   151  	source1, err := os.Open(first)
   152  	if err != nil {
   153  		return errors.Wrapf(err, "open first source file %q", first)
   154  	}
   155  	defer source1.Close()
   156  
   157  	source2, err := os.Open(second)
   158  	if err != nil {
   159  		return errors.Wrapf(err, "open second source file %q", second)
   160  	}
   161  	defer source2.Close()
   162  
   163  	_, err = io.Copy(out, source1)
   164  	if err != nil {
   165  		return errors.Wrapf(err, "copy first source (%q) into target (%q)", first,
   166  			outName)
   167  	}
   168  
   169  	_, err = io.Copy(out, source2)
   170  	if err != nil {
   171  		return errors.Wrapf(err, "copy second source (%q) into target (%q)", second,
   172  			outName)
   173  	}
   174  
   175  	err = out.Close()
   176  	if err != nil {
   177  		return errors.Wrapf(err, "close target file %q", outName)
   178  	}
   179  
   180  	return nil
   181  }
   182  
   183  func (c *CommitLogCombiner) renameAndCleanUp(tmpName, finalName string,
   184  	toDeletes ...string,
   185  ) error {
   186  	// do the rename before the delete, because if we crash in between we end up
   187  	// with duplicate files both with and without the ".condensed" suffix. The
   188  	// new (and complete) merged file will not carry the suffix whereas the
   189  	// sources will. This will look to the corrupted file fixer as if a
   190  	// condensing had gone wrong and will delete the the source
   191  
   192  	if err := os.Rename(tmpName, finalName); err != nil {
   193  		return errors.Wrapf(err, "rename tmp (%q) to final (%q)", tmpName, finalName)
   194  	}
   195  
   196  	for _, toDelete := range toDeletes {
   197  		if err := os.Remove(toDelete); err != nil {
   198  			return errors.Wrapf(err, "clean up %q", toDelete)
   199  		}
   200  	}
   201  
   202  	return nil
   203  }