github.com/weaviate/weaviate@v1.24.6/adapters/repos/db/vector/hnsw/commit_log_combiner.go (about) 1 // _ _ 2 // __ _____ __ ___ ___ __ _| |_ ___ 3 // \ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \ 4 // \ V V / __/ (_| |\ V /| | (_| | || __/ 5 // \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___| 6 // 7 // Copyright © 2016 - 2024 Weaviate B.V. All rights reserved. 8 // 9 // CONTACT: hello@weaviate.io 10 // 11 12 package hnsw 13 14 import ( 15 "io" 16 "os" 17 "strings" 18 19 "github.com/pkg/errors" 20 "github.com/sirupsen/logrus" 21 ) 22 23 type CommitLogCombiner struct { 24 rootPath string 25 id string 26 threshold int64 27 logger logrus.FieldLogger 28 } 29 30 func NewCommitLogCombiner(rootPath, id string, threshold int64, 31 logger logrus.FieldLogger, 32 ) *CommitLogCombiner { 33 return &CommitLogCombiner{ 34 rootPath: rootPath, 35 id: id, 36 threshold: threshold, 37 logger: logger, 38 } 39 } 40 41 func (c *CommitLogCombiner) Do() (bool, error) { 42 executed := false 43 for { 44 // fileNames will already be in order 45 fileNames, err := getCommitFileNames(c.rootPath, c.id) 46 if err != nil { 47 return executed, errors.Wrap(err, "obtain files names") 48 } 49 50 ok, err := c.combineFirstMatch(fileNames) 51 if err != nil { 52 return executed, err 53 } 54 55 if ok { 56 executed = true 57 continue 58 } 59 60 break 61 } 62 return executed, nil 63 } 64 65 func (c *CommitLogCombiner) combineFirstMatch(fileNames []string) (bool, error) { 66 for i, fileName := range fileNames { 67 if !strings.HasSuffix(fileName, ".condensed") { 68 // not an already condensed file, so no candidate for combining 69 continue 70 } 71 72 if i == len(fileNames)-1 { 73 // this is the last file, so there is nothing to combine it with 74 return false, nil 75 } 76 77 if !strings.HasSuffix(fileNames[i+1], ".condensed") { 78 // the next file is not a condensed file, so this file is not candidate 79 // for merging with the next 80 continue 81 } 82 83 currentStat, err := os.Stat(fileName) 84 if err != nil { 85 return false, errors.Wrapf(err, "stat file %q", fileName) 86 } 87 88 if currentStat.Size() > c.threshold { 89 // already too big, can't combine further 90 continue 91 } 92 93 nextStat, err := os.Stat(fileNames[i+1]) 94 if err != nil { 95 return false, errors.Wrapf(err, "stat file %q", fileNames[i+1]) 96 } 97 98 if currentStat.Size()+nextStat.Size() > c.threshold { 99 // combining those two would exceed threshold 100 continue 101 } 102 103 if err := c.combine(fileName, fileNames[i+1]); err != nil { 104 return false, errors.Wrapf(err, "combine %q and %q", fileName, fileNames[i+1]) 105 } 106 107 return true, nil 108 } 109 110 return false, nil 111 } 112 113 func (c *CommitLogCombiner) combine(first, second string) error { 114 // all names are based on the first file, so that once file1 + file2 are 115 // combined it is as if file2 had never existed and file 1 was just always 116 // big enough to hold the contents of both 117 118 // clearly indicate that the file is "in progress", in case we crash while 119 // combining and the after restart there are multiple alternatives 120 tmpName := strings.TrimSuffix(first, ".condensed") + (".combined.tmp") 121 122 // finalName will look like an uncondensed original commit log, so the 123 // condensor will pick it up without even knowing that it's a combined file 124 finalName := strings.TrimSuffix(first, ".condensed") 125 126 if err := c.mergeFiles(tmpName, first, second); err != nil { 127 return errors.Wrap(err, "merge files") 128 } 129 130 if err := c.renameAndCleanUp(tmpName, finalName, first, second); err != nil { 131 return errors.Wrap(err, "rename and clean up files") 132 } 133 134 c.logger.WithFields(logrus.Fields{ 135 "action": "hnsw_commit_logger_combine_condensed_logs", 136 "id": c.id, 137 "input_first": first, 138 "input_second": second, 139 "output": finalName, 140 }).Info("successfully combined previously condensed commit log files") 141 142 return nil 143 } 144 145 func (c *CommitLogCombiner) mergeFiles(outName, first, second string) error { 146 out, err := os.Create(outName) 147 if err != nil { 148 return errors.Wrapf(err, "open target file %q", outName) 149 } 150 151 source1, err := os.Open(first) 152 if err != nil { 153 return errors.Wrapf(err, "open first source file %q", first) 154 } 155 defer source1.Close() 156 157 source2, err := os.Open(second) 158 if err != nil { 159 return errors.Wrapf(err, "open second source file %q", second) 160 } 161 defer source2.Close() 162 163 _, err = io.Copy(out, source1) 164 if err != nil { 165 return errors.Wrapf(err, "copy first source (%q) into target (%q)", first, 166 outName) 167 } 168 169 _, err = io.Copy(out, source2) 170 if err != nil { 171 return errors.Wrapf(err, "copy second source (%q) into target (%q)", second, 172 outName) 173 } 174 175 err = out.Close() 176 if err != nil { 177 return errors.Wrapf(err, "close target file %q", outName) 178 } 179 180 return nil 181 } 182 183 func (c *CommitLogCombiner) renameAndCleanUp(tmpName, finalName string, 184 toDeletes ...string, 185 ) error { 186 // do the rename before the delete, because if we crash in between we end up 187 // with duplicate files both with and without the ".condensed" suffix. The 188 // new (and complete) merged file will not carry the suffix whereas the 189 // sources will. This will look to the corrupted file fixer as if a 190 // condensing had gone wrong and will delete the the source 191 192 if err := os.Rename(tmpName, finalName); err != nil { 193 return errors.Wrapf(err, "rename tmp (%q) to final (%q)", tmpName, finalName) 194 } 195 196 for _, toDelete := range toDeletes { 197 if err := os.Remove(toDelete); err != nil { 198 return errors.Wrapf(err, "clean up %q", toDelete) 199 } 200 } 201 202 return nil 203 }