gitlab.com/SkynetLabs/skyd@v1.6.9/skymodules/renter/dirupdatebatcher.go

package renter

// dirupdatebatcher.go contains the logic for the dirUpdateBatcher, which is a
// batching tool that improves the performance of updating a large number of
// directories in the same time period by removing redundant calls to the same
// directory, and by removing redundant update calls that would happen on
// shared parent directories.
//
// NOTE: The dir update batcher is already fairly optimized. There are two
// known places to improve performance, but both contain a fair amount of
// programming overhead and could potentially make performance worse if
// implemented incorrectly. The first is that batches do not deduplicate
// against each other. If flush() is called on one batch before the previous
// batch is finished, the two batches may perform redundant work. This can be
// deduplicated if the batches have pointers to each other, however for
// garbage collection purposes you need to make sure to clean up the pointers
// later. The second is that the update calls are all made together in rapid
// succession, which could hog the CPU and consume a ton of disk IOPS all at
// once. We try to manage this by only batching together 30 seconds of updates
// at a time. You could try to slow down the update calls so that the CPU is
// under less stress, but this may block parts of the repair loop, and may
// also block user calls. It is unlikely that either of these optimizations
// needs to be pursued, but they are something to keep in mind if the batcher
// seems to be causing issues in production.

import (
	"container/list"
	"fmt"
	"sync"
	"time"

	"gitlab.com/NebulousLabs/errors"
	"gitlab.com/SkynetLabs/skyd/build"
	"gitlab.com/SkynetLabs/skyd/skymodules"
)

var (
	// maxTimeBetweenBatchExecutions defines the amount of time that a batch
	// will wait before executing the queue of directories to batch. The
	// testing value is really low at 50ms to maximize the opportunity for
	// threads to queue things across multiple batches (which should be safe,
	// but potentially has edge cases).
	//
	// The production value of 30 seconds is also relatively low. It was
	// originally set a lot higher (15 minutes), but we saw in production that
	// this would result in large amounts of files being batched together all
	// at once, causing the flush to take over a minute.
	maxTimeBetweenBatchExecutions = build.Select(build.Var{
		Dev:      10 * time.Second,
		Standard: 30 * time.Second,
		Testing:  50 * time.Millisecond,
	}).(time.Duration)
)

type (
	// dirUpdateBatch defines a batch of updates that should be run at the
	// same time. Performing an update on a file requires doing an update on
	// its directory and all parent directories up to the root directory. By
	// doing the updates as a batch, we can reduce the total amount of work
	// required to complete the update.
	//
	// NOTE: the health update batch depends on the mutex of the
	// dirUpdateBatcher for thread safety.
	dirUpdateBatch struct {
		// batchSet is an array of maps which contain the directories that
		// need to be updated. Each element of the array corresponds to a
		// directory depth. The first element of the array contains only the
		// root directory. The second element is a map that contains only
		// direct subdirs of the root. The third element is a map that
		// contains directories which live directly in subdirs of the root,
		// and so on.
		//
		// When performing the update on the set, the lowest level dirs are
		// all executed at once, then their parents are added to the batchSet,
		// then the next level of dirs are executed all together, and so on.
		// This ensures that each directory is only updated a single time per
		// batch, even if it appears as a parent in dozens of directories in
		// the batchSet.
		batchSet []map[skymodules.SiaPath]struct{}

		// completeChan is a channel that gets closed when the whole batch has
		// successfully executed. It will not be closed until
		// priorCompleteChan has been closed. priorCompleteChan is the channel
		// owned by the previous batch. This ensures that when the channel is
		// closed, all updates are certain to have completed, even if those
		// updates were submitted to previous batches.
		completeChan      chan struct{}
		priorCompleteChan <-chan struct{}

		// Contains a renter, and also has some dependency injection logic.
		dirUpdateBatchDeps
	}

	// dirUpdateBatcher receives requests to update the health of a file or
	// directory and adds them to a batch. This struct manages concurrency and
	// safety between different batches.
	dirUpdateBatcher struct {
		// nextBatch defines the next batch that will perform a health update.
		nextBatch *dirUpdateBatch

		// Utilities
		closed          bool // callQueueDirUpdate is a no-op after shutdown
		staticFlushChan chan struct{}
		mu              sync.Mutex
		staticRenter    *Renter
	}
)
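
// exampleDepthDedup is an illustrative sketch, not called anywhere in skyd,
// that makes the dedup property of batchSet concrete. It assumes a and b are
// distinct sibling directories at the same nonzero depth (e.g. "foo/bar" and
// "foo/baz"): both insert the same parent into the next level's map, so the
// shared parent, and every ancestor above it, is updated only once.
func exampleDepthDedup(a, b skymodules.SiaPath) error {
	depth := a.Depth()
	batchSet := make([]map[skymodules.SiaPath]struct{}, depth+1)
	for i := range batchSet {
		batchSet[i] = make(map[skymodules.SiaPath]struct{})
	}
	// Queue both siblings at their shared depth.
	batchSet[depth][a] = struct{}{}
	batchSet[depth][b] = struct{}{}
	// Walk upward exactly like managedExecute does, inserting parents into
	// the next level. The map collapses the siblings' common parent into a
	// single entry.
	for i := depth; i > 0; i-- {
		for dir := range batchSet[i] {
			parent, err := dir.Dir()
			if err != nil {
				return err
			}
			batchSet[i-1][parent] = struct{}{}
		}
	}
	// batchSet[depth-1] now holds exactly one entry: the shared parent. The
	// chain continues up to batchSet[0], which holds only the root.
	return nil
}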

// managedExecute will execute a batch of updates.
func (batch *dirUpdateBatch) managedExecute() {
	renter := batch.dirUpdateBatchDeps.renter
	start := time.Now()
	dirs := 0
	defer func() {
		str := fmt.Sprintf("dirupdatebatch completed %v dirs in %v", dirs, time.Since(start))
		renter.staticLog.Debugln(str, "dirupdatebatcher")
	}()

	// Iterate through the batchSet backwards, from the deepest dirs to root.
	for i := len(batch.batchSet) - 1; i >= 0; i-- {
		for dirPath := range batch.batchSet[i] {
			// Update the directory metadata. Note: we don't do any updates on
			// the file healths themselves, we just use the file metadata.
			err := batch.managedUpdateDirMetadata(dirPath) // passes through to the renter except during testing
			if err != nil {
				str := fmt.Sprintf("error updating directory %v in dirUpdateBatch.execute: %v", dirPath, err)
				renter.staticLog.Println(str, "health-verbose", "dirupdatebatcher", "error")
				continue
			}
			dirs++ // Increment after the error check, so failures aren't counted.

			// Add the parent to the next level of the batchSet.
			if !dirPath.IsRoot() {
				parent, err := dirPath.Dir()
				if err != nil {
					renter.staticLog.Critical("should not be getting an error when grabbing the dir of a non-root siadir:", dirPath, err)
				}
				batch.batchSet[i-1][parent] = struct{}{}
			}
		}
	}

	// Wait until the previous batch is complete. If we are shutting down, go
	// ahead and front-run the previous batch and just signal a close
	// immediately.
	select {
	case <-batch.priorCompleteChan:
	case <-batch.renter.tg.StopChan():
	}
	close(batch.completeChan)
}
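
// The completion channels above form a general chaining pattern: each batch
// closes its own channel only after the previous batch's channel is closed,
// so waiting on any batch's channel guarantees that all earlier batches have
// finished too. exampleChainedCompletion is a minimal standalone sketch of
// that pattern; it is illustrative only, is not used by skyd, and all names
// in it are hypothetical.
func exampleChainedCompletion() {
	// The first link in the chain gets a channel that is already closed,
	// mirroring what newDirUpdateBatcher below does for the first batch.
	prior := make(chan struct{})
	close(prior)
	for i := 0; i < 3; i++ {
		complete := make(chan struct{})
		go func(prior <-chan struct{}, complete chan struct{}) {
			// ... perform this batch's work here ...
			<-prior         // wait until all earlier batches are done
			close(complete) // now everything up to this batch is done
		}(prior, complete)
		prior = complete
	}
	<-prior // unblocks only after every batch in the chain has finished
}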

// callQueueDirUpdate will add an update to the current batch. The input needs
// to be a dir.
func (dub *dirUpdateBatcher) callQueueDirUpdate(dirPath skymodules.SiaPath) {
	dub.mu.Lock()
	defer dub.mu.Unlock()
	if dub.closed {
		return
	}
	dub.staticRenter.staticLog.Debugln("dirUpdateBatcher queuing update for:", dirPath)
	// Make sure maps at each depth exist.
	depth := dirPath.Depth()
	for i := len(dub.nextBatch.batchSet); i <= depth; i++ {
		dub.nextBatch.batchSet = append(dub.nextBatch.batchSet, make(map[skymodules.SiaPath]struct{}))
	}
	// Add the input dirPath to the map at its depth.
	dub.nextBatch.batchSet[depth][dirPath] = struct{}{}
}

// callFlush will trigger the current batch of updates to execute, and will
// not return until all updates have completed and are represented in the root
// directory. It will also not return until all prior batches have completed
// as well - if you have added a directory to a batch and call flush, you can
// be certain that the directory update will have executed by the time the
// flush call returns, regardless of which batch that directory was added to.
func (dub *dirUpdateBatcher) callFlush() {
	// Grab the complete chan for the current batch.
	dub.mu.Lock()
	completeChan := dub.nextBatch.completeChan
	dub.mu.Unlock()

	// Signal that the current batch should be flushed. The flush channel has
	// a one-slot buffer, so if a flush signal is already pending this send is
	// a no-op rather than a block.
	select {
	case dub.staticFlushChan <- struct{}{}:
	default:
	}

	// Wait until the batch has completed before returning. No need to wait if
	// the renter has closed, just exit immediately.
	select {
	case <-completeChan:
	case <-dub.staticRenter.tg.StopChan():
	}
}

// newBatch returns a new dirUpdateBatch ready for use.
func (dub *dirUpdateBatcher) newBatch(priorCompleteChan <-chan struct{}) *dirUpdateBatch {
	return &dirUpdateBatch{
		completeChan:      make(chan struct{}),
		priorCompleteChan: priorCompleteChan,

		dirUpdateBatchDeps: dirUpdateBatchDeps{
			renter: dub.staticRenter,
		},
	}
}

// threadedExecuteBatchUpdates is a permanent background thread which will
// execute batched updates in the background.
func (dub *dirUpdateBatcher) threadedExecuteBatchUpdates() {
	for {
		select {
		case <-dub.staticRenter.tg.StopChan():
			dub.mu.Lock()
			dub.closed = true
			dub.mu.Unlock()
			dub.nextBatch.managedExecute()
			return
		case <-dub.staticFlushChan:
		case <-time.After(maxTimeBetweenBatchExecutions):
		}

		// Rotate the current batch out for a new batch. This will block any
		// thread trying to add new updates to the batch, so make sure it
		// happens quickly.
		dub.mu.Lock()
		batch := dub.nextBatch
		// Chain the new batch to the outgoing batch's completeChan, so that
		// the new batch's completeChan cannot be closed until the outgoing
		// batch has fully executed.
		dub.nextBatch = dub.newBatch(batch.completeChan)
		dub.mu.Unlock()
		// Execute the batch now that we aren't blocking anymore.
		batch.managedExecute()
	}
}
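
// callFlush above wakes this background thread with a non-blocking send on a
// channel that has a one-slot buffer. This is a common Go idiom for
// coalescing wakeups: any number of callers can request a flush, but at most
// one signal is ever pending, and a single receive observes them all.
// exampleCoalescedWakeup is a minimal standalone sketch of the idiom; it is
// illustrative only and all names in it are hypothetical.
func exampleCoalescedWakeup() {
	wake := make(chan struct{}, 1) // the one-slot buffer holds a pending signal

	// Any number of producers can call this. Extra signals are dropped
	// rather than blocking, because a wakeup is already pending.
	requestWakeup := func() {
		select {
		case wake <- struct{}{}:
		default:
		}
	}

	done := make(chan struct{})
	go func() {
		<-wake // one receive observes any number of coalesced requests
		close(done)
	}()
	requestWakeup()
	requestWakeup() // coalesced with the first request
	<-done
}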

// newDirUpdateBatcher returns a dir update batcher that is ready for use.
func (r *Renter) newDirUpdateBatcher() (*dirUpdateBatcher, error) {
	dub := &dirUpdateBatcher{
		staticFlushChan: make(chan struct{}, 1),
		staticRenter:    r,
	}

	// The next batch needs a channel which will be closed when the previous
	// batch completes. Since there is no previous batch, we provide a channel
	// that is already closed.
	initialChan := make(chan struct{})
	close(initialChan)

	dub.nextBatch = dub.newBatch(initialChan)
	err := r.tg.Launch(dub.threadedExecuteBatchUpdates)
	if err != nil {
		return nil, errors.AddContext(err, "unable to launch the batch updates background thread")
	}
	return dub, nil
}

// UpdateMetadata will explicitly update the metadata of the provided
// directory, returning once the directory has been updated and the changes
// are reflected in the aggregate metadata of the root directory. If the
// recursive flag is set, it will do a check on all subdirs as well.
//
// NOTE: This call is not very efficient, and generally isn't intended to be
// used on large directories with lots of subdirectories.
func (r *Renter) UpdateMetadata(siaPath skymodules.SiaPath, recursive bool) error {
	err := r.tg.Add()
	if err != nil {
		return err
	}
	defer r.tg.Done()

	// Use a list as a queue to track all of the siapaths we want to update.
	dirPaths := list.New()
	dirPaths.PushBack(siaPath)
	for dirPaths.Front() != nil {
		e := dirPaths.Front()
		dirPaths.Remove(e)
		siaPath := e.Value.(skymodules.SiaPath)
		err := r.managedUpdateFilesInDir(siaPath)
		if err != nil {
			context := fmt.Sprintf("unable to update the metadata of the files in dir %v", siaPath)
			return errors.AddContext(err, context)
		}
		r.staticDirUpdateBatcher.callQueueDirUpdate(siaPath)
		if !recursive {
			// If the recursive flag isn't set, skip the subdir scan. The
			// queue drains after this one directory and the loop exits.
			continue
		}

		// The recursive flag is set, so load the full list of subdirectories
		// and ensure the loop will scan all of those directories as well.
		subDirPaths, err := r.managedSubDirectories(siaPath)
		if err != nil {
			context := fmt.Sprintf("unable to load list of subdirs for %v", siaPath)
			return errors.AddContext(err, context)
		}
		for _, subDir := range subDirPaths {
			dirPaths.PushBack(subDir)
		}
	}

	// Block until all updates are represented in the root aggregate metadata.
	r.staticDirUpdateBatcher.callFlush()
	return nil
}
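
// exampleRefreshDir is an illustrative sketch, not part of skyd, showing how
// a caller might use UpdateMetadata to force a directory's health into the
// root aggregate immediately instead of waiting for the repair loop. The
// function and parameter names here are hypothetical.
func exampleRefreshDir(r *Renter, dirPath skymodules.SiaPath, includeSubdirs bool) error {
	// Setting includeSubdirs walks every subdirectory beneath dirPath, so it
	// should be avoided on large trees, per the NOTE on UpdateMetadata.
	if err := r.UpdateMetadata(dirPath, includeSubdirs); err != nil {
		return errors.AddContext(err, "unable to refresh directory metadata")
	}
	// When UpdateMetadata returns, the flush has completed and the root
	// directory's aggregate metadata reflects this directory's update.
	return nil
}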