github.com/avahowell/sia@v0.5.1-beta.0.20160524050156-83dcc3d37c94/modules/renter/repair.go

package renter

import (
	"fmt"
	"io"
	"os"
	"time"

	"github.com/NebulousLabs/Sia/build"
	"github.com/NebulousLabs/Sia/modules"
	"github.com/NebulousLabs/Sia/modules/renter/contractor"
	"github.com/NebulousLabs/Sia/types"
)

const (
	// repairThreads is the number of repairs that can run concurrently.
	repairThreads = 10
)

// When a file contract is within 'renewThreshold' blocks of expiring, the renter
// will attempt to renew the contract.
var renewThreshold = func() types.BlockHeight {
	switch build.Release {
	case "testing":
		return 10
	case "dev":
		return 200
	default:
		return 144 * 7 * 3 // 3 weeks - soon to be 6 weeks.
	}
}()

// hostErr and hostErrs are helpers for reporting repair errors. The actual
// Error implementations aren't that important; we just need to be able to
// extract the NetAddress of the failed host.

type hostErr struct {
	host modules.NetAddress
	err  error
}

func (he hostErr) Error() string {
	return fmt.Sprintf("host %v failed: %v", he.host, he.err)
}

type hostErrs []*hostErr

func (hs hostErrs) Error() string {
	var errs []error
	for _, h := range hs {
		errs = append(errs, h)
	}
	return build.JoinErrors(errs, "\n").Error()
}

// repair attempts to repair a file chunk by uploading its pieces to more
// hosts.
func (f *file) repair(chunkIndex uint64, missingPieces []uint64, r io.ReaderAt, hosts []contractor.Editor) error {
	// read chunk data and encode
	chunk := make([]byte, f.chunkSize())
	_, err := r.ReadAt(chunk, int64(chunkIndex*f.chunkSize()))
	if err != nil && err != io.EOF && err != io.ErrUnexpectedEOF {
		return err
	}
	pieces, err := f.erasureCode.Encode(chunk)
	if err != nil {
		return err
	}
	// encrypt pieces
	for i := range pieces {
		key := deriveKey(f.masterKey, chunkIndex, uint64(i))
		pieces[i], err = key.EncryptBytes(pieces[i])
		if err != nil {
			return err
		}
	}
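	// Each piece is encrypted under its own key, derived from the file's
	// master key and the chunk and piece indices.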

	// upload one piece per host
	numPieces := len(missingPieces)
	if len(hosts) < numPieces {
		numPieces = len(hosts)
	}
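	// If there are fewer hosts than missing pieces, only the first len(hosts)
	// pieces are uploaded now; the remainder will be picked up by a later
	// repair iteration.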
	errChan := make(chan *hostErr)
	for i := 0; i < numPieces; i++ {
		go func(pieceIndex uint64, host contractor.Editor) {
			// upload data to host
			root, err := host.Upload(pieces[pieceIndex])
			if err != nil {
				errChan <- &hostErr{host.Address(), err}
				return
			}

			// create contract entry, if necessary
			f.mu.Lock()
			contract, ok := f.contracts[host.ContractID()]
			if !ok {
				contract = fileContract{
					ID:          host.ContractID(),
					IP:          host.Address(),
					WindowStart: host.EndHeight(),
				}
			}

			// update contract
			contract.Pieces = append(contract.Pieces, pieceData{
				Chunk:      chunkIndex,
				Piece:      pieceIndex,
				MerkleRoot: root,
			})
			f.contracts[host.ContractID()] = contract
			f.mu.Unlock()
			errChan <- nil
		}(missingPieces[i], hosts[i])
	}
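	// Every goroutine sends exactly one value on errChan (nil on success), so
	// draining numPieces values below collects all results without leaking
	// goroutines.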
	var errs hostErrs
	for i := 0; i < numPieces; i++ {
		err := <-errChan
		if err != nil {
			errs = append(errs, err)
		}
	}
	if errs != nil {
		return errs
	}

	return nil
}

// incompleteChunks returns a map from each chunk index to the indices of its
// pieces that have not yet been uploaded.
func (f *file) incompleteChunks() map[uint64][]uint64 {
	f.mu.RLock()
	defer f.mu.RUnlock()

	present := make([][]bool, f.numChunks())
	for i := range present {
		present[i] = make([]bool, f.erasureCode.NumPieces())
	}
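	// present[chunk][piece] records whether that piece appears in any of the
	// file's contracts.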
	for _, fc := range f.contracts {
		for _, p := range fc.Pieces {
			present[p.Chunk][p.Piece] = true
		}
	}

	incomplete := make(map[uint64][]uint64)
	for chunkIndex, pieceBools := range present {
		for pieceIndex, ok := range pieceBools {
			if !ok {
				incomplete[uint64(chunkIndex)] = append(incomplete[uint64(chunkIndex)], uint64(pieceIndex))
			}
		}
	}
	return incomplete
}

// chunkHosts returns the hosts storing the given chunk.
func (f *file) chunkHosts(chunk uint64) []modules.NetAddress {
	f.mu.RLock()
	defer f.mu.RUnlock()

	var old []modules.NetAddress
	for _, fc := range f.contracts {
		for _, p := range fc.Pieces {
			if p.Chunk == chunk {
				old = append(old, fc.IP)
				break
			}
		}
	}
	return old
}

// expiringContracts returns the contracts that will expire soon.
// TODO: what if contract has fully expired?
func (f *file) expiringContracts(height types.BlockHeight) []fileContract {
	f.mu.RLock()
	defer f.mu.RUnlock()

	var expiring []fileContract
	for _, fc := range f.contracts {
		if height >= fc.WindowStart-renewThreshold {
			expiring = append(expiring, fc)
		}
	}
	return expiring
}

// offlineChunks returns the chunks belonging to "offline" hosts -- hosts that
// do not meet uptime requirements. Importantly, only chunks missing more than
// half their redundancy are returned.
func (f *file) offlineChunks(hdb hostDB) map[uint64][]uint64 {
	f.mu.RLock()
	defer f.mu.RUnlock()

	// mark all pieces belonging to offline hosts.
	offline := make(map[uint64][]uint64)
	for _, fc := range f.contracts {
		if hdb.IsOffline(fc.IP) {
			for _, p := range fc.Pieces {
				offline[p.Chunk] = append(offline[p.Chunk], p.Piece)
			}
		}
	}
	// filter out chunks that are not missing more than half of their redundancy
	filtered := make(map[uint64][]uint64)
	for chunk, pieces := range offline {
		if len(pieces) > f.erasureCode.NumPieces()/2 {
			filtered[chunk] = pieces
		}
	}
	return filtered
}

// threadedRepairLoop improves the health of files tracked by the renter by
// reuploading their missing pieces. Multiple repair attempts may be necessary
// before the file reaches full redundancy.
func (r *Renter) threadedRepairLoop() {
	for {
		time.Sleep(5 * time.Second)

		if !r.wallet.Unlocked() {
			continue
		}

		if len(r.hostContractor.Contracts()) == 0 {
			// nothing to revise
			continue
		}

		// make copy of repair set under lock
		repairing := make(map[string]trackedFile)
		id := r.mu.RLock()
		for name, meta := range r.tracking {
			repairing[name] = meta
		}
		r.mu.RUnlock(id)
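		// Repairs are run against this snapshot so that the renter lock is
		// not held while files are being repaired.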

		// create host pool
		pool := r.newHostPool()
		for name, meta := range repairing {
			r.threadedRepairFile(name, meta, pool)
		}
		pool.Close() // heh
	}
}

// threadedRepairFile repairs and saves an individual file.
func (r *Renter) threadedRepairFile(name string, meta trackedFile, pool *hostPool) {
	// helper function
	logAndRemove := func(fmt string, args ...interface{}) {
		r.log.Printf(fmt, args...)
		id := r.mu.Lock()
		delete(r.tracking, name)
		r.save()
		r.mu.Unlock(id)
	}

	id := r.mu.RLock()
	f, ok := r.files[name]
	r.mu.RUnlock(id)
	if !ok {
		logAndRemove("removing %v from repair set: no longer tracking that file", name)
		return
	}

	// determine if there is any work to do
	incChunks := f.incompleteChunks()
	if len(incChunks) == 0 {
		return
	}

	// open file handle
	handle, err := os.Open(meta.RepairPath)
	if err != nil {
		logAndRemove("removing %v from repair set: %v", name, err)
		return
	}
	defer handle.Close()

	// repair incomplete chunks
	if len(incChunks) != 0 {
		r.log.Printf("repairing %v chunks of %v", len(incChunks), f.name)
		r.repairChunks(f, handle, incChunks, pool)
	}
}

// repairChunks uploads the missing pieces of the given chunks of f to new hosts.
func (r *Renter) repairChunks(f *file, handle io.ReaderAt, chunks map[uint64][]uint64, pool *hostPool) {
	for chunk, pieces := range chunks {
		// Determine host set. We want one host for each missing piece, and no
		// repeats of other hosts of this chunk.
		hosts := pool.uniqueHosts(len(pieces), f.chunkHosts(chunk))
		if len(hosts) == 0 {
			r.log.Debugf("aborting repair of %v: host pool is empty", f.name)
			return
		}
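		// If fewer unique hosts are available than missing pieces, repair
		// still proceeds; f.repair uploads only as many pieces as there are
		// hosts.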
		// upload to new hosts
		err := f.repair(chunk, pieces, handle, hosts)
		if err != nil {
			if he, ok := err.(hostErrs); ok {
				// if a specific host failed, remove it from the pool
				for _, h := range he {
					// only log non-graceful errors
					if h.err != modules.ErrStopResponse {
						r.log.Printf("failed to upload to host %v: %v", h.host, h.err)
					}
					pool.remove(h.host)
				}
			} else {
				// any other type of error indicates a serious problem
				r.log.Printf("aborting repair of %v: %v", f.name, err)
				return
			}
		}

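		// Even when some hosts failed, the pieces that did upload were
		// already recorded in f.contracts, so saving below persists the
		// partial progress.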
		// save the new contract
		f.mu.RLock()
		err = r.saveFile(f)
		f.mu.RUnlock()
		if err != nil {
			// If saving failed for this chunk, it will probably fail for the
			// next chunk as well. Better to try again on the next cycle.
			r.log.Printf("failed to save repaired file %v: %v", f.name, err)
			return
		}
	}
}