github.com/keybase/client/go@v0.0.0-20240309051027-028f7c731f8b/kbfs/simplefs/archive.go

// Copyright 2024 Keybase, Inc. All rights reserved. Use of
// this source code is governed by the included BSD license.

package simplefs

import (
	"archive/zip"
	"bytes"
	"compress/gzip"
	"crypto/sha256"
	"encoding/hex"
	"encoding/json"
	"fmt"
	"hash"
	"io"
	"io/fs"
	"os"
	"path/filepath"
	"sort"
	"sync"
	"time"

	"github.com/keybase/client/go/protocol/keybase1"
	"github.com/pkg/errors"
	"golang.org/x/net/context"
	"gopkg.in/src-d/go-billy.v4"
)

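// loadArchiveStateFromJsonGz reads the persisted archive state, which is
// stored as gzip-compressed JSON (keybase1.SimpleFSArchiveState). As a
// sketch, the file can be inspected by hand with standard tools, e.g.:
//
//	zcat kbfs-archive-<username>.json.gz | jq .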
func loadArchiveStateFromJsonGz(ctx context.Context, simpleFS *SimpleFS, filePath string) (state *keybase1.SimpleFSArchiveState, err error) {
	f, err := os.Open(filePath)
	if err != nil {
		simpleFS.log.CErrorf(ctx, "loadArchiveStateFromJsonGz: opening state file error: %v", err)
		return nil, err
	}
	defer f.Close()
	gzReader, err := gzip.NewReader(f)
	if err != nil {
		simpleFS.log.CErrorf(ctx, "loadArchiveStateFromJsonGz: creating gzip reader error: %v", err)
		return nil, err
	}
	defer gzReader.Close()
	decoder := json.NewDecoder(gzReader)
	err = decoder.Decode(&state)
	if err != nil {
		simpleFS.log.CErrorf(ctx, "loadArchiveStateFromJsonGz: decoding state file error: %v", err)
		return nil, err
	}
	return state, nil
}

func writeArchiveStateIntoJsonGz(ctx context.Context, simpleFS *SimpleFS, filePath string, s *keybase1.SimpleFSArchiveState) (err error) {
	err = os.MkdirAll(filepath.Dir(filePath), 0755)
	if err != nil {
		simpleFS.log.CErrorf(ctx, "writeArchiveStateIntoJsonGz: os.MkdirAll error: %v", err)
		return err
	}
	f, err := os.Create(filePath)
	if err != nil {
		simpleFS.log.CErrorf(ctx, "writeArchiveStateIntoJsonGz: creating state file error: %v", err)
		return err
	}
	defer func() {
		closeErr := f.Close()
		if err == nil {
			err = closeErr
		}
	}()

	gzWriter := gzip.NewWriter(f)
	// Propagate Close errors: gzip.Writer.Close flushes buffered data, so an
	// error here can mean a truncated state file.
	defer func() {
		closeErr := gzWriter.Close()
		if err == nil {
			err = closeErr
		}
	}()

	encoder := json.NewEncoder(gzWriter)
	err = encoder.Encode(s)
	if err != nil {
		simpleFS.log.CErrorf(ctx, "writeArchiveStateIntoJsonGz: encoding state file error: %v", err)
		return err
	}

	return nil
}

type errorState struct {
	err       error
	nextRetry time.Time
}

type archiveManager struct {
	simpleFS *SimpleFS

	// Just use a regular mutex rather than a rw one so all writes to
	// persistent storage are synchronized.
	mu               sync.Mutex
	state            *keybase1.SimpleFSArchiveState
	jobCtxCancellers map[string]func()
	// jobID -> errorState. Populated when an error has happened. It's only
	// valid for these phases:
	//
	//   keybase1.SimpleFSArchiveJobPhase_Indexing
	//   keybase1.SimpleFSArchiveJobPhase_Copying
	//   keybase1.SimpleFSArchiveJobPhase_Zipping
	//
	// When nextRetry is due, errorRetryWorker deletes the errorState from
	// this map, while also putting the job back into the previous phase so
	// the relevant worker can pick it up again.
	errors map[string]errorState

	indexingWorkerSignal chan struct{}
	copyingWorkerSignal  chan struct{}
	zippingWorkerSignal  chan struct{}

	ctxCancel func()
}

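// A job moves through the phases below, each handled by a dedicated worker:
//
//	Queued -> Indexing -> Indexed -> Copying -> Copied -> Zipping -> Done
//
// On a failure during Indexing, Copying, or Zipping the job is parked in the
// errors map above, and errorRetryWorker later moves it back to Queued,
// Indexed, or Copied respectively so the relevant worker retries it.

// getStateFilePath returns the per-user state file location. For a user
// "alice" this yields something like <cacheDir>/kbfs-archive-alice.json.gz,
// where the cache directory comes from simpleFS.getCacheDir() and is
// platform dependent.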
func getStateFilePath(simpleFS *SimpleFS) string {
	username := simpleFS.config.KbEnv().GetUsername()
	cacheDir := simpleFS.getCacheDir()
	return filepath.Join(cacheDir, fmt.Sprintf("kbfs-archive-%s.json.gz", username))
}

func (m *archiveManager) flushStateFileLocked(ctx context.Context) error {
	select {
	case <-ctx.Done():
		return ctx.Err()
	default:
	}
	err := writeArchiveStateIntoJsonGz(ctx, m.simpleFS, getStateFilePath(m.simpleFS), m.state)
	if err != nil {
		m.simpleFS.log.CErrorf(ctx,
			"archiveManager.flushStateFileLocked: writing state file error: %v", err)
		return err
	}
	return nil
}

func (m *archiveManager) flushStateFile(ctx context.Context) error {
	m.mu.Lock()
	defer m.mu.Unlock()
	return m.flushStateFileLocked(ctx)
}

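// signal performs a non-blocking send. Each signal channel has capacity 1
// (see newArchiveManager), so a single pending token coalesces any number of
// signals; a worker that consumes the token rescans all jobs, which makes a
// dropped send here harmless.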
func (m *archiveManager) signal(ch chan struct{}) {
	select {
	case ch <- struct{}{}:
	default:
		// There's already a signal in the chan; skip this one.
	}
}

func (m *archiveManager) shutdown(ctx context.Context) {
	// OK to cancel before flushStateFileLocked because we'll pass in the
	// shutdown ctx there.
	if m.ctxCancel != nil {
		m.ctxCancel()
	}

	m.mu.Lock()
	defer m.mu.Unlock()
	err := m.flushStateFileLocked(ctx)
	if err != nil {
		m.simpleFS.log.CWarningf(ctx, "m.flushStateFileLocked error: %v", err)
	}
}

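// startJob queues a new archive job. A caller-side sketch (field names as
// used elsewhere in this file; constructing KbfsPathWithRevision is elided):
//
//	err := m.startJob(ctx, keybase1.SimpleFSArchiveJobDesc{
//		JobID:                jobID,
//		KbfsPathWithRevision: pathWithRev,
//		StagingPath:          stagingPath, // scratch space; removed on dismiss
//		TargetName:           "myfolder-archive", // top-level dir inside the zip
//		ZipFilePath:          zipPath,
//		OverwriteZip:         false, // zip is created with O_EXCL unless set
//	})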
func (m *archiveManager) startJob(ctx context.Context, job keybase1.SimpleFSArchiveJobDesc) error {
	m.simpleFS.log.CDebugf(ctx, "+ archiveManager.startJob %#+v", job)
	defer m.simpleFS.log.CDebugf(ctx, "- archiveManager.startJob")

	m.mu.Lock()
	defer m.mu.Unlock()
	if _, ok := m.state.Jobs[job.JobID]; ok {
		return errors.New("job ID already exists")
	}
	m.state.Jobs[job.JobID] = keybase1.SimpleFSArchiveJobState{
		Desc:  job,
		Phase: keybase1.SimpleFSArchiveJobPhase_Queued,
	}
	m.state.LastUpdated = keybase1.ToTime(time.Now())
	m.signal(m.indexingWorkerSignal)
	return m.flushStateFileLocked(ctx)
}

func (m *archiveManager) cancelOrDismissJob(ctx context.Context,
	jobID string) (err error) {
	m.simpleFS.log.CDebugf(ctx, "+ archiveManager.cancelOrDismissJob %s", jobID)
	defer m.simpleFS.log.CDebugf(ctx, "- archiveManager.cancelOrDismissJob %s", jobID)
	m.mu.Lock()
	defer m.mu.Unlock()

	if cancel, ok := m.jobCtxCancellers[jobID]; ok {
		cancel()
		delete(m.jobCtxCancellers, jobID)
	}

	job, ok := m.state.Jobs[jobID]
	if !ok {
		return errors.New("job not found")
	}
	delete(m.state.Jobs, jobID)

	err = os.RemoveAll(job.Desc.StagingPath)
	if err != nil {
		m.simpleFS.log.CWarningf(ctx, "removing staging path %q for job %s error: %v",
			job.Desc.StagingPath, jobID, err)
	}

	return nil
}

func (m *archiveManager) getCurrentState(ctx context.Context) (
	state keybase1.SimpleFSArchiveState, errorStates map[string]errorState) {
	m.simpleFS.log.CDebugf(ctx, "+ archiveManager.getCurrentState")
	defer m.simpleFS.log.CDebugf(ctx, "- archiveManager.getCurrentState")
	m.mu.Lock()
	defer m.mu.Unlock()
	errorStates = make(map[string]errorState)
	for jobID, errState := range m.errors {
		errorStates[jobID] = errState
	}
	return m.state.DeepCopy(), errorStates
}

func (m *archiveManager) changeJobPhaseLocked(ctx context.Context,
	jobID string, newPhase keybase1.SimpleFSArchiveJobPhase) {
	copy, ok := m.state.Jobs[jobID]
	if !ok {
		m.simpleFS.log.CWarningf(ctx, "job %s not found; it might have been canceled", jobID)
		return
	}
	copy.Phase = newPhase
	m.state.Jobs[jobID] = copy
}

func (m *archiveManager) changeJobPhase(ctx context.Context,
	jobID string, newPhase keybase1.SimpleFSArchiveJobPhase) {
	m.mu.Lock()
	defer m.mu.Unlock()
	m.changeJobPhaseLocked(ctx, jobID, newPhase)
}

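// startWorkerTask is how a worker claims work: under the lock it finds one
// job in eligiblePhase, advances it to newPhase, and registers a canceller
// so cancelOrDismissJob can abort the task mid-flight.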
func (m *archiveManager) startWorkerTask(ctx context.Context,
	eligiblePhase keybase1.SimpleFSArchiveJobPhase,
	newPhase keybase1.SimpleFSArchiveJobPhase) (jobID string, jobCtx context.Context, ok bool) {
	jobCtx, cancel := context.WithCancel(ctx)
	m.mu.Lock()
	defer m.mu.Unlock()
	for jobID := range m.state.Jobs {
		if m.state.Jobs[jobID].Phase == eligiblePhase {
			m.changeJobPhaseLocked(ctx, jobID, newPhase)
			m.jobCtxCancellers[jobID] = cancel
			return jobID, jobCtx, true
		}
	}
	// No eligible job found; release the derived context so it isn't leaked.
	cancel()
	return "", nil, false
}

const archiveErrorRetryDuration = time.Minute

func (m *archiveManager) setJobError(
	ctx context.Context, jobID string, err error) {
	m.mu.Lock()
	defer m.mu.Unlock()
	nextRetry := time.Now().Add(archiveErrorRetryDuration)
	m.simpleFS.log.CErrorf(ctx, "job %s nextRetry: %s", jobID, nextRetry)
	m.errors[jobID] = errorState{
		err:       err,
		nextRetry: nextRetry,
	}
}

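// doIndexing builds the job's manifest by listing the archived KBFS path
// recursively. It also sums the sizes of regular (and executable) files into
// BytesTotal so copying progress can be reported against it.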
func (m *archiveManager) doIndexing(ctx context.Context, jobID string) (err error) {
	m.simpleFS.log.CDebugf(ctx, "+ doIndexing %s", jobID)
	defer func() { m.simpleFS.log.CDebugf(ctx, "- doIndexing %s err: %v", jobID, err) }()

	jobDesc := func() keybase1.SimpleFSArchiveJobDesc {
		m.mu.Lock()
		defer m.mu.Unlock()
		return m.state.Jobs[jobID].Desc
	}()
	opid, err := m.simpleFS.SimpleFSMakeOpid(ctx)
	if err != nil {
		return err
	}
	defer m.simpleFS.SimpleFSClose(ctx, opid)
	filter := keybase1.ListFilter_NO_FILTER
	err = m.simpleFS.SimpleFSListRecursive(ctx, keybase1.SimpleFSListRecursiveArg{
		OpID:   opid,
		Path:   keybase1.NewPathWithKbfsArchived(jobDesc.KbfsPathWithRevision),
		Filter: filter,
	})
	if err != nil {
		return err
	}
	err = m.simpleFS.SimpleFSWait(ctx, opid)
	if err != nil {
		return err
	}

	listResult, err := m.simpleFS.SimpleFSReadList(ctx, opid)
	if err != nil {
		return err
	}

	var bytesTotal int64
	manifest := make(map[string]keybase1.SimpleFSArchiveFile)
	for _, e := range listResult.Entries {
		manifest[e.Name] = keybase1.SimpleFSArchiveFile{
			State:      keybase1.SimpleFSFileArchiveState_ToDo,
			DirentType: e.DirentType,
		}
		if e.DirentType == keybase1.DirentType_FILE ||
			e.DirentType == keybase1.DirentType_EXEC {
			bytesTotal += int64(e.Size)
		}
	}

	func() {
		m.mu.Lock()
		defer m.mu.Unlock()

		jobCopy, ok := m.state.Jobs[jobID]
		if !ok {
			m.simpleFS.log.CWarningf(ctx, "job %s not found; it might have been canceled", jobID)
			return
		}
		jobCopy.Manifest = manifest
		jobCopy.BytesTotal = bytesTotal
		m.state.Jobs[jobID] = jobCopy
	}()
	return nil
}

func (m *archiveManager) indexingWorker(ctx context.Context) {
	for {
		select {
		case <-ctx.Done():
			return
		case <-m.indexingWorkerSignal:
		}

		jobID, jobCtx, ok := m.startWorkerTask(ctx,
			keybase1.SimpleFSArchiveJobPhase_Queued,
			keybase1.SimpleFSArchiveJobPhase_Indexing)

		if !ok {
			continue
		}
		// We got a task. Put another token into the signal channel so we
		// check again on the next iteration.
		m.signal(m.indexingWorkerSignal)

		m.simpleFS.log.CDebugf(ctx, "indexing: %s", jobID)

		err := m.doIndexing(jobCtx, jobID)
		if err == nil {
			m.simpleFS.log.CDebugf(jobCtx, "indexing done on job %s", jobID)
			m.changeJobPhase(jobCtx, jobID, keybase1.SimpleFSArchiveJobPhase_Indexed)
			m.signal(m.copyingWorkerSignal) // Done indexing! Notify the copying worker.
		} else {
			m.simpleFS.log.CErrorf(jobCtx, "indexing error on job %s: %v", jobID, err)
			m.setJobError(ctx, jobID, err)
		}

		err = m.flushStateFile(ctx)
		if err != nil {
			m.simpleFS.log.CWarningf(ctx, "m.flushStateFile error: %v", err)
		}
	}
}

type sha256TeeReader struct {
	inner          io.Reader
	innerTeeReader io.Reader
	h              hash.Hash
}

var _ io.Reader = (*sha256TeeReader)(nil)

// Read implements the io.Reader interface.
func (r *sha256TeeReader) Read(p []byte) (n int, err error) {
	return r.innerTeeReader.Read(p)
}

func (r *sha256TeeReader) getSum() []byte {
	return r.h.Sum(nil)
}

func newSHA256TeeReader(inner io.Reader) (r *sha256TeeReader) {
	r = &sha256TeeReader{
		inner: inner,
		h:     sha256.New(),
	}
	r.innerTeeReader = io.TeeReader(r.inner, r.h)
	return r
}

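// A minimal usage sketch for sha256TeeReader (reader and writer names are
// illustrative): wrap the source, copy through the wrapper, then collect the
// digest of everything that was read.
//
//	tee := newSHA256TeeReader(src)
//	if _, err := io.Copy(dst, tee); err != nil {
//		return err
//	}
//	sum := tee.getSum() // SHA-256 of all bytes read through tee
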
type bytesUpdaterFunc = func(delta int64)

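// ctxAwareCopy copies from one reader to a writer in 64 KiB chunks, checking
// ctx between chunks so a canceled job stops promptly rather than only after
// a whole file, and reporting progress through bytesUpdater as it goes.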
func ctxAwareCopy(
	ctx context.Context, to io.Writer, from io.Reader,
	bytesUpdater bytesUpdaterFunc) error {
	for {
		select {
		case <-ctx.Done():
			return ctx.Err()
		default:
		}
		n, err := io.CopyN(to, from, 64*1024)
		switch err {
		case nil:
			bytesUpdater(n)
		case io.EOF:
			bytesUpdater(n)
			return nil
		default:
			return err
		}
	}
}

func (m *archiveManager) copyFileFromBeginning(ctx context.Context,
	srcDirFS billy.Filesystem, entryPathWithinJob string,
	localPath string, mode os.FileMode,
	bytesCopiedUpdater bytesUpdaterFunc) (sha256Sum []byte, err error) {
	m.simpleFS.log.CDebugf(ctx, "+ copyFileFromBeginning %s", entryPathWithinJob)
	defer func() { m.simpleFS.log.CDebugf(ctx, "- copyFileFromBeginning %s err: %v", entryPathWithinJob, err) }()

	src, err := srcDirFS.Open(entryPathWithinJob)
	if err != nil {
		return nil, fmt.Errorf("srcDirFS.Open(%s) error: %v", entryPathWithinJob, err)
	}
	defer src.Close()

	dst, err := os.OpenFile(localPath, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, mode)
	if err != nil {
		return nil, fmt.Errorf("os.OpenFile(%s) error: %v", localPath, err)
	}
	defer dst.Close()

	teeReader := newSHA256TeeReader(src)

	err = ctxAwareCopy(ctx, dst, teeReader, bytesCopiedUpdater)
	if err != nil {
		return nil, fmt.Errorf("[%s] io.CopyN error: %v", entryPathWithinJob, err)
	}

	// We didn't continue from a previously interrupted copy, so don't
	// bother verifying the sha256sum and just return it.
	return teeReader.getSum(), nil
}

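// copyFilePickupPrevious resumes a copy that a previous run left unfinished:
// it appends everything from srcSeekOffset onward, then re-hashes both the
// source and the destination. On a digest mismatch it discards the counted
// bytes and falls back to copyFileFromBeginning.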
func (m *archiveManager) copyFilePickupPrevious(ctx context.Context,
	srcDirFS billy.Filesystem, entryPathWithinJob string,
	localPath string, srcSeekOffset int64, mode os.FileMode,
	bytesCopiedUpdater bytesUpdaterFunc) (sha256Sum []byte, err error) {
	m.simpleFS.log.CDebugf(ctx, "+ copyFilePickupPrevious %s", entryPathWithinJob)
	defer func() { m.simpleFS.log.CDebugf(ctx, "- copyFilePickupPrevious %s err: %v", entryPathWithinJob, err) }()

	src, err := srcDirFS.Open(entryPathWithinJob)
	if err != nil {
		return nil, fmt.Errorf("srcDirFS.Open(%s) error: %v", entryPathWithinJob, err)
	}
	defer src.Close()

	_, err = src.Seek(srcSeekOffset, io.SeekStart)
	if err != nil {
		return nil, fmt.Errorf("[%s] src.Seek error: %v", entryPathWithinJob, err)
	}

	// Copy the file.
	if err = func() error {
		dst, err := os.OpenFile(localPath, os.O_APPEND|os.O_WRONLY|os.O_CREATE, mode)
		if err != nil {
			return fmt.Errorf("os.OpenFile(%s) error: %v", localPath, err)
		}
		defer dst.Close()

		err = ctxAwareCopy(ctx, dst, src, bytesCopiedUpdater)
		if err != nil {
			return fmt.Errorf("[%s] io.CopyN error: %v", entryPathWithinJob, err)
		}

		return nil
	}(); err != nil {
		return nil, err
	}

	var size int64
	// Calculate sha256 and check the sha256 of the copied file since we
	// continued from a previously interrupted copy.
	srcSHA256Sum, dstSHA256Sum, err := func() (srcSHA256Sum, dstSHA256Sum []byte, err error) {
		_, err = src.Seek(0, io.SeekStart)
		if err != nil {
			return nil, nil, fmt.Errorf("[%s] src.Seek error: %v", entryPathWithinJob, err)
		}
		srcSHA256SumHasher := sha256.New()
		size, err = io.Copy(srcSHA256SumHasher, src)
		if err != nil {
			return nil, nil, fmt.Errorf("[%s] io.Copy error: %v", entryPathWithinJob, err)
		}
		srcSHA256Sum = srcSHA256SumHasher.Sum(nil)

		dst, err := os.Open(localPath)
		if err != nil {
			return nil, nil, fmt.Errorf("os.Open(%s) error: %v", localPath, err)
		}
		defer dst.Close()
		dstSHA256SumHasher := sha256.New()
		_, err = io.Copy(dstSHA256SumHasher, dst)
		if err != nil {
			return nil, nil, fmt.Errorf("[%s] io.Copy error: %v", entryPathWithinJob, err)
		}
		dstSHA256Sum = dstSHA256SumHasher.Sum(nil)

		return srcSHA256Sum, dstSHA256Sum, nil
	}()
	if err != nil {
		return nil, err
	}

	if !bytes.Equal(srcSHA256Sum, dstSHA256Sum) {
		m.simpleFS.log.CInfof(ctx,
			"file corruption detected from a previous copy of %s; will copy from the beginning",
			entryPathWithinJob)
		bytesCopiedUpdater(-size)
		return m.copyFileFromBeginning(ctx, srcDirFS, entryPathWithinJob, localPath, mode, bytesCopiedUpdater)
	}

	return srcSHA256Sum, nil
}

func (m *archiveManager) copyFile(ctx context.Context,
	srcDirFS billy.Filesystem, entryPathWithinJob string,
	localPath string, srcSeekOffset int64, mode os.FileMode,
	bytesCopiedUpdater bytesUpdaterFunc) (sha256Sum []byte, err error) {
	if srcSeekOffset == 0 {
		return m.copyFileFromBeginning(ctx, srcDirFS, entryPathWithinJob, localPath, mode, bytesCopiedUpdater)
	}
	return m.copyFilePickupPrevious(ctx, srcDirFS, entryPathWithinJob, localPath, srcSeekOffset, mode, bytesCopiedUpdater)
}

func getWorkspaceDir(jobDesc keybase1.SimpleFSArchiveJobDesc) string {
	return filepath.Join(jobDesc.StagingPath, "workspace")
}

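// doCopying materializes the job's manifest under the workspace directory:
// directories are recreated, in-tree symlinks are reproduced (after a Stat
// check that the target doesn't escape srcDirFS), and regular files are
// copied with resume support based on the destination size left over from a
// previous attempt.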
func (m *archiveManager) doCopying(ctx context.Context, jobID string) (err error) {
	m.simpleFS.log.CDebugf(ctx, "+ doCopying %s", jobID)
	defer func() { m.simpleFS.log.CDebugf(ctx, "- doCopying %s err: %v", jobID, err) }()

	desc, manifest := func() (keybase1.SimpleFSArchiveJobDesc, map[string]keybase1.SimpleFSArchiveFile) {
		m.mu.Lock()
		defer m.mu.Unlock()
		manifest := make(map[string]keybase1.SimpleFSArchiveFile)
		for k, v := range m.state.Jobs[jobID].Manifest {
			manifest[k] = v.DeepCopy()
		}
		return m.state.Jobs[jobID].Desc, manifest
	}()

	updateManifest := func(manifest map[string]keybase1.SimpleFSArchiveFile) {
		m.mu.Lock()
		defer m.mu.Unlock()
		// Can overwrite directly since only one worker can work on a given job at a time.
		job := m.state.Jobs[jobID]
		for k, v := range manifest {
			job.Manifest[k] = v.DeepCopy()
		}
		m.state.Jobs[jobID] = job
	}

	updateBytesCopied := func(delta int64) {
		m.mu.Lock()
		defer m.mu.Unlock()
		// Can overwrite directly since only one worker can work on a given job at a time.
		job := m.state.Jobs[jobID]
		job.BytesCopied += delta
		m.state.Jobs[jobID] = job
	}

	srcContainingDirFS, finalElem, err := m.simpleFS.getFSIfExists(ctx,
		keybase1.NewPathWithKbfsArchived(desc.KbfsPathWithRevision))
	if err != nil {
		return fmt.Errorf("getFSIfExists error: %v", err)
	}
	srcDirFS, err := srcContainingDirFS.Chroot(finalElem)
	if err != nil {
		return fmt.Errorf("srcContainingDirFS.Chroot error: %v", err)
	}
	dstBase := filepath.Join(getWorkspaceDir(desc), desc.TargetName)

	entryPaths := make([]string, 0, len(manifest))
	for entryPathWithinJob := range manifest {
		entryPaths = append(entryPaths, entryPathWithinJob)
	}
	sort.Strings(entryPaths)

loopEntryPaths:
	for _, entryPathWithinJob := range entryPaths {
		entry := manifest[entryPathWithinJob]
		entry.State = keybase1.SimpleFSFileArchiveState_InProgress
		manifest[entryPathWithinJob] = entry
		updateManifest(manifest)

		localPath := filepath.Join(dstBase, entryPathWithinJob)
		srcFI, err := srcDirFS.Lstat(entryPathWithinJob)
		if err != nil {
			return fmt.Errorf("srcDirFS.Lstat(%s) error: %v", entryPathWithinJob, err)
		}
		switch {
		case srcFI.IsDir():
			err = os.MkdirAll(localPath, 0755)
			if err != nil {
				return fmt.Errorf("os.MkdirAll(%s) error: %v", localPath, err)
			}
			err = os.Chtimes(localPath, time.Time{}, srcFI.ModTime())
			if err != nil {
				return fmt.Errorf("os.Chtimes(%s) error: %v", localPath, err)
			}
			entry.State = keybase1.SimpleFSFileArchiveState_Complete
			manifest[entryPathWithinJob] = entry
		case srcFI.Mode()&os.ModeSymlink != 0: // symlink
			err = os.MkdirAll(filepath.Dir(localPath), 0755)
			if err != nil {
				return fmt.Errorf("os.MkdirAll(filepath.Dir(%s)) error: %v", localPath, err)
			}
			// Call Stat, which follows symlinks, to make sure the link doesn't
			// escape outside the srcDirFS.
			_, err = srcDirFS.Stat(entryPathWithinJob)
			if err != nil {
				m.simpleFS.log.CWarningf(ctx, "skipping %s due to srcDirFS.Stat error: %v", entryPathWithinJob, err)
				entry.State = keybase1.SimpleFSFileArchiveState_Skipped
				manifest[entryPathWithinJob] = entry
				continue loopEntryPaths
			}

			link, err := srcDirFS.Readlink(entryPathWithinJob)
			if err != nil {
				return fmt.Errorf("srcDirFS.Readlink(%s) error: %v", entryPathWithinJob, err)
			}
			m.simpleFS.log.CInfof(ctx, "calling os.Symlink(%s, %s) ", link, localPath)
			err = os.Symlink(link, localPath)
			if err != nil {
				return fmt.Errorf("os.Symlink(%s, %s) error: %v", link, localPath, err)
			}
			// Skipping Chtimes because there doesn't seem to be a way to
			// change times on symlinks.
			entry.State = keybase1.SimpleFSFileArchiveState_Complete
			manifest[entryPathWithinJob] = entry
		default:
			err = os.MkdirAll(filepath.Dir(localPath), 0755)
			if err != nil {
				return fmt.Errorf("os.MkdirAll(filepath.Dir(%s)) error: %v", localPath, err)
			}

			var mode os.FileMode = 0644
			if srcFI.Mode()&0100 != 0 {
				mode = 0755
			}

			seek := int64(0)

			dstFI, err := os.Lstat(localPath)
			switch {
			case os.IsNotExist(err): // simple copy from the start of file
			case err == nil: // continue from a previously interrupted copy
				if srcFI.Mode()&os.ModeSymlink == 0 {
					seek = dstFI.Size()
				}
				// otherwise copy from the start of file
			default:
				return fmt.Errorf("os.Lstat(%s) error: %v", localPath, err)
			}

			sha256Sum, err := m.copyFile(ctx,
				srcDirFS, entryPathWithinJob, localPath, seek, mode, updateBytesCopied)
			if err != nil {
				return err
			}

			err = os.Chtimes(localPath, time.Time{}, srcFI.ModTime())
			if err != nil {
				return fmt.Errorf("os.Chtimes(%s) error: %v", localPath, err)
			}

			entry.Sha256SumHex = hex.EncodeToString(sha256Sum)
			entry.State = keybase1.SimpleFSFileArchiveState_Complete
			manifest[entryPathWithinJob] = entry
		}
		updateManifest(manifest)
	}

	return nil
}

func (m *archiveManager) copyingWorker(ctx context.Context) {
	for {
		select {
		case <-ctx.Done():
			return
		case <-m.copyingWorkerSignal:
		}

		jobID, jobCtx, ok := m.startWorkerTask(ctx,
			keybase1.SimpleFSArchiveJobPhase_Indexed,
			keybase1.SimpleFSArchiveJobPhase_Copying)

		if !ok {
			continue
		}
		// We got a task. Put another token into the signal channel so we
		// check again on the next iteration.
		m.signal(m.copyingWorkerSignal)

		m.simpleFS.log.CDebugf(ctx, "copying: %s", jobID)

		err := m.doCopying(jobCtx, jobID)
		if err == nil {
			m.simpleFS.log.CDebugf(jobCtx, "copying done on job %s", jobID)
			m.changeJobPhase(jobCtx, jobID, keybase1.SimpleFSArchiveJobPhase_Copied)
			m.signal(m.zippingWorkerSignal) // Done copying! Notify the zipping worker.
		} else {
			m.simpleFS.log.CErrorf(jobCtx, "copying error on job %s: %v", jobID, err)
			m.setJobError(ctx, jobID, err)
		}

		err = m.flushStateFile(ctx)
		if err != nil {
			m.simpleFS.log.CWarningf(ctx, "m.flushStateFile error: %v", err)
		}
	}
}

// zipWriterAddDir is adapted from zip.Writer.AddFS in the go1.22.0 source
// because 1) we're not on a Go version with that function yet; 2) Go's AddFS
// doesn't support symlinks; and 3) we need bytesZippedUpdater here, which
// means copying with CopyN.
func zipWriterAddDir(ctx context.Context,
	w *zip.Writer, dirPath string, bytesZippedUpdater bytesUpdaterFunc) error {
	fsys := os.DirFS(dirPath)
	return fs.WalkDir(fsys, ".", func(name string, d fs.DirEntry, err error) error {
		if err != nil {
			return err
		}
		if d.IsDir() {
			return nil
		}
		info, err := d.Info()
		if err != nil {
			return err
		}
		if !(info.Mode() &^ fs.ModeSymlink).IsRegular() {
			return errors.New("zip: cannot add non-regular file except symlink")
		}
		h, err := zip.FileInfoHeader(info)
		if err != nil {
			return err
		}
		h.Name = name
		h.Method = zip.Deflate
		fw, err := w.CreateHeader(h)
		if err != nil {
			return err
		}
		switch {
		case info.Mode()&fs.ModeSymlink != 0:
			// Store the symlink using the common zip convention: the entry
			// keeps the symlink mode bits from FileInfoHeader and its
			// contents are the slash-separated link target.
			target, err := os.Readlink(filepath.Join(dirPath, name))
			if err != nil {
				return err
			}
			_, err = fw.Write([]byte(filepath.ToSlash(target)))
			if err != nil {
				return err
			}
			return nil
		default:
			f, err := fsys.Open(name)
			if err != nil {
				return err
			}
			defer f.Close()
			return ctxAwareCopy(ctx, fw, f, bytesZippedUpdater)
		}
	})
}

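// doZipping packs the job's workspace directory into the zip file named by
// the job descriptor, creating the file exclusively unless OverwriteZip is
// set, and removes the workspace on success to free up staging space.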
func (m *archiveManager) doZipping(ctx context.Context, jobID string) (err error) {
	m.simpleFS.log.CDebugf(ctx, "+ doZipping %s", jobID)
	defer func() { m.simpleFS.log.CDebugf(ctx, "- doZipping %s err: %v", jobID, err) }()

	jobDesc := func() keybase1.SimpleFSArchiveJobDesc {
		m.mu.Lock()
		defer m.mu.Unlock()
		return m.state.Jobs[jobID].Desc
	}()

	// Reset BytesZipped.
	func() {
		m.mu.Lock()
		defer m.mu.Unlock()
		// Can overwrite directly since only one worker can work on a given job at a time.
		job := m.state.Jobs[jobID]
		job.BytesZipped = 0
		m.state.Jobs[jobID] = job
	}()

	updateBytesZipped := func(delta int64) {
		m.mu.Lock()
		defer m.mu.Unlock()
		// Can overwrite directly since only one worker can work on a given job at a time.
		job := m.state.Jobs[jobID]
		job.BytesZipped += delta
		m.state.Jobs[jobID] = job
	}

	workspaceDir := getWorkspaceDir(jobDesc)

	err = func() (err error) {
		mode := os.O_WRONLY | os.O_CREATE | os.O_EXCL
		if jobDesc.OverwriteZip {
			mode = os.O_WRONLY | os.O_CREATE | os.O_TRUNC
		}
		zipFile, err := os.OpenFile(jobDesc.ZipFilePath, mode, 0666)
		if err != nil {
			return fmt.Errorf("os.OpenFile(%s) error: %v", jobDesc.ZipFilePath, err)
		}
		defer func() {
			closeErr := zipFile.Close()
			if err == nil {
				err = closeErr
			}
		}()

		zipWriter := zip.NewWriter(zipFile)
		defer func() {
			closeErr := zipWriter.Close()
			if err == nil {
				err = closeErr
			}
		}()

		err = zipWriterAddDir(ctx, zipWriter, workspaceDir, updateBytesZipped)
		if err != nil {
			return fmt.Errorf("zipWriterAddDir to %s error: %v", jobDesc.ZipFilePath, err)
		}

		return nil
	}()
	if err != nil {
		return err
	}

	// Remove the workspace so we release the storage space early, before the
	// user dismisses the job.
	err = os.RemoveAll(workspaceDir)
	if err != nil {
		m.simpleFS.log.CWarningf(ctx, "removing workspace %s error %v", workspaceDir, err)
	}

	return nil
}

func (m *archiveManager) zippingWorker(ctx context.Context) {
	for {
		select {
		case <-ctx.Done():
			return
		case <-m.zippingWorkerSignal:
		}

		jobID, jobCtx, ok := m.startWorkerTask(ctx,
			keybase1.SimpleFSArchiveJobPhase_Copied,
			keybase1.SimpleFSArchiveJobPhase_Zipping)

		if !ok {
			continue
		}
		// We got a task. Put another token into the signal channel so we
		// check again on the next iteration.
		m.signal(m.zippingWorkerSignal)

		m.simpleFS.log.CDebugf(ctx, "zipping: %s", jobID)

		err := m.doZipping(jobCtx, jobID)
		if err == nil {
			m.simpleFS.log.CDebugf(jobCtx, "zipping done on job %s", jobID)
			m.changeJobPhase(jobCtx, jobID, keybase1.SimpleFSArchiveJobPhase_Done)
		} else {
			m.simpleFS.log.CErrorf(jobCtx, "zipping error on job %s: %v", jobID, err)
			m.setJobError(ctx, jobID, err)
		}

		err = m.flushStateFile(ctx)
		if err != nil {
			m.simpleFS.log.CWarningf(ctx, "m.flushStateFile error: %v", err)
		}
	}
}

func (m *archiveManager) resetInterruptedPhaseLocked(ctx context.Context, jobID string) (changed bool) {
	switch m.state.Jobs[jobID].Phase {
	case keybase1.SimpleFSArchiveJobPhase_Indexing:
		m.simpleFS.log.CDebugf(ctx, "resetting %s phase from %s to %s", jobID,
			keybase1.SimpleFSArchiveJobPhase_Indexing,
			keybase1.SimpleFSArchiveJobPhase_Queued)
		m.changeJobPhaseLocked(ctx, jobID,
			keybase1.SimpleFSArchiveJobPhase_Queued)
		return true
	case keybase1.SimpleFSArchiveJobPhase_Copying:
		m.simpleFS.log.CDebugf(ctx, "resetting %s phase from %s to %s", jobID,
			keybase1.SimpleFSArchiveJobPhase_Copying,
			keybase1.SimpleFSArchiveJobPhase_Indexed)
		m.changeJobPhaseLocked(ctx, jobID,
			keybase1.SimpleFSArchiveJobPhase_Indexed)
		return true
	case keybase1.SimpleFSArchiveJobPhase_Zipping:
		m.simpleFS.log.CDebugf(ctx, "resetting %s phase from %s to %s", jobID,
			keybase1.SimpleFSArchiveJobPhase_Zipping,
			keybase1.SimpleFSArchiveJobPhase_Copied)
		m.changeJobPhaseLocked(ctx, jobID,
			keybase1.SimpleFSArchiveJobPhase_Copied)
		return true
	default:
		m.simpleFS.log.CDebugf(ctx, "not resetting %s phase from %s", jobID,
			m.state.Jobs[jobID].Phase)
		return false
	}
}

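// errorRetryWorker polls every five seconds for jobs whose errorState has
// passed its nextRetry time (archiveErrorRetryDuration after the failure),
// resets each such job to the phase preceding the one that failed, and
// re-signals all workers so the job gets picked up again.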
func (m *archiveManager) errorRetryWorker(ctx context.Context) {
	ticker := time.NewTicker(time.Second * 5)
	defer ticker.Stop()
	for {
		select {
		case <-ctx.Done():
			return
		case <-ticker.C:
		}

		func() {
			m.mu.Lock()
			defer m.mu.Unlock()
			jobIDs := make([]string, 0, len(m.state.Jobs))
			for jobID := range m.state.Jobs {
				jobIDs = append(jobIDs, jobID)
			}
		loopJobIDs:
			for _, jobID := range jobIDs {
				errState, ok := m.errors[jobID]
				if !ok {
					continue loopJobIDs
				}
				if time.Now().Before(errState.nextRetry) {
					continue loopJobIDs
				}
				m.simpleFS.log.CDebugf(ctx, "retrying job %s", jobID)
				changed := m.resetInterruptedPhaseLocked(ctx, jobID)
				if !changed {
					m.simpleFS.log.CWarningf(ctx,
						"job %s has an error state %v but an unexpected job phase",
						jobID, errState.err)
					continue loopJobIDs
				}
				delete(m.errors, jobID)

				m.signal(m.indexingWorkerSignal)
				m.signal(m.copyingWorkerSignal)
				m.signal(m.zippingWorkerSignal)
			}
		}()
	}
}

func (m *archiveManager) start() {
	ctx := context.Background()
	ctx, m.ctxCancel = context.WithCancel(ctx)
	go m.indexingWorker(m.simpleFS.makeContext(ctx))
	go m.copyingWorker(m.simpleFS.makeContext(ctx))
	go m.zippingWorker(m.simpleFS.makeContext(ctx))
	go m.errorRetryWorker(m.simpleFS.makeContext(ctx))
	m.signal(m.indexingWorkerSignal)
	m.signal(m.copyingWorkerSignal)
	m.signal(m.zippingWorkerSignal)
}

func (m *archiveManager) resetInterruptedPhasesLocked(ctx context.Context) {
	// We don't resume indexing and zipping work, so just reset those phases
	// here. Copying is resumable, and since we track per-file state it's safe
	// to reset that phase here as well; the copying worker picks up where it
	// left off.
	for jobID := range m.state.Jobs {
		_ = m.resetInterruptedPhaseLocked(ctx, jobID)
	}
}

func newArchiveManager(simpleFS *SimpleFS) (m *archiveManager, err error) {
	ctx := context.Background()
	simpleFS.log.CDebugf(ctx, "+ newArchiveManager")
	defer simpleFS.log.CDebugf(ctx, "- newArchiveManager")
	m = &archiveManager{
		simpleFS:             simpleFS,
		jobCtxCancellers:     make(map[string]func()),
		errors:               make(map[string]errorState),
		indexingWorkerSignal: make(chan struct{}, 1),
		copyingWorkerSignal:  make(chan struct{}, 1),
		zippingWorkerSignal:  make(chan struct{}, 1),
	}
	stateFilePath := getStateFilePath(simpleFS)
	m.state, err = loadArchiveStateFromJsonGz(ctx, simpleFS, stateFilePath)
	switch err {
	case nil:
		if m.state.Jobs == nil {
			m.state.Jobs = make(map[string]keybase1.SimpleFSArchiveJobState)
		}
		// No need to hold mu here: the workers haven't been started yet.
		m.resetInterruptedPhasesLocked(ctx)
	default:
		simpleFS.log.CErrorf(ctx, "loadArchiveStateFromJsonGz error ( %v ). Creating a new state.", err)
		m.state = &keybase1.SimpleFSArchiveState{
			Jobs: make(map[string]keybase1.SimpleFSArchiveJobState),
		}
		err = writeArchiveStateIntoJsonGz(ctx, simpleFS, stateFilePath, m.state)
		if err != nil {
			simpleFS.log.CErrorf(ctx, "newArchiveManager: creating state file error: %v", err)
			return nil, err
		}
	}
	m.start()
	return m, nil
}