github.com/qri-io/qri@v0.10.1-0.20220104210721-c771715036cb/base/dsfs/write.go (about)

     1  package dsfs
     2  
     3  import (
     4  	"context"
     5  	"errors"
     6  	"fmt"
     7  	"io/fs"
     8  	"strings"
     9  	"time"
    10  
    11  	cid "github.com/ipfs/go-cid"
    12  	crypto "github.com/libp2p/go-libp2p-core/crypto"
    13  	"github.com/qri-io/dataset"
    14  	"github.com/qri-io/dataset/dsviz"
    15  	"github.com/qri-io/dataset/validate"
    16  	"github.com/qri-io/qfs"
    17  	"github.com/qri-io/qri/dsref"
    18  	"github.com/qri-io/qri/event"
    19  )
    20  
// batchSize is the number of entries per batch when processing body data in
// WriteDataset
const batchSize = 5000
    23  
var (
	// BodySizeSmallEnoughToDiff sets how small a body must be to generate a message from it
	BodySizeSmallEnoughToDiff = 20000000 // 20M or less is small
	// OpenFileTimeoutDuration determines the maximum amount of time to wait for
	// a Filestore to open a file. Some filestores (like IPFS) fall back to a
	// network request when they can't find a file locally. Setting a short
	// timeout prevents waiting for a slow network response, at the expense of
	// leaving files unresolved.
	// TODO (b5) - allow -1 duration as a sentinel value for no timeout
	OpenFileTimeoutDuration = time.Millisecond * 700
)
    35  
    36  // If a user has a dataset larger than the above limit, then instead of diffing we compare the
    37  // checksum against the previous version. We should make this algorithm agree with how `status`
    38  // works.
    39  // See issue: https://github.com/qri-io/qri/issues/1150
    40  
// SaveSwitches represents options for saving a dataset
type SaveSwitches struct {
	// Use a custom timestamp, defaults to time.Now if unset
	Time time.Time
	// Replace is whether the save is a full replacement or a set of patches to previous.
	// When true, no component is carried over from the previous version
	Replace bool
	// Pin is whether the dataset should be pinned
	Pin bool
	// ConvertFormatToPrev is whether the body should be converted to match the previous format
	ConvertFormatToPrev bool
	// ForceIfNoChanges is whether the save should be forced even if no changes are detected
	ForceIfNoChanges bool
	// ShouldRender is deprecated, controls whether viz should be rendered
	ShouldRender bool
	// NewName is whether a new dataset should be created, guaranteeing there's no previous version
	NewName bool
	// FileHint is a hint for what file is used for creating this dataset
	FileHint string
	// Drop is a string of components to remove before saving
	Drop string
	// dropRevs is the Drop string parsed into a list of components. WriteDataset
	// populates it with the result of dsref.ParseRevs
	dropRevs []*dsref.Rev

	// action to take when calculating commit messages
	// bodyAct is assigned while the body component is written (BodySame when the
	// previous body is reused, BodyDefault when a new body file is present) to
	// feed data to the commit component write. A bit of a hack, but it works.
	bodyAct BodyAction
}
    69  
    70  // CreateDataset writes a dataset to a provided store.
    71  // Store is where we're going to store the data
    72  // Dataset to be saved
    73  // Prev is the previous version or nil if there isn't one
    74  // Pk is the private key for cryptographically signing
    75  // Sw is switches that control how the save happens
    76  // Returns the immutable path if no error
    77  func CreateDataset(
    78  	ctx context.Context,
    79  	source qfs.Filesystem,
    80  	destination qfs.Filesystem,
    81  	pub event.Publisher,
    82  	ds *dataset.Dataset,
    83  	prev *dataset.Dataset,
    84  	pk crypto.PrivKey,
    85  	sw SaveSwitches,
    86  ) (string, error) {
    87  	if pk == nil {
    88  		return "", fmt.Errorf("private key is required to create a dataset")
    89  	}
    90  
    91  	if err := DerefDataset(ctx, source, ds); err != nil {
    92  		log.Debugf("dereferencing dataset components: %s", err)
    93  		return "", err
    94  	}
    95  	if err := validate.Dataset(ds); err != nil {
    96  		log.Debug(err.Error())
    97  		return "", err
    98  	}
    99  	log.Debugw("CreateDataset", "ds.Peername", ds.Peername, "ds.Name", ds.Name, "dest", destination.Type())
   100  
   101  	if prev != nil && !prev.IsEmpty() {
   102  		log.Debugw("dereferencing previous dataset", "prevPath", prev.Path)
   103  		if err := DerefDataset(ctx, source, prev); err != nil {
   104  			log.Debug(err.Error())
   105  			return "", err
   106  		}
   107  		if err := validate.Dataset(prev); err != nil {
   108  			log.Debug(err.Error())
   109  			return "", err
   110  		}
   111  	}
   112  
   113  	peername := ds.Peername
   114  	name := ds.Name
   115  
   116  	go func() {
   117  		evtErr := pub.Publish(ctx, event.ETDatasetSaveStarted, event.DsSaveEvent{
   118  			Username:   peername,
   119  			Name:       name,
   120  			Message:    "save started",
   121  			Completion: 0,
   122  		})
   123  		if evtErr != nil {
   124  			log.Debugw("ignored error while publishing save start event", "evtErr", evtErr)
   125  		}
   126  	}()
   127  
   128  	path, err := WriteDataset(ctx, source, destination, prev, ds, pub, pk, sw)
   129  	if err != nil {
   130  		log.Debug(err.Error())
   131  		if evtErr := pub.Publish(ctx, event.ETDatasetSaveCompleted, event.DsSaveEvent{
   132  			Username:   peername,
   133  			Name:       name,
   134  			Error:      err,
   135  			Completion: 1.0,
   136  		}); evtErr != nil {
   137  			log.Debugw("ignored error while publishing save completed", "evtErr", evtErr)
   138  		}
   139  		return "", err
   140  	}
   141  
   142  	// TODO (b5) - many codepaths that call this function use the `ds` arg after saving
   143  	// we need to dereference here so fields are set, but this is overkill if
   144  	// the caller doesn't use the ds arg afterward
   145  	// might make sense to have a wrapper function that writes and loads on success
   146  	if err := DerefDataset(ctx, destination, ds); err != nil {
   147  		if evtErr := pub.Publish(ctx, event.ETDatasetSaveCompleted, event.DsSaveEvent{
   148  			Username:   peername,
   149  			Name:       name,
   150  			Error:      err,
   151  			Completion: 1.0,
   152  		}); evtErr != nil {
   153  			log.Debugw("ignored error while publishing save completed", "evtErr", evtErr)
   154  		}
   155  		return path, err
   156  	}
   157  
   158  	return path, pub.Publish(ctx, event.ETDatasetSaveCompleted, event.DsSaveEvent{
   159  		Username:   peername,
   160  		Name:       name,
   161  		Message:    "dataset saved",
   162  		Path:       path,
   163  		Completion: 1.0,
   164  	})
   165  }
   166  
   167  // WriteDataset persists a datasets to a destination filesystem
   168  func WriteDataset(
   169  	ctx context.Context,
   170  	src qfs.Filesystem,
   171  	dst qfs.Filesystem,
   172  	prev *dataset.Dataset,
   173  	ds *dataset.Dataset,
   174  	publisher event.Publisher,
   175  	pk crypto.PrivKey,
   176  	sw SaveSwitches,
   177  ) (string, error) {
   178  	dstStore, ok := dst.(qfs.MerkleDagStore)
   179  	if !ok {
   180  		return "", fmt.Errorf("destination must be a MerkleDagStore")
   181  	}
   182  
   183  	if ds.Commit != nil {
   184  		// assign timestamp early. saving process on large files can take many minutes
   185  		// and we want to mark commit creation closer to when the user submitted the
   186  		// creation request
   187  		if ds.Commit.Timestamp.IsZero() {
   188  			ds.Commit.Timestamp = Timestamp()
   189  		} else {
   190  			ds.Commit.Timestamp = ds.Commit.Timestamp.In(time.UTC)
   191  		}
   192  	}
   193  
   194  	if ds.Stats != nil {
   195  		ds.Stats = nil
   196  	}
   197  
   198  	revs, err := dsref.ParseRevs(sw.Drop)
   199  	if err != nil {
   200  		return "", err
   201  	}
   202  	sw.dropRevs = revs
   203  
   204  	added := qfs.NewLinks()
   205  
   206  	// the call order of these functions is important, funcs later in the slice
   207  	// may rely on writeFiles fields set by eariler functions
   208  	writeFuncs := []writeComponentFunc{
   209  		bodyFileFunc(ctx, pk, publisher),      // no deps
   210  		metadataFile,                          // no deps
   211  		transformFile,                         // no deps
   212  		structureFile,                         // requires bdoy if it exists
   213  		statsFile,                             // requires body, structure if they exist
   214  		readmeFile,                            // no deps
   215  		vizFilesAddFunc(ctx, sw),              // requires body, meta, transform, structure, stats, readme if they exist
   216  		commitFileAddFunc(ctx, pk, publisher), // requires meta, transform, body, structure, stats, readme, vizScript, vizRendered if they exist
   217  		writeDatasetFile,                      // requires all other components
   218  	}
   219  
   220  	for _, fileFunc := range writeFuncs {
   221  		if err := fileFunc(src, dstStore, prev, ds, added, &sw); err != nil {
   222  			if errors.Is(errNoComponent, err) {
   223  				continue
   224  			}
   225  			return "", err
   226  		}
   227  	}
   228  
   229  	// add root node
   230  	res, err := dstStore.PutNode(added)
   231  	if err != nil {
   232  		return "", err
   233  	}
   234  	return fsPathFromCID(dstStore, res.Cid), nil
   235  }
   236  
// writeComponentFunc is a function that writes a component to a merkleDagStore
// it accepts a set of named links that have already been added
// write component funcs are expected to write a link to "added" on successful
// write. Returning errNoComponent tells WriteDataset to move on to the next
// func without failing the save
type writeComponentFunc func(src qfs.Filesystem, dst qfs.MerkleDagStore, prev, ds *dataset.Dataset, added qfs.Links, sw *SaveSwitches) (err error)
   242  
// errNoComponent is returned by write component funcs when there is no
// standalone component file to write. WriteDataset skips this error and
// continues with the next func. Funcs may still have added links (for example
// a link reused from the previous version) before returning it
var errNoComponent = errors.New("no component")
   244  
   245  func bodyFileFunc(ctx context.Context, pk crypto.PrivKey, publisher event.Publisher) writeComponentFunc {
   246  	return func(src qfs.Filesystem, dst qfs.MerkleDagStore, prev, ds *dataset.Dataset, added qfs.Links, sw *SaveSwitches) error {
   247  		if ds.BodyFile() == nil {
   248  			if usePrevComponent(sw, "bd") && prev != nil && prev.BodyPath != "" {
   249  				sw.bodyAct = BodySame
   250  				// TODO (b5): need to validate that a potentially new structure will work
   251  				if id, err := cidFromIPFSPath(prev.BodyPath); err == nil {
   252  					added.Add(qfs.Link{Name: bodyFilename(prev), Cid: id, IsFile: true})
   253  				}
   254  			}
   255  			return errNoComponent
   256  		}
   257  
   258  		sw.bodyAct = BodyDefault
   259  		bodyFilename := bodyFilename(ds)
   260  		cff, err := newComputeFieldsFile(ctx, publisher, pk, ds, prev, sw)
   261  		if err != nil {
   262  			return err
   263  		}
   264  
   265  		f, err := NewMemfileReader(bodyFilename, cff), nil
   266  		if err != nil {
   267  			return err
   268  		}
   269  
   270  		if err := writePackageFile(dst, f, added); err != nil {
   271  			return err
   272  		}
   273  		if err := <-cff.(doneProcessingFile).DoneProcessing(); err != nil {
   274  			return err
   275  		}
   276  
   277  		log.Debugw("setting calculated stats")
   278  		ds.Stats, err = cff.(statsComponentFile).StatsComponent()
   279  		return err
   280  	}
   281  }
   282  
   283  func structureFile(src qfs.Filesystem, dst qfs.MerkleDagStore, prev, ds *dataset.Dataset, added qfs.Links, sw *SaveSwitches) error {
   284  	if ds.Structure == nil {
   285  		if usePrevComponent(sw, "st") && prev != nil && prev.Structure != nil {
   286  			if id, err := cidFromIPFSPath(prev.Structure.Path); err == nil {
   287  				log.Debugw("using previous structure", "path", prev.Structure.Path)
   288  				added.Add(qfs.Link{Name: PackageFileStructure.String(), Cid: id, IsFile: true})
   289  			}
   290  		}
   291  		return errNoComponent
   292  	}
   293  
   294  	ds.Structure.DropTransientValues()
   295  
   296  	// if the destination filesystem is content-addressed, use the body
   297  	// path as the checksum. Include path prefix to disambiguate which FS
   298  	// generated the checksum
   299  	if _, ok := dst.(qfs.CAFS); ok {
   300  		if bodyLink := added.Get(ds.Structure.BodyFilename()); bodyLink != nil {
   301  			ds.Structure.Checksum = fsPathFromCID(dst, bodyLink.Cid)
   302  		}
   303  	}
   304  
   305  	f, err := JSONFile(PackageFileStructure.String(), ds.Structure)
   306  	if err != nil {
   307  		return err
   308  	}
   309  	return writePackageFile(dst, f, added)
   310  }
   311  
   312  func metadataFile(src qfs.Filesystem, dst qfs.MerkleDagStore, prev, ds *dataset.Dataset, added qfs.Links, sw *SaveSwitches) error {
   313  	if ds.Meta == nil {
   314  		if usePrevComponent(sw, "md") && prev != nil && prev.Meta != nil {
   315  			if id, err := cidFromIPFSPath(prev.Meta.Path); err == nil {
   316  				added.Add(qfs.Link{Name: PackageFileMeta.String(), Cid: id, IsFile: true})
   317  			}
   318  		}
   319  		return errNoComponent
   320  	}
   321  	ds.Meta.DropTransientValues()
   322  	f, err := JSONFile(PackageFileMeta.String(), ds.Meta)
   323  	if err != nil {
   324  		return err
   325  	}
   326  	return writePackageFile(dst, f, added)
   327  }
   328  
   329  func transformFile(src qfs.Filesystem, dst qfs.MerkleDagStore, prev, ds *dataset.Dataset, added qfs.Links, sw *SaveSwitches) error {
   330  	if ds.Transform == nil {
   331  		return errNoComponent
   332  	}
   333  
   334  	ds.Transform.DropTransientValues()
   335  	// TODO (b5): this is validation logic, should happen before WriteDataset is
   336  	// ever called.
   337  	// all resources must be references
   338  	for key, r := range ds.Transform.Resources {
   339  		if r.Path == "" {
   340  			return fmt.Errorf("transform resource %s requires a path to save", key)
   341  		}
   342  	}
   343  
   344  	if tfsf := ds.Transform.ScriptFile(); tfsf != nil {
   345  		if err := writePackageFile(dst, NewMemfileReader(transformScriptFilename, tfsf), added); err != nil {
   346  			return err
   347  		}
   348  		link := added.Get(transformScriptFilename)
   349  		ds.Transform.ScriptPath = fsPathFromCID(dst, link.Cid)
   350  	}
   351  
   352  	// // transform component is inlined into dataset
   353  	// return errNoComponent
   354  	f, err := JSONFile(PackageFileTransform.String(), ds.Transform)
   355  	if err != nil {
   356  		return err
   357  	}
   358  
   359  	return writePackageFile(dst, f, added)
   360  }
   361  
   362  func statsFile(src qfs.Filesystem, dst qfs.MerkleDagStore, prev, ds *dataset.Dataset, added qfs.Links, sw *SaveSwitches) error {
   363  	if ds.Stats == nil {
   364  		// if the body is unchanged and it's hash matches the prior, keep the stats component
   365  		if usePrevComponent(sw, "bd") && usePrevComponent(sw, "sa") {
   366  			if bdLnk := added.Get(bodyFilename(ds)); bdLnk != nil {
   367  				if fsPathFromCID(dst, bdLnk.Cid) == prev.BodyPath && prev.Stats != nil && prev.Stats.Path != "" {
   368  					if id, err := cidFromIPFSPath(prev.Stats.Path); err == nil {
   369  						log.Debugw("body is unchanged, keeping stats component", "path", prev.Stats.Path)
   370  						added.Add(qfs.Link{Name: PackageFileStats.String(), Cid: id})
   371  					}
   372  				}
   373  			}
   374  		}
   375  		return errNoComponent
   376  	}
   377  	f, err := JSONFile(PackageFileStats.String(), ds.Stats)
   378  	if err != nil {
   379  		return err
   380  	}
   381  	return writePackageFile(dst, f, added)
   382  }
   383  
   384  func readmeFile(src qfs.Filesystem, dst qfs.MerkleDagStore, prev, ds *dataset.Dataset, added qfs.Links, sw *SaveSwitches) error {
   385  	if ds.Readme == nil {
   386  		if usePrevComponent(sw, "rm") && prev != nil && prev.Readme != nil {
   387  			if id, err := cidFromIPFSPath(prev.Readme.Path); err == nil {
   388  				added.Add(qfs.Link{Name: PackageFileReadme.String(), Cid: id, IsFile: true})
   389  			}
   390  		}
   391  		return errNoComponent
   392  	}
   393  
   394  	ds.Readme.DropTransientValues()
   395  	if rmsf := ds.Readme.ScriptFile(); rmsf != nil {
   396  		f := NewMemfileReader(PackageFileReadmeScript.String(), rmsf)
   397  		if err := writePackageFile(dst, f, added); err != nil {
   398  			return err
   399  		}
   400  		ds.Readme.ScriptPath = fsPathFromCID(dst, added.Get(PackageFileReadmeScript.String()).Cid)
   401  	}
   402  
   403  	// readme is used for side-effects, component will be inlined into dataset component
   404  	return errNoComponent
   405  }
   406  
   407  // TODO(b5): current construction makes it possible to provide both rendered
   408  // file and script file externally, without checking that the rendered file is
   409  // in fact the result of executing the script.
   410  func vizFilesAddFunc(ctx context.Context, sw SaveSwitches) writeComponentFunc {
   411  	return func(src qfs.Filesystem, dst qfs.MerkleDagStore, prev, ds *dataset.Dataset, added qfs.Links, sw *SaveSwitches) error {
   412  		if ds.Viz == nil {
   413  			if usePrevComponent(sw, "vz") && prev != nil && prev.Viz != nil {
   414  				if id, err := cidFromIPFSPath(prev.Viz.Path); err == nil {
   415  					added.Add(qfs.Link{Name: PackageFileViz.String(), Cid: id, IsFile: true})
   416  				}
   417  			}
   418  			return errNoComponent
   419  		}
   420  
   421  		ds.Viz.DropTransientValues()
   422  		vzfs := ds.Viz.ScriptFile()
   423  		if vzfs != nil {
   424  			if err := writePackageFile(dst, NewMemfileReader(PackageFileVizScript.String(), vzfs), added); err != nil {
   425  				return err
   426  			}
   427  		}
   428  
   429  		renderedF := ds.Viz.RenderedFile()
   430  		if renderedF != nil {
   431  			if err := writePackageFile(dst, NewMemfileReader(PackageFileRenderedViz.String(), renderedF), added); err != nil {
   432  				return err
   433  			}
   434  		} else if vzfs != nil && sw.ShouldRender {
   435  			renderDs := &dataset.Dataset{}
   436  			renderDs.Assign(ds)
   437  
   438  			if bfn := bodyFilename(ds); bfn != "" {
   439  				if bodyLink := added.Get(bfn); bodyLink != nil {
   440  					bf, err := dst.(qfs.Filesystem).Get(ctx, fsPathFromCID(dst, bodyLink.Cid))
   441  					if err != nil {
   442  						return err
   443  					}
   444  					renderDs.SetBodyFile(bf)
   445  				}
   446  			}
   447  
   448  			if vizScriptLink := added.Get(PackageFileVizScript.String()); vizScriptLink != nil {
   449  				sf, err := dst.(qfs.Filesystem).Get(ctx, fsPathFromCID(dst, vizScriptLink.Cid))
   450  				if err != nil {
   451  					return err
   452  				}
   453  				renderDs.Viz.SetScriptFile(sf)
   454  			}
   455  
   456  			result, err := dsviz.Render(renderDs)
   457  			if err != nil {
   458  				return err
   459  			}
   460  			if err := writePackageFile(dst, NewMemfileReader(PackageFileRenderedViz.String(), result), added); err != nil {
   461  				return err
   462  			}
   463  		}
   464  
   465  		// viz is used for side-effects, component will be inlined into dataset component
   466  		return errNoComponent
   467  	}
   468  }
   469  
   470  func writeDatasetFile(src qfs.Filesystem, dst qfs.MerkleDagStore, prev, ds *dataset.Dataset, added qfs.Links, sw *SaveSwitches) error {
   471  	if added.Len() == 0 {
   472  		return fmt.Errorf("cannot save empty dataset")
   473  	}
   474  
   475  	ds.DropTransientValues()
   476  	updateScriptPaths(dst, ds, added)
   477  	setComponentRefs(dst, ds, bodyFilename(ds), added)
   478  
   479  	f, err := JSONFile(PackageFileDataset.String(), ds)
   480  	if err != nil {
   481  		return err
   482  	}
   483  	return writePackageFile(dst, f, added)
   484  }
   485  
   486  func updateScriptPaths(s qfs.MerkleDagStore, ds *dataset.Dataset, added qfs.Links) {
   487  	for filename, link := range added.Map() {
   488  		path := fsPathFromCID(s, link.Cid)
   489  		switch filename {
   490  		case PackageFileVizScript.String():
   491  			ds.Viz.ScriptPath = path
   492  		case PackageFileRenderedViz.String():
   493  			ds.Viz.RenderedPath = path
   494  		case PackageFileReadmeScript.String():
   495  			ds.Readme.ScriptPath = path
   496  		}
   497  	}
   498  }
   499  
   500  func fsPathFromCID(s qfs.MerkleDagStore, id cid.Cid) string {
   501  	fs := s.(qfs.Filesystem)
   502  	return fmt.Sprintf("/%s/%s", fs.Type(), id.String())
   503  }
   504  
   505  func cidFromIPFSPath(path string) (cid.Cid, error) {
   506  	if !strings.HasPrefix(path, "/ipfs/") {
   507  		return cid.Cid{}, fmt.Errorf("cannot create link to path oustide of ipfs filesystem")
   508  	}
   509  	return cid.Parse(strings.TrimPrefix(path, "/ipfs/"))
   510  }
   511  
   512  func writePackageFile(s qfs.MerkleDagStore, f fs.File, added qfs.Links) error {
   513  	fi, err := f.Stat()
   514  	if err != nil {
   515  		return err
   516  	}
   517  
   518  	res, err := s.PutFile(f)
   519  	if err != nil {
   520  		return err
   521  	}
   522  
   523  	added.Add(res.ToLink(fi.Name(), !fi.IsDir()))
   524  	return nil
   525  }
   526  
   527  func bodyFilename(ds *dataset.Dataset) string {
   528  	if ds.Structure == nil {
   529  		return ""
   530  	}
   531  	return ds.Structure.BodyFilename()
   532  }
   533  
   534  func setComponentRefs(dst qfs.MerkleDagStore, ds *dataset.Dataset, bodyFilename string, added qfs.Links) {
   535  	for filename, link := range added.Map() {
   536  		switch filename {
   537  		case bodyFilename:
   538  			ds.BodyPath = fsPathFromCID(dst, link.Cid)
   539  		case PackageFileCommit.String():
   540  			ds.Commit = dataset.NewCommitRef(fsPathFromCID(dst, link.Cid))
   541  		case PackageFileMeta.String():
   542  			ds.Meta = dataset.NewMetaRef(fsPathFromCID(dst, link.Cid))
   543  		case PackageFileViz.String():
   544  			ds.Viz = dataset.NewVizRef(fsPathFromCID(dst, link.Cid))
   545  		case PackageFileStats.String():
   546  			ds.Stats = dataset.NewStatsRef(fsPathFromCID(dst, link.Cid))
   547  		case PackageFileStructure.String():
   548  			ds.Structure = dataset.NewStructureRef(fsPathFromCID(dst, link.Cid))
   549  			// TODO(b5): bug!
   550  			// case PackageFileTransform.String():
   551  			// 	ds.Transform = dataset.NewTransformRef(fsPathFromCID(dst, link.Cid))
   552  		}
   553  	}
   554  }
   555  
   556  func usePrevComponent(sw *SaveSwitches, component string) bool {
   557  	if sw.Replace {
   558  		return false
   559  	}
   560  	for _, rev := range sw.dropRevs {
   561  		if rev.Field == component {
   562  			return false
   563  		}
   564  	}
   565  	return true
   566  }