github.com/qri-io/qri@v0.10.1-0.20220104210721-c771715036cb/base/dsfs/write.go (about) 1 package dsfs 2 3 import ( 4 "context" 5 "errors" 6 "fmt" 7 "io/fs" 8 "strings" 9 "time" 10 11 cid "github.com/ipfs/go-cid" 12 crypto "github.com/libp2p/go-libp2p-core/crypto" 13 "github.com/qri-io/dataset" 14 "github.com/qri-io/dataset/dsviz" 15 "github.com/qri-io/dataset/validate" 16 "github.com/qri-io/qfs" 17 "github.com/qri-io/qri/dsref" 18 "github.com/qri-io/qri/event" 19 ) 20 21 // number of entries to per batch when processing body data in WriteDataset 22 const batchSize = 5000 23 24 var ( 25 // BodySizeSmallEnoughToDiff sets how small a body must be to generate a message from it 26 BodySizeSmallEnoughToDiff = 20000000 // 20M or less is small 27 // OpenFileTimeoutDuration determines the maximium amount of time to wait for 28 // a Filestore to open a file. Some filestores (like IPFS) fallback to a 29 // network request when it can't find a file locally. Setting a short timeout 30 // prevents waiting for a slow network response, at the expense of leaving 31 // files unresolved. 32 // TODO (b5) - allow -1 duration as a sentinel value for no timeout 33 OpenFileTimeoutDuration = time.Millisecond * 700 34 ) 35 36 // If a user has a dataset larger than the above limit, then instead of diffing we compare the 37 // checksum against the previous version. We should make this algorithm agree with how `status` 38 // works. 39 // See issue: https://github.com/qri-io/qri/issues/1150 40 41 // SaveSwitches represents options for saving a dataset 42 type SaveSwitches struct { 43 // Use a custom timestamp, defaults to time.Now if unset 44 Time time.Time 45 // Replace is whether the save is a full replacement or a set of patches to previous 46 Replace bool 47 // Pin is whether the dataset should be pinned 48 Pin bool 49 // ConvertFormatToPrev is whether the body should be converted to match the previous format 50 ConvertFormatToPrev bool 51 // ForceIfNoChanges is whether the save should be forced even if no changes are detected 52 ForceIfNoChanges bool 53 // ShouldRender is deprecated, controls whether viz should be rendered 54 ShouldRender bool 55 // NewName is whether a new dataset should be created, guaranteeing there's no previous version 56 NewName bool 57 // FileHint is a hint for what file is used for creating this dataset 58 FileHint string 59 // Drop is a string of components to remove before saving 60 Drop string 61 // parsed drop string into list of components 62 dropRevs []*dsref.Rev 63 64 // action to take when calculating commit messages 65 // bodyAction is set by computeFieldsFile to feed data to the commit component 66 // write. A bit of a hack, but it works. 67 bodyAct BodyAction 68 } 69 70 // CreateDataset writes a dataset to a provided store. 71 // Store is where we're going to store the data 72 // Dataset to be saved 73 // Prev is the previous version or nil if there isn't one 74 // Pk is the private key for cryptographically signing 75 // Sw is switches that control how the save happens 76 // Returns the immutable path if no error 77 func CreateDataset( 78 ctx context.Context, 79 source qfs.Filesystem, 80 destination qfs.Filesystem, 81 pub event.Publisher, 82 ds *dataset.Dataset, 83 prev *dataset.Dataset, 84 pk crypto.PrivKey, 85 sw SaveSwitches, 86 ) (string, error) { 87 if pk == nil { 88 return "", fmt.Errorf("private key is required to create a dataset") 89 } 90 91 if err := DerefDataset(ctx, source, ds); err != nil { 92 log.Debugf("dereferencing dataset components: %s", err) 93 return "", err 94 } 95 if err := validate.Dataset(ds); err != nil { 96 log.Debug(err.Error()) 97 return "", err 98 } 99 log.Debugw("CreateDataset", "ds.Peername", ds.Peername, "ds.Name", ds.Name, "dest", destination.Type()) 100 101 if prev != nil && !prev.IsEmpty() { 102 log.Debugw("dereferencing previous dataset", "prevPath", prev.Path) 103 if err := DerefDataset(ctx, source, prev); err != nil { 104 log.Debug(err.Error()) 105 return "", err 106 } 107 if err := validate.Dataset(prev); err != nil { 108 log.Debug(err.Error()) 109 return "", err 110 } 111 } 112 113 peername := ds.Peername 114 name := ds.Name 115 116 go func() { 117 evtErr := pub.Publish(ctx, event.ETDatasetSaveStarted, event.DsSaveEvent{ 118 Username: peername, 119 Name: name, 120 Message: "save started", 121 Completion: 0, 122 }) 123 if evtErr != nil { 124 log.Debugw("ignored error while publishing save start event", "evtErr", evtErr) 125 } 126 }() 127 128 path, err := WriteDataset(ctx, source, destination, prev, ds, pub, pk, sw) 129 if err != nil { 130 log.Debug(err.Error()) 131 if evtErr := pub.Publish(ctx, event.ETDatasetSaveCompleted, event.DsSaveEvent{ 132 Username: peername, 133 Name: name, 134 Error: err, 135 Completion: 1.0, 136 }); evtErr != nil { 137 log.Debugw("ignored error while publishing save completed", "evtErr", evtErr) 138 } 139 return "", err 140 } 141 142 // TODO (b5) - many codepaths that call this function use the `ds` arg after saving 143 // we need to dereference here so fields are set, but this is overkill if 144 // the caller doesn't use the ds arg afterward 145 // might make sense to have a wrapper function that writes and loads on success 146 if err := DerefDataset(ctx, destination, ds); err != nil { 147 if evtErr := pub.Publish(ctx, event.ETDatasetSaveCompleted, event.DsSaveEvent{ 148 Username: peername, 149 Name: name, 150 Error: err, 151 Completion: 1.0, 152 }); evtErr != nil { 153 log.Debugw("ignored error while publishing save completed", "evtErr", evtErr) 154 } 155 return path, err 156 } 157 158 return path, pub.Publish(ctx, event.ETDatasetSaveCompleted, event.DsSaveEvent{ 159 Username: peername, 160 Name: name, 161 Message: "dataset saved", 162 Path: path, 163 Completion: 1.0, 164 }) 165 } 166 167 // WriteDataset persists a datasets to a destination filesystem 168 func WriteDataset( 169 ctx context.Context, 170 src qfs.Filesystem, 171 dst qfs.Filesystem, 172 prev *dataset.Dataset, 173 ds *dataset.Dataset, 174 publisher event.Publisher, 175 pk crypto.PrivKey, 176 sw SaveSwitches, 177 ) (string, error) { 178 dstStore, ok := dst.(qfs.MerkleDagStore) 179 if !ok { 180 return "", fmt.Errorf("destination must be a MerkleDagStore") 181 } 182 183 if ds.Commit != nil { 184 // assign timestamp early. saving process on large files can take many minutes 185 // and we want to mark commit creation closer to when the user submitted the 186 // creation request 187 if ds.Commit.Timestamp.IsZero() { 188 ds.Commit.Timestamp = Timestamp() 189 } else { 190 ds.Commit.Timestamp = ds.Commit.Timestamp.In(time.UTC) 191 } 192 } 193 194 if ds.Stats != nil { 195 ds.Stats = nil 196 } 197 198 revs, err := dsref.ParseRevs(sw.Drop) 199 if err != nil { 200 return "", err 201 } 202 sw.dropRevs = revs 203 204 added := qfs.NewLinks() 205 206 // the call order of these functions is important, funcs later in the slice 207 // may rely on writeFiles fields set by eariler functions 208 writeFuncs := []writeComponentFunc{ 209 bodyFileFunc(ctx, pk, publisher), // no deps 210 metadataFile, // no deps 211 transformFile, // no deps 212 structureFile, // requires bdoy if it exists 213 statsFile, // requires body, structure if they exist 214 readmeFile, // no deps 215 vizFilesAddFunc(ctx, sw), // requires body, meta, transform, structure, stats, readme if they exist 216 commitFileAddFunc(ctx, pk, publisher), // requires meta, transform, body, structure, stats, readme, vizScript, vizRendered if they exist 217 writeDatasetFile, // requires all other components 218 } 219 220 for _, fileFunc := range writeFuncs { 221 if err := fileFunc(src, dstStore, prev, ds, added, &sw); err != nil { 222 if errors.Is(errNoComponent, err) { 223 continue 224 } 225 return "", err 226 } 227 } 228 229 // add root node 230 res, err := dstStore.PutNode(added) 231 if err != nil { 232 return "", err 233 } 234 return fsPathFromCID(dstStore, res.Cid), nil 235 } 236 237 // writeComponentFunc is a function that writes a component to a merkleDagStore 238 // it accepts a set of named links that have already been added 239 // write component funcs are expected to write a link to "added" on successful 240 // write 241 type writeComponentFunc func(src qfs.Filesystem, dst qfs.MerkleDagStore, prev, ds *dataset.Dataset, added qfs.Links, sw *SaveSwitches) (err error) 242 243 var errNoComponent = errors.New("no component") 244 245 func bodyFileFunc(ctx context.Context, pk crypto.PrivKey, publisher event.Publisher) writeComponentFunc { 246 return func(src qfs.Filesystem, dst qfs.MerkleDagStore, prev, ds *dataset.Dataset, added qfs.Links, sw *SaveSwitches) error { 247 if ds.BodyFile() == nil { 248 if usePrevComponent(sw, "bd") && prev != nil && prev.BodyPath != "" { 249 sw.bodyAct = BodySame 250 // TODO (b5): need to validate that a potentially new structure will work 251 if id, err := cidFromIPFSPath(prev.BodyPath); err == nil { 252 added.Add(qfs.Link{Name: bodyFilename(prev), Cid: id, IsFile: true}) 253 } 254 } 255 return errNoComponent 256 } 257 258 sw.bodyAct = BodyDefault 259 bodyFilename := bodyFilename(ds) 260 cff, err := newComputeFieldsFile(ctx, publisher, pk, ds, prev, sw) 261 if err != nil { 262 return err 263 } 264 265 f, err := NewMemfileReader(bodyFilename, cff), nil 266 if err != nil { 267 return err 268 } 269 270 if err := writePackageFile(dst, f, added); err != nil { 271 return err 272 } 273 if err := <-cff.(doneProcessingFile).DoneProcessing(); err != nil { 274 return err 275 } 276 277 log.Debugw("setting calculated stats") 278 ds.Stats, err = cff.(statsComponentFile).StatsComponent() 279 return err 280 } 281 } 282 283 func structureFile(src qfs.Filesystem, dst qfs.MerkleDagStore, prev, ds *dataset.Dataset, added qfs.Links, sw *SaveSwitches) error { 284 if ds.Structure == nil { 285 if usePrevComponent(sw, "st") && prev != nil && prev.Structure != nil { 286 if id, err := cidFromIPFSPath(prev.Structure.Path); err == nil { 287 log.Debugw("using previous structure", "path", prev.Structure.Path) 288 added.Add(qfs.Link{Name: PackageFileStructure.String(), Cid: id, IsFile: true}) 289 } 290 } 291 return errNoComponent 292 } 293 294 ds.Structure.DropTransientValues() 295 296 // if the destination filesystem is content-addressed, use the body 297 // path as the checksum. Include path prefix to disambiguate which FS 298 // generated the checksum 299 if _, ok := dst.(qfs.CAFS); ok { 300 if bodyLink := added.Get(ds.Structure.BodyFilename()); bodyLink != nil { 301 ds.Structure.Checksum = fsPathFromCID(dst, bodyLink.Cid) 302 } 303 } 304 305 f, err := JSONFile(PackageFileStructure.String(), ds.Structure) 306 if err != nil { 307 return err 308 } 309 return writePackageFile(dst, f, added) 310 } 311 312 func metadataFile(src qfs.Filesystem, dst qfs.MerkleDagStore, prev, ds *dataset.Dataset, added qfs.Links, sw *SaveSwitches) error { 313 if ds.Meta == nil { 314 if usePrevComponent(sw, "md") && prev != nil && prev.Meta != nil { 315 if id, err := cidFromIPFSPath(prev.Meta.Path); err == nil { 316 added.Add(qfs.Link{Name: PackageFileMeta.String(), Cid: id, IsFile: true}) 317 } 318 } 319 return errNoComponent 320 } 321 ds.Meta.DropTransientValues() 322 f, err := JSONFile(PackageFileMeta.String(), ds.Meta) 323 if err != nil { 324 return err 325 } 326 return writePackageFile(dst, f, added) 327 } 328 329 func transformFile(src qfs.Filesystem, dst qfs.MerkleDagStore, prev, ds *dataset.Dataset, added qfs.Links, sw *SaveSwitches) error { 330 if ds.Transform == nil { 331 return errNoComponent 332 } 333 334 ds.Transform.DropTransientValues() 335 // TODO (b5): this is validation logic, should happen before WriteDataset is 336 // ever called. 337 // all resources must be references 338 for key, r := range ds.Transform.Resources { 339 if r.Path == "" { 340 return fmt.Errorf("transform resource %s requires a path to save", key) 341 } 342 } 343 344 if tfsf := ds.Transform.ScriptFile(); tfsf != nil { 345 if err := writePackageFile(dst, NewMemfileReader(transformScriptFilename, tfsf), added); err != nil { 346 return err 347 } 348 link := added.Get(transformScriptFilename) 349 ds.Transform.ScriptPath = fsPathFromCID(dst, link.Cid) 350 } 351 352 // // transform component is inlined into dataset 353 // return errNoComponent 354 f, err := JSONFile(PackageFileTransform.String(), ds.Transform) 355 if err != nil { 356 return err 357 } 358 359 return writePackageFile(dst, f, added) 360 } 361 362 func statsFile(src qfs.Filesystem, dst qfs.MerkleDagStore, prev, ds *dataset.Dataset, added qfs.Links, sw *SaveSwitches) error { 363 if ds.Stats == nil { 364 // if the body is unchanged and it's hash matches the prior, keep the stats component 365 if usePrevComponent(sw, "bd") && usePrevComponent(sw, "sa") { 366 if bdLnk := added.Get(bodyFilename(ds)); bdLnk != nil { 367 if fsPathFromCID(dst, bdLnk.Cid) == prev.BodyPath && prev.Stats != nil && prev.Stats.Path != "" { 368 if id, err := cidFromIPFSPath(prev.Stats.Path); err == nil { 369 log.Debugw("body is unchanged, keeping stats component", "path", prev.Stats.Path) 370 added.Add(qfs.Link{Name: PackageFileStats.String(), Cid: id}) 371 } 372 } 373 } 374 } 375 return errNoComponent 376 } 377 f, err := JSONFile(PackageFileStats.String(), ds.Stats) 378 if err != nil { 379 return err 380 } 381 return writePackageFile(dst, f, added) 382 } 383 384 func readmeFile(src qfs.Filesystem, dst qfs.MerkleDagStore, prev, ds *dataset.Dataset, added qfs.Links, sw *SaveSwitches) error { 385 if ds.Readme == nil { 386 if usePrevComponent(sw, "rm") && prev != nil && prev.Readme != nil { 387 if id, err := cidFromIPFSPath(prev.Readme.Path); err == nil { 388 added.Add(qfs.Link{Name: PackageFileReadme.String(), Cid: id, IsFile: true}) 389 } 390 } 391 return errNoComponent 392 } 393 394 ds.Readme.DropTransientValues() 395 if rmsf := ds.Readme.ScriptFile(); rmsf != nil { 396 f := NewMemfileReader(PackageFileReadmeScript.String(), rmsf) 397 if err := writePackageFile(dst, f, added); err != nil { 398 return err 399 } 400 ds.Readme.ScriptPath = fsPathFromCID(dst, added.Get(PackageFileReadmeScript.String()).Cid) 401 } 402 403 // readme is used for side-effects, component will be inlined into dataset component 404 return errNoComponent 405 } 406 407 // TODO(b5): current construction makes it possible to provide both rendered 408 // file and script file externally, without checking that the rendered file is 409 // in fact the result of executing the script. 410 func vizFilesAddFunc(ctx context.Context, sw SaveSwitches) writeComponentFunc { 411 return func(src qfs.Filesystem, dst qfs.MerkleDagStore, prev, ds *dataset.Dataset, added qfs.Links, sw *SaveSwitches) error { 412 if ds.Viz == nil { 413 if usePrevComponent(sw, "vz") && prev != nil && prev.Viz != nil { 414 if id, err := cidFromIPFSPath(prev.Viz.Path); err == nil { 415 added.Add(qfs.Link{Name: PackageFileViz.String(), Cid: id, IsFile: true}) 416 } 417 } 418 return errNoComponent 419 } 420 421 ds.Viz.DropTransientValues() 422 vzfs := ds.Viz.ScriptFile() 423 if vzfs != nil { 424 if err := writePackageFile(dst, NewMemfileReader(PackageFileVizScript.String(), vzfs), added); err != nil { 425 return err 426 } 427 } 428 429 renderedF := ds.Viz.RenderedFile() 430 if renderedF != nil { 431 if err := writePackageFile(dst, NewMemfileReader(PackageFileRenderedViz.String(), renderedF), added); err != nil { 432 return err 433 } 434 } else if vzfs != nil && sw.ShouldRender { 435 renderDs := &dataset.Dataset{} 436 renderDs.Assign(ds) 437 438 if bfn := bodyFilename(ds); bfn != "" { 439 if bodyLink := added.Get(bfn); bodyLink != nil { 440 bf, err := dst.(qfs.Filesystem).Get(ctx, fsPathFromCID(dst, bodyLink.Cid)) 441 if err != nil { 442 return err 443 } 444 renderDs.SetBodyFile(bf) 445 } 446 } 447 448 if vizScriptLink := added.Get(PackageFileVizScript.String()); vizScriptLink != nil { 449 sf, err := dst.(qfs.Filesystem).Get(ctx, fsPathFromCID(dst, vizScriptLink.Cid)) 450 if err != nil { 451 return err 452 } 453 renderDs.Viz.SetScriptFile(sf) 454 } 455 456 result, err := dsviz.Render(renderDs) 457 if err != nil { 458 return err 459 } 460 if err := writePackageFile(dst, NewMemfileReader(PackageFileRenderedViz.String(), result), added); err != nil { 461 return err 462 } 463 } 464 465 // viz is used for side-effects, component will be inlined into dataset component 466 return errNoComponent 467 } 468 } 469 470 func writeDatasetFile(src qfs.Filesystem, dst qfs.MerkleDagStore, prev, ds *dataset.Dataset, added qfs.Links, sw *SaveSwitches) error { 471 if added.Len() == 0 { 472 return fmt.Errorf("cannot save empty dataset") 473 } 474 475 ds.DropTransientValues() 476 updateScriptPaths(dst, ds, added) 477 setComponentRefs(dst, ds, bodyFilename(ds), added) 478 479 f, err := JSONFile(PackageFileDataset.String(), ds) 480 if err != nil { 481 return err 482 } 483 return writePackageFile(dst, f, added) 484 } 485 486 func updateScriptPaths(s qfs.MerkleDagStore, ds *dataset.Dataset, added qfs.Links) { 487 for filename, link := range added.Map() { 488 path := fsPathFromCID(s, link.Cid) 489 switch filename { 490 case PackageFileVizScript.String(): 491 ds.Viz.ScriptPath = path 492 case PackageFileRenderedViz.String(): 493 ds.Viz.RenderedPath = path 494 case PackageFileReadmeScript.String(): 495 ds.Readme.ScriptPath = path 496 } 497 } 498 } 499 500 func fsPathFromCID(s qfs.MerkleDagStore, id cid.Cid) string { 501 fs := s.(qfs.Filesystem) 502 return fmt.Sprintf("/%s/%s", fs.Type(), id.String()) 503 } 504 505 func cidFromIPFSPath(path string) (cid.Cid, error) { 506 if !strings.HasPrefix(path, "/ipfs/") { 507 return cid.Cid{}, fmt.Errorf("cannot create link to path oustide of ipfs filesystem") 508 } 509 return cid.Parse(strings.TrimPrefix(path, "/ipfs/")) 510 } 511 512 func writePackageFile(s qfs.MerkleDagStore, f fs.File, added qfs.Links) error { 513 fi, err := f.Stat() 514 if err != nil { 515 return err 516 } 517 518 res, err := s.PutFile(f) 519 if err != nil { 520 return err 521 } 522 523 added.Add(res.ToLink(fi.Name(), !fi.IsDir())) 524 return nil 525 } 526 527 func bodyFilename(ds *dataset.Dataset) string { 528 if ds.Structure == nil { 529 return "" 530 } 531 return ds.Structure.BodyFilename() 532 } 533 534 func setComponentRefs(dst qfs.MerkleDagStore, ds *dataset.Dataset, bodyFilename string, added qfs.Links) { 535 for filename, link := range added.Map() { 536 switch filename { 537 case bodyFilename: 538 ds.BodyPath = fsPathFromCID(dst, link.Cid) 539 case PackageFileCommit.String(): 540 ds.Commit = dataset.NewCommitRef(fsPathFromCID(dst, link.Cid)) 541 case PackageFileMeta.String(): 542 ds.Meta = dataset.NewMetaRef(fsPathFromCID(dst, link.Cid)) 543 case PackageFileViz.String(): 544 ds.Viz = dataset.NewVizRef(fsPathFromCID(dst, link.Cid)) 545 case PackageFileStats.String(): 546 ds.Stats = dataset.NewStatsRef(fsPathFromCID(dst, link.Cid)) 547 case PackageFileStructure.String(): 548 ds.Structure = dataset.NewStructureRef(fsPathFromCID(dst, link.Cid)) 549 // TODO(b5): bug! 550 // case PackageFileTransform.String(): 551 // ds.Transform = dataset.NewTransformRef(fsPathFromCID(dst, link.Cid)) 552 } 553 } 554 } 555 556 func usePrevComponent(sw *SaveSwitches, component string) bool { 557 if sw.Replace { 558 return false 559 } 560 for _, rev := range sw.dropRevs { 561 if rev.Field == component { 562 return false 563 } 564 } 565 return true 566 }