github.com/pkalwak/bagins@v0.0.0-20210317172317-694ac5ce2f54/bag.go (about) 1 /* 2 Package for working with files stored using the BagIt specification (see below). 3 4 It facilitates the creation of bags, adding files to the bag payload and managing 5 checksums for the file manifest as well as data stored in tag files. 6 7 For more information on Bag tagfiles see 8 http://tools.ietf.org/html/draft-kunze-bagit-09#section-2.3 9 */ 10 package bagins 11 12 /* 13 14 “He that breaks a thing to find out what it is has left the path of wisdom.” 15 16 - Gandalf the Grey 17 18 */ 19 20 import ( 21 "fmt" 22 "io" 23 "log" 24 "os" 25 "path/filepath" 26 "strings" 27 ) 28 29 // Represents the basic structure of a bag which is controlled by methods. 30 type Bag struct { 31 pathToFile string // path to the bag 32 payload *Payload 33 Manifests []*Manifest 34 tagfiles map[string]*TagFile // Key is relative path 35 excludeFromTagManifests map[string]bool 36 } 37 38 // METHODS FOR CREATING AND INITALIZING BAGS 39 40 /* 41 Creates a new bag under the location directory and creates a bag root directory 42 with the provided name. Returns an error if the location does not exist or if the 43 bag already exist. 44 45 This constructor will automatically create manifests with the 46 specified hash algorithms. Supported algorithms include: 47 48 "md5", "sha1", "sha256", "sha512", "sha224" and "sha384" 49 50 If param createTagManifests is true, this will also create tag manifests 51 with the specified algorithms. 52 53 example: 54 NewBag("archive/bags", "bag-34323", ["sha256", "md5"], true) 55 */ 56 func NewBag(location string, name string, hashNames []string, createTagManifests bool) (*Bag, error) { 57 // Create the bag object. 58 bag := new(Bag) 59 60 if bag.Manifests == nil { 61 bag.Manifests = make([]*Manifest, 0) 62 } 63 64 // Start with creating the directories. 65 bag.pathToFile = filepath.Join(location, name) 66 err := FS.Mkdir(bag.pathToFile, 0755) 67 if err != nil { 68 return nil, err 69 } 70 //defer bag.Save() 71 72 // Init the manifests and tag manifests 73 for _, hashName := range hashNames { 74 lcHashName := strings.ToLower(hashName) 75 manifest, err := NewManifest(bag.Path(), lcHashName, PayloadManifest) 76 if err != nil { 77 return nil, err 78 } 79 bag.Manifests = append(bag.Manifests, manifest) 80 81 if createTagManifests == true { 82 tagManifestName := fmt.Sprintf("tagmanifest-%s.txt", lcHashName) 83 fullPath := filepath.Join(bag.Path(), tagManifestName) 84 tagmanifest, err := NewManifest(fullPath, lcHashName, TagManifest) 85 if err != nil { 86 return nil, err 87 } 88 bag.Manifests = append(bag.Manifests, tagmanifest) 89 } 90 } 91 92 // Init the payload directory and such. 93 plPath := filepath.Join(bag.Path(), "data") 94 err = FS.Mkdir(plPath, 0755) 95 if err != nil { 96 return nil, err 97 } 98 bag.payload, err = NewPayload(plPath) 99 if err != nil { 100 return nil, err 101 } 102 103 // Init tagfiles map and create the BagIt.txt Tagfile 104 bag.tagfiles = make(map[string]*TagFile) 105 bag.excludeFromTagManifests = make(map[string]bool) 106 tf, err := bag.createBagItFile() 107 if err != nil { 108 return nil, err 109 } 110 bag.tagfiles["bagit.txt"] = tf 111 112 errors := bag.Save() 113 if err != nil && len(errors) > 0 { 114 message := "" 115 for _, e := range errors { 116 message = fmt.Sprintf("%s, %s", message, e.Error()) 117 } 118 return nil, fmt.Errorf(message) 119 } 120 121 return bag, nil 122 } 123 124 // Creates the required bagit.txt file as per the specification 125 // http://tools.ietf.org/html/draft-kunze-bagit-09#section-2.1.1 126 func (b *Bag) createBagItFile() (*TagFile, error) { 127 if err := b.AddTagfile("bagit.txt"); err != nil { 128 return nil, err 129 } 130 bagit, err := b.TagFile("bagit.txt") 131 if err != nil { 132 return nil, err 133 } 134 bagit.Data.AddField(*NewTagField("BagIt-Version", "0.97")) 135 bagit.Data.AddField(*NewTagField("Tag-File-Character-Encoding", "UTF-8")) 136 137 return bagit, nil 138 } 139 140 /* 141 Reads the directory provided as the root of a new bag and attemps to parse the file 142 contents into payload, manifests and tagfiles. 143 */ 144 func ReadBag(pathToFile string, tagfiles []string) (*Bag, error) { 145 146 // validate existence 147 fi, err := FS.Stat(pathToFile) 148 if err != nil { 149 return nil, err 150 } 151 if !fi.IsDir() { 152 return nil, fmt.Errorf("%s is not a directory.", pathToFile) 153 } 154 155 // Get the payload directory. 156 payload, err := NewPayload(filepath.Join(pathToFile, "data")) 157 if err != nil { 158 return nil, err 159 } 160 161 // Get the bag root directory. 162 bag := new(Bag) 163 bag.pathToFile = pathToFile 164 bag.payload = payload 165 bag.tagfiles = make(map[string]*TagFile) 166 bag.excludeFromTagManifests = make(map[string]bool) 167 168 errors := bag.findManifests() 169 if errors != nil { 170 errorMessage := "" 171 for _, e := range errors { 172 errorMessage = fmt.Sprintf("%s; %s", errorMessage, e.Error()) 173 } 174 return nil, fmt.Errorf(errorMessage) 175 } 176 if len(bag.Manifests) == 0 { 177 return nil, fmt.Errorf("Unable to parse a manifest") 178 } 179 180 for i := range bag.Manifests { 181 manifest := bag.Manifests[i] 182 manifestPath := manifest.Name() 183 if filepath.Dir(manifestPath) != bag.pathToFile { 184 manifestPath = filepath.Join(bag.pathToFile, manifest.Name()) 185 } 186 if _, err := FS.Stat(manifestPath); err != nil { 187 return nil, fmt.Errorf("Can't find manifest: %v", err) 188 } 189 parsedManifest, errs := ReadManifest(manifestPath) 190 if errs != nil && len(errs) > 0 { 191 errors := "" 192 for _, e := range errs { 193 errors = fmt.Sprintf("%s; %s", errors, e.Error()) 194 } 195 return nil, fmt.Errorf("Unable to parse manifest %s: %s", manifestPath, errors) 196 } else { 197 bag.Manifests[i] = parsedManifest 198 } 199 } 200 201 /* 202 Note that we are parsing tags from the expected tag files, and 203 not parsing tags from unexpected tag files. This is per the BagIt 204 spec for V0.97, section 2.2.4, as described here: 205 206 http://tools.ietf.org/html/draft-kunze-bagit-13#section-2.2.4 207 208 A bag MAY contain other tag files that are not defined by this 209 specification. Implementations SHOULD ignore the content of any 210 unexpected tag files, except when they are listed in a tag manifest. 211 When unexpected tag files are listed in a tag manifest, 212 implementations MUST only treat the content of those tag files as 213 octet streams for the purpose of checksum verification. 214 */ 215 for _, tName := range tagfiles { 216 tf, errs := ReadTagFile(filepath.Join(bag.pathToFile, tName)) 217 // Warn on Stderr only if we're running as bagmaker 218 if len(errs) != 0 && strings.Index(os.Args[0], "bagmaker") > -1 { 219 log.Println("While parsing tagfiles:", errs) 220 } 221 if tf != nil { 222 bag.tagfiles[tName] = tf 223 } 224 } 225 226 return bag, nil 227 } 228 229 // Finds all payload and tag manifests in an existing bag. 230 // This is used by ReadBag, not when creating a bag. 231 func (b *Bag) findManifests() []error { 232 if b.Manifests == nil { 233 b.Manifests = make([]*Manifest, 0) 234 } 235 if len(b.Manifests) == 0 { 236 bagFiles, _ := b.ListFiles() 237 for _, fName := range bagFiles { 238 239 filePath := filepath.Join(b.pathToFile, fName) 240 payloadManifestPrefix := filepath.Join(b.pathToFile, "manifest-") 241 tagManifestPrefix := filepath.Join(b.pathToFile, "tagmanifest-") 242 243 if strings.HasPrefix(filePath, payloadManifestPrefix) || 244 strings.HasPrefix(filePath, tagManifestPrefix) { 245 manifest, errors := ReadManifest(filePath) 246 if errors != nil && len(errors) > 0 { 247 return errors 248 } 249 b.Manifests = append(b.Manifests, manifest) 250 } 251 } 252 } 253 return nil 254 } 255 256 // METHODS FOR MANAGING BAG PAYLOADS 257 258 /* 259 Adds a file specified by src parameter to the data directory under 260 the relative path and filename provided in the dst parameter. 261 example: 262 err := b.AddFile("/tmp/myfile.txt", "myfile.txt") 263 */ 264 func (b *Bag) AddFile(src string, dst string) error { 265 payloadManifests := b.GetManifests(PayloadManifest) 266 _, err := b.payload.Add(src, dst, payloadManifests) 267 if err != nil { 268 return err 269 } 270 return err 271 } 272 273 // Performans a Bag.AddFile on all files found under the src 274 // location including all subdirectories. 275 // example: 276 // errs := b.AddDir("/tmp/mypreservationfiles") 277 func (b *Bag) AddDir(src string) (errs []error) { 278 payloadManifests := b.GetManifests(PayloadManifest) 279 _, errs = b.payload.AddAll(src, payloadManifests) 280 return errs 281 } 282 283 // METHODS FOR MANAGING BAG TAG FILES 284 285 /* 286 Adds a tagfile to the bag with the filename provided, 287 creating whatever subdirectories are needed if supplied 288 as part of name parameter. 289 example: 290 err := b.AddTagfile("baginfo.txt") 291 292 Note that this is for adding tag files that adhere to 293 the "Text Tag File Format" described in section 2.2.4 294 of the BagIt spec at http://tools.ietf.org/html/draft-kunze-bagit-13. 295 296 For this type of tag file, you add name-value pairs to 297 the tag file's Data attribute, and this library ensures 298 that the data is written to the file according to the 299 specification. 300 301 The spec also allows you to add non-standard tag files 302 in ANY format. For that, see AddCustomTagfile. 303 */ 304 func (b *Bag) AddTagfile(name string) error { 305 tagFilePath := filepath.Join(b.Path(), name) 306 if err := FS.MkdirAll(filepath.Dir(tagFilePath), 0766); err != nil { 307 return err 308 } 309 tf, err := NewTagFile(tagFilePath) 310 if err != nil { 311 return err 312 } 313 b.tagfiles[name] = tf 314 if err := tf.Create(); err != nil { 315 return err 316 } 317 return nil 318 } 319 320 /* 321 AddCustomTagfile adds a tag file of ANY format into the 322 bag at the specified path without making any attempt to 323 validate or even read the contents of the custom tag file. 324 325 The sourcePath param describes where the file should be 326 copied from. The destPath param describes what the file's 327 relative path in the bag should be, while includeInTagManifests 328 describes whether the custom tag file should be included in 329 the bag's tag manifests. 330 331 The destPath parameter cannot start with "data/" because 332 that would put it in the payload directory, and it cannot 333 start with a slash or contain "..". 334 335 Example: 336 337 bag.AddCustomTagfile("/home/june/cleaver.xml", "customtags/cleaver-meta.xml", true) 338 339 That says put "/home/june/cleaver.xml" into the bag at 340 "customtags/cleaver-meta.xml" and record it in the tagmanifests 341 with the appropriate checksums. 342 */ 343 func (b *Bag) AddCustomTagfile(sourcePath string, destPath string, includeInTagManifests bool) error { 344 if strings.HasPrefix(destPath, "/data") || 345 strings.HasPrefix(destPath, "/") || strings.Contains(destPath, "..") { 346 return fmt.Errorf("Illegal value '%s' for param destPath. "+ 347 "File name cannot start with '/' or '/data' or contain '..'", destPath) 348 } 349 350 absSourcePath, err := filepath.Abs(sourcePath) 351 if err != nil { 352 return err 353 } 354 absDestPath, err := filepath.Abs(filepath.Join(b.pathToFile, destPath)) 355 if err != nil { 356 return err 357 } 358 359 if absSourcePath != absDestPath { 360 sourceFile, err := FS.Open(absSourcePath) 361 if err != nil { 362 return err 363 } 364 defer sourceFile.Close() 365 366 if err = FS.MkdirAll(filepath.Dir(absDestPath), 0766); err != nil { 367 return err 368 } 369 destFile, err := FS.Create(absDestPath) 370 if err != nil { 371 return err 372 } 373 defer destFile.Close() 374 375 _, err = io.Copy(destFile, sourceFile) 376 if err != nil { 377 return err 378 } 379 } 380 381 // The Save() function puts all non-payload, non-manifest files 382 // into the tag manifests by default. So we only need to keep 383 // a map of what to exclude. 384 if includeInTagManifests == false { 385 if b.excludeFromTagManifests == nil { 386 b.excludeFromTagManifests = make(map[string]bool) 387 } 388 b.excludeFromTagManifests[destPath] = true 389 } 390 391 return nil 392 } 393 394 /* 395 Finds a tagfile in by its relative path to the bag root directory. 396 example: 397 tf, err := b.TagFile("bag-info.txt") 398 */ 399 func (b *Bag) TagFile(name string) (*TagFile, error) { 400 if tf, ok := b.tagfiles[name]; ok { 401 return tf, nil 402 } 403 return nil, fmt.Errorf("Unable to find tagfile %s", name) 404 } 405 406 /* 407 Lists all the current tag files the bag is tracking. 408 These are the tag files that the bag has actually parsed. 409 The bag may have any number of unparsed (and perhaps unreadable) 410 tag files as well. For those, see UnparsedTagFiles() 411 */ 412 func (b *Bag) ListTagFiles() []string { 413 names := make([]string, len(b.tagfiles)) 414 i := 0 415 for k := range b.tagfiles { 416 names[i] = k 417 i++ 418 } 419 return names 420 } 421 422 // Returns a list of unparsed tag files, which includes any file 423 // not a manifest, not in the data directory, and not among the 424 // tag files passed into ReadBag(). 425 func (b *Bag) UnparsedTagFiles() ([]string, error) { 426 var files []string 427 428 // WalkDir function to collect files in the bag.. 429 visit := func(pathToFile string, info os.FileInfo, err error) error { 430 if err != nil { 431 return err 432 } 433 434 relativePath, err := filepath.Rel(b.Path(), pathToFile) 435 if err != nil { 436 return err 437 } 438 439 isPayload := strings.HasPrefix(pathToFile, b.payload.Name()) 440 isManifest := strings.HasPrefix(relativePath, "tagmanifest-") || 441 strings.HasPrefix(relativePath, "manifest-") 442 _, isParsedTagFile := b.tagfiles[relativePath] 443 444 if !info.IsDir() && !isPayload && !isParsedTagFile && !isManifest { 445 if relativePath != "." { 446 files = append(files, relativePath) 447 } 448 } 449 return err 450 } 451 452 if err := FS.Walk(b.Path(), visit); err != nil { 453 return nil, err 454 } 455 456 return files, nil 457 } 458 459 /* 460 Convienence method to return the bag-info.txt tag file if it exists. Since 461 this is optional it will not be created by default and will return an error 462 if you have not defined or added it yourself via Bag.AddTagfile 463 */ 464 func (b *Bag) BagInfo() (*TagFile, error) { 465 tf, err := b.TagFile("bag-info.txt") 466 if err != nil { 467 return nil, err 468 } 469 return tf, nil 470 } 471 472 // Returns the manifest with the specified algorithm and type, 473 // or nil. For example, GetManifest(PayloadManifest, "sha256") 474 // returns either a reference to manifest-sha256.txt or nil. 475 // GetManifest(TagManifest, "md5") returns a reference to 476 // tagmanifest-md5.txt or nil. 477 func (b *Bag) GetManifest(manifestType, algorithm string) *Manifest { 478 for _, m := range b.Manifests { 479 if m.Type() == manifestType && m.Algorithm() == algorithm { 480 return m 481 } 482 } 483 return nil 484 } 485 486 // Returns the manifests of the specified type, 487 // or an empty slice. For example, GetManifests(PayloadManifest) 488 // returns all of the payload manifests. 489 func (b *Bag) GetManifests(manifestType string) []*Manifest { 490 manifests := make([]*Manifest, 0) 491 for _, m := range b.Manifests { 492 if m.Type() == manifestType { 493 manifests = append(manifests, m) 494 } 495 } 496 return manifests 497 } 498 499 // TODO create methods for managing fetch file. 500 501 // METHODS FOR MANAGING OR RETURNING INFORMATION ABOUT THE BAG ITSELF 502 503 // Returns the full path of the bag including it's own directory. 504 func (b *Bag) Path() string { 505 return b.pathToFile 506 } 507 508 /* 509 This method writes all the relevant tag and manifest files to finish off the 510 bag. 511 */ 512 func (b *Bag) Save() (errs []error) { 513 514 errors := b.savePayloadManifests() 515 if len(errors) > 0 { 516 errs = append(errs, errors...) 517 } 518 519 errors = b.calculateChecksumsForManagedTagFiles() 520 if len(errors) > 0 { 521 errs = append(errs, errors...) 522 } 523 524 errors = b.calculateChecksumsForCustomTagFiles() 525 if len(errors) > 0 { 526 errs = append(errs, errors...) 527 } 528 529 errors = b.saveTagManifests() 530 if len(errors) > 0 { 531 errs = append(errs, errors...) 532 } 533 534 return errs 535 } 536 537 func (b *Bag) savePayloadManifests() (errs []error) { 538 // Write the payload manifests first because we may 539 // need to include their checksums in the tagmanifests. 540 payloadManifests := b.GetManifests(PayloadManifest) 541 for i := range payloadManifests { 542 manifest := payloadManifests[i] 543 if err := manifest.Create(); err != nil { 544 errs = append(errs, err) 545 } 546 } 547 return errs 548 } 549 550 func (b *Bag) calculateChecksumsForManagedTagFiles() (errs []error) { 551 tagManifests := b.GetManifests(TagManifest) 552 for _, tf := range b.tagfiles { 553 if err := FS.MkdirAll(filepath.Dir(tf.Name()), 0766); err != nil { 554 errs = append(errs, err) 555 } 556 if err := tf.Create(); err != nil { 557 errs = append(errs, err) 558 } 559 // Add tag file checksums to tag manifests 560 for i := range tagManifests { 561 manifest := tagManifests[i] 562 checksum, err := FileChecksum(tf.Name(), manifest.hashFunc()) 563 if err != nil { 564 errors := []error{ 565 fmt.Errorf("Error calculating %s checksum for file %s: %v", 566 manifest.Algorithm(), tf.Name(), err), 567 } 568 return errors 569 } 570 relativeFilePath := strings.Replace(tf.Name(), b.pathToFile+"/", "", 1) 571 manifest.Data[relativeFilePath] = checksum 572 } 573 } 574 return errs 575 } 576 577 func (b *Bag) calculateChecksumsForCustomTagFiles() (errs []error) { 578 // Calculate checksums that go into the tag manifests. 579 nonPayloadFiles, err := b.UnparsedTagFiles() 580 if err != nil { 581 errs = append(errs, err) 582 } 583 payloadManifests := b.GetManifests(PayloadManifest) 584 tagManifests := b.GetManifests(TagManifest) 585 for _, m := range payloadManifests { 586 nonPayloadFiles = append(nonPayloadFiles, m.Name()) 587 } 588 for _, file := range nonPayloadFiles { 589 relativeFilePath := strings.Replace(file, b.pathToFile+"/", "", 1) 590 if _, exclude := b.excludeFromTagManifests[relativeFilePath]; exclude { 591 continue 592 } 593 // Use relative path in manifest, abs path when calculating checksum. 594 absPathToFile := file 595 if !strings.HasPrefix(file, b.pathToFile) { 596 absPathToFile = filepath.Join(b.pathToFile, file) 597 } 598 for i := range tagManifests { 599 manifest := tagManifests[i] 600 checksum, err := FileChecksum(absPathToFile, manifest.hashFunc()) 601 if err != nil { 602 errors := []error{ 603 fmt.Errorf("Error calculating %s checksum for file %s: %v", 604 manifest.Algorithm(), file, err), 605 } 606 return errors 607 } 608 manifest.Data[relativeFilePath] = checksum 609 } 610 } 611 return errs 612 } 613 614 func (b *Bag) saveTagManifests() (errs []error) { 615 tagManifests := b.GetManifests(TagManifest) 616 for i := range tagManifests { 617 manifest := tagManifests[i] 618 if err := manifest.Create(); err != nil { 619 errs = append(errs, err) 620 } 621 } 622 return errs 623 } 624 625 /* 626 Walks the bag directory and subdirectories and returns the 627 filepaths found inside and any errors skipping files in the 628 payload directory. 629 */ 630 func (b *Bag) ListFiles() ([]string, error) { 631 632 var files []string 633 634 // WalkDir function to collect files in the bag.. 635 visit := func(pathToFile string, info os.FileInfo, err error) error { 636 if err != nil { 637 return err 638 } 639 640 if !info.IsDir() { 641 fp, err := filepath.Rel(b.Path(), pathToFile) 642 if err != nil { 643 return err 644 } 645 if fp != "." { 646 files = append(files, fp) 647 } 648 } 649 return err 650 } 651 652 if err := FS.Walk(b.Path(), visit); err != nil { 653 return nil, err 654 } 655 656 return files, nil 657 }