github.com/pkalwak/bagins@v0.0.0-20210317172317-694ac5ce2f54/bag.go (about)

     1  /*
     2  Package for working with files stored using the BagIt specification (see below).
     3  
     4  It facilitates the creation of bags, adding files to the bag payload and managing
     5  checksums for the file manifest as well as data stored in tag files.
     6  
     7  For more information on Bag tagfiles see
     8  http://tools.ietf.org/html/draft-kunze-bagit-09#section-2.3
     9  */
    10  package bagins
    11  
    12  /*
    13  
    14  “He that breaks a thing to find out what it is has left the path of wisdom.”
    15  
    16  - Gandalf the Grey
    17  
    18  */
    19  
    20  import (
    21  	"fmt"
    22  	"io"
    23  	"log"
    24  	"os"
    25  	"path/filepath"
    26  	"strings"
    27  )
    28  
// Bag represents the basic structure of a bag which is controlled by methods.
// Values are built by NewBag (new bag on disk) or ReadBag (existing bag).
type Bag struct {
	// Root directory of the bag; returned by Path().
	pathToFile              string // path to the bag
	// Payload handle; both constructors build it from the "data"
	// subdirectory of the bag root.
	payload                 *Payload
	// Payload manifests and tag manifests together; GetManifest and
	// GetManifests filter this slice by type.
	Manifests               []*Manifest
	// Tag files registered through AddTagfile or parsed by ReadBag.
	tagfiles                map[string]*TagFile // Key is relative path
	// Relative paths of custom tag files that must be left out of the
	// tag manifests when checksums are calculated during Save.
	excludeFromTagManifests map[string]bool
}
    37  
// METHODS FOR CREATING AND INITIALIZING BAGS
    39  
    40  /*
    41   Creates a new bag under the location directory and creates a bag root directory
    42   with the provided name.  Returns an error if the location does not exist or if the
    43   bag already exist.
    44  
    45   This constructor will automatically create manifests with the
    46   specified hash algorithms. Supported algorithms include:
    47  
    48   "md5", "sha1", "sha256", "sha512", "sha224" and "sha384"
    49  
    50   If param createTagManifests is true, this will also create tag manifests
    51   with the specified algorithms.
    52  
    53   example:
    54  		NewBag("archive/bags", "bag-34323", ["sha256", "md5"], true)
    55  */
    56  func NewBag(location string, name string, hashNames []string, createTagManifests bool) (*Bag, error) {
    57  	// Create the bag object.
    58  	bag := new(Bag)
    59  
    60  	if bag.Manifests == nil {
    61  		bag.Manifests = make([]*Manifest, 0)
    62  	}
    63  
    64  	// Start with creating the directories.
    65  	bag.pathToFile = filepath.Join(location, name)
    66  	err := FS.Mkdir(bag.pathToFile, 0755)
    67  	if err != nil {
    68  		return nil, err
    69  	}
    70  	//defer bag.Save()
    71  
    72  	// Init the manifests and tag manifests
    73  	for _, hashName := range hashNames {
    74  		lcHashName := strings.ToLower(hashName)
    75  		manifest, err := NewManifest(bag.Path(), lcHashName, PayloadManifest)
    76  		if err != nil {
    77  			return nil, err
    78  		}
    79  		bag.Manifests = append(bag.Manifests, manifest)
    80  
    81  		if createTagManifests == true {
    82  			tagManifestName := fmt.Sprintf("tagmanifest-%s.txt", lcHashName)
    83  			fullPath := filepath.Join(bag.Path(), tagManifestName)
    84  			tagmanifest, err := NewManifest(fullPath, lcHashName, TagManifest)
    85  			if err != nil {
    86  				return nil, err
    87  			}
    88  			bag.Manifests = append(bag.Manifests, tagmanifest)
    89  		}
    90  	}
    91  
    92  	// Init the payload directory and such.
    93  	plPath := filepath.Join(bag.Path(), "data")
    94  	err = FS.Mkdir(plPath, 0755)
    95  	if err != nil {
    96  		return nil, err
    97  	}
    98  	bag.payload, err = NewPayload(plPath)
    99  	if err != nil {
   100  		return nil, err
   101  	}
   102  
   103  	// Init tagfiles map and create the BagIt.txt Tagfile
   104  	bag.tagfiles = make(map[string]*TagFile)
   105  	bag.excludeFromTagManifests = make(map[string]bool)
   106  	tf, err := bag.createBagItFile()
   107  	if err != nil {
   108  		return nil, err
   109  	}
   110  	bag.tagfiles["bagit.txt"] = tf
   111  
   112  	errors := bag.Save()
   113  	if err != nil && len(errors) > 0 {
   114  		message := ""
   115  		for _, e := range errors {
   116  			message = fmt.Sprintf("%s, %s", message, e.Error())
   117  		}
   118  		return nil, fmt.Errorf(message)
   119  	}
   120  
   121  	return bag, nil
   122  }
   123  
   124  // Creates the required bagit.txt file as per the specification
   125  // http://tools.ietf.org/html/draft-kunze-bagit-09#section-2.1.1
   126  func (b *Bag) createBagItFile() (*TagFile, error) {
   127  	if err := b.AddTagfile("bagit.txt"); err != nil {
   128  		return nil, err
   129  	}
   130  	bagit, err := b.TagFile("bagit.txt")
   131  	if err != nil {
   132  		return nil, err
   133  	}
   134  	bagit.Data.AddField(*NewTagField("BagIt-Version", "0.97"))
   135  	bagit.Data.AddField(*NewTagField("Tag-File-Character-Encoding", "UTF-8"))
   136  
   137  	return bagit, nil
   138  }
   139  
   140  /*
   141  	Reads the directory provided as the root of a new bag and attemps to parse the file
   142  	contents into payload, manifests and tagfiles.
   143  */
   144  func ReadBag(pathToFile string, tagfiles []string) (*Bag, error) {
   145  
   146  	// validate existence
   147  	fi, err := FS.Stat(pathToFile)
   148  	if err != nil {
   149  		return nil, err
   150  	}
   151  	if !fi.IsDir() {
   152  		return nil, fmt.Errorf("%s is not a directory.", pathToFile)
   153  	}
   154  
   155  	// Get the payload directory.
   156  	payload, err := NewPayload(filepath.Join(pathToFile, "data"))
   157  	if err != nil {
   158  		return nil, err
   159  	}
   160  
   161  	// Get the bag root directory.
   162  	bag := new(Bag)
   163  	bag.pathToFile = pathToFile
   164  	bag.payload = payload
   165  	bag.tagfiles = make(map[string]*TagFile)
   166  	bag.excludeFromTagManifests = make(map[string]bool)
   167  
   168  	errors := bag.findManifests()
   169  	if errors != nil {
   170  		errorMessage := ""
   171  		for _, e := range errors {
   172  			errorMessage = fmt.Sprintf("%s; %s", errorMessage, e.Error())
   173  		}
   174  		return nil, fmt.Errorf(errorMessage)
   175  	}
   176  	if len(bag.Manifests) == 0 {
   177  		return nil, fmt.Errorf("Unable to parse a manifest")
   178  	}
   179  
   180  	for i := range bag.Manifests {
   181  		manifest := bag.Manifests[i]
   182  		manifestPath := manifest.Name()
   183  		if filepath.Dir(manifestPath) != bag.pathToFile {
   184  			manifestPath = filepath.Join(bag.pathToFile, manifest.Name())
   185  		}
   186  		if _, err := FS.Stat(manifestPath); err != nil {
   187  			return nil, fmt.Errorf("Can't find manifest: %v", err)
   188  		}
   189  		parsedManifest, errs := ReadManifest(manifestPath)
   190  		if errs != nil && len(errs) > 0 {
   191  			errors := ""
   192  			for _, e := range errs {
   193  				errors = fmt.Sprintf("%s; %s", errors, e.Error())
   194  			}
   195  			return nil, fmt.Errorf("Unable to parse manifest %s: %s", manifestPath, errors)
   196  		} else {
   197  			bag.Manifests[i] = parsedManifest
   198  		}
   199  	}
   200  
   201  	/*
   202  		       Note that we are parsing tags from the expected tag files, and
   203  		       not parsing tags from unexpected tag files. This is per the BagIt
   204  		       spec for V0.97, section 2.2.4, as described here:
   205  
   206  		       http://tools.ietf.org/html/draft-kunze-bagit-13#section-2.2.4
   207  
   208  		       A bag MAY contain other tag files that are not defined by this
   209  		       specification.  Implementations SHOULD ignore the content of any
   210  			   unexpected tag files, except when they are listed in a tag manifest.
   211  		       When unexpected tag files are listed in a tag manifest,
   212  		       implementations MUST only treat the content of those tag files as
   213  		       octet streams for the purpose of checksum verification.
   214  	*/
   215  	for _, tName := range tagfiles {
   216  		tf, errs := ReadTagFile(filepath.Join(bag.pathToFile, tName))
   217  		// Warn on Stderr only if we're running as bagmaker
   218  		if len(errs) != 0 && strings.Index(os.Args[0], "bagmaker") > -1 {
   219  			log.Println("While parsing tagfiles:", errs)
   220  		}
   221  		if tf != nil {
   222  			bag.tagfiles[tName] = tf
   223  		}
   224  	}
   225  
   226  	return bag, nil
   227  }
   228  
   229  // Finds all payload and tag manifests in an existing bag.
   230  // This is used by ReadBag, not when creating a bag.
   231  func (b *Bag) findManifests() []error {
   232  	if b.Manifests == nil {
   233  		b.Manifests = make([]*Manifest, 0)
   234  	}
   235  	if len(b.Manifests) == 0 {
   236  		bagFiles, _ := b.ListFiles()
   237  		for _, fName := range bagFiles {
   238  
   239  			filePath := filepath.Join(b.pathToFile, fName)
   240  			payloadManifestPrefix := filepath.Join(b.pathToFile, "manifest-")
   241  			tagManifestPrefix := filepath.Join(b.pathToFile, "tagmanifest-")
   242  
   243  			if strings.HasPrefix(filePath, payloadManifestPrefix) ||
   244  				strings.HasPrefix(filePath, tagManifestPrefix) {
   245  				manifest, errors := ReadManifest(filePath)
   246  				if errors != nil && len(errors) > 0 {
   247  					return errors
   248  				}
   249  				b.Manifests = append(b.Manifests, manifest)
   250  			}
   251  		}
   252  	}
   253  	return nil
   254  }
   255  
   256  // METHODS FOR MANAGING BAG PAYLOADS
   257  
   258  /*
   259    Adds a file specified by src parameter to the data directory under
   260    the relative path and filename provided in the dst parameter.
   261    example:
   262  			err := b.AddFile("/tmp/myfile.txt", "myfile.txt")
   263  */
   264  func (b *Bag) AddFile(src string, dst string) error {
   265  	payloadManifests := b.GetManifests(PayloadManifest)
   266  	_, err := b.payload.Add(src, dst, payloadManifests)
   267  	if err != nil {
   268  		return err
   269  	}
   270  	return err
   271  }
   272  
   273  // Performans a Bag.AddFile on all files found under the src
   274  // location including all subdirectories.
   275  // example:
   276  //			errs := b.AddDir("/tmp/mypreservationfiles")
   277  func (b *Bag) AddDir(src string) (errs []error) {
   278  	payloadManifests := b.GetManifests(PayloadManifest)
   279  	_, errs = b.payload.AddAll(src, payloadManifests)
   280  	return errs
   281  }
   282  
   283  // METHODS FOR MANAGING BAG TAG FILES
   284  
   285  /*
   286   Adds a tagfile to the bag with the filename provided,
   287   creating whatever subdirectories are needed if supplied
   288   as part of name parameter.
   289   example:
   290  			err := b.AddTagfile("baginfo.txt")
   291  
   292   Note that this is for adding tag files that adhere to
   293   the "Text Tag File Format" described in section 2.2.4
   294   of the BagIt spec at http://tools.ietf.org/html/draft-kunze-bagit-13.
   295  
   296   For this type of tag file, you add name-value pairs to
   297   the tag file's Data attribute, and this library ensures
   298   that the data is written to the file according to the
   299   specification.
   300  
   301   The spec also allows you to add non-standard tag files
   302   in ANY format. For that, see AddCustomTagfile.
   303  */
   304  func (b *Bag) AddTagfile(name string) error {
   305  	tagFilePath := filepath.Join(b.Path(), name)
   306  	if err := FS.MkdirAll(filepath.Dir(tagFilePath), 0766); err != nil {
   307  		return err
   308  	}
   309  	tf, err := NewTagFile(tagFilePath)
   310  	if err != nil {
   311  		return err
   312  	}
   313  	b.tagfiles[name] = tf
   314  	if err := tf.Create(); err != nil {
   315  		return err
   316  	}
   317  	return nil
   318  }
   319  
   320  /*
   321   AddCustomTagfile adds a tag file of ANY format into the
   322   bag at the specified path without making any attempt to
   323   validate or even read the contents of the custom tag file.
   324  
   325   The sourcePath param describes where the file should be
   326   copied from. The destPath param describes what the file's
   327   relative path in the bag should be, while includeInTagManifests
   328   describes whether the custom tag file should be included in
   329   the bag's tag manifests.
   330  
   331   The destPath parameter cannot start with "data/" because
   332   that would put it in the payload directory, and it cannot
   333   start with a slash or contain "..".
   334  
   335   Example:
   336  
   337   bag.AddCustomTagfile("/home/june/cleaver.xml", "customtags/cleaver-meta.xml", true)
   338  
   339   That says put "/home/june/cleaver.xml" into the bag at
   340   "customtags/cleaver-meta.xml" and record it in the tagmanifests
   341   with the appropriate checksums.
   342  */
   343  func (b *Bag) AddCustomTagfile(sourcePath string, destPath string, includeInTagManifests bool) error {
   344  	if strings.HasPrefix(destPath, "/data") ||
   345  		strings.HasPrefix(destPath, "/") || strings.Contains(destPath, "..") {
   346  		return fmt.Errorf("Illegal value '%s' for param destPath. "+
   347  			"File name cannot start with '/' or '/data' or contain '..'", destPath)
   348  	}
   349  
   350  	absSourcePath, err := filepath.Abs(sourcePath)
   351  	if err != nil {
   352  		return err
   353  	}
   354  	absDestPath, err := filepath.Abs(filepath.Join(b.pathToFile, destPath))
   355  	if err != nil {
   356  		return err
   357  	}
   358  
   359  	if absSourcePath != absDestPath {
   360  		sourceFile, err := FS.Open(absSourcePath)
   361  		if err != nil {
   362  			return err
   363  		}
   364  		defer sourceFile.Close()
   365  
   366  		if err = FS.MkdirAll(filepath.Dir(absDestPath), 0766); err != nil {
   367  			return err
   368  		}
   369  		destFile, err := FS.Create(absDestPath)
   370  		if err != nil {
   371  			return err
   372  		}
   373  		defer destFile.Close()
   374  
   375  		_, err = io.Copy(destFile, sourceFile)
   376  		if err != nil {
   377  			return err
   378  		}
   379  	}
   380  
   381  	// The Save() function puts all non-payload, non-manifest files
   382  	// into the tag manifests by default. So we only need to keep
   383  	// a map of what to exclude.
   384  	if includeInTagManifests == false {
   385  		if b.excludeFromTagManifests == nil {
   386  			b.excludeFromTagManifests = make(map[string]bool)
   387  		}
   388  		b.excludeFromTagManifests[destPath] = true
   389  	}
   390  
   391  	return nil
   392  }
   393  
   394  /*
   395   Finds a tagfile in by its relative path to the bag root directory.
   396   example:
   397  			tf, err := b.TagFile("bag-info.txt")
   398  */
   399  func (b *Bag) TagFile(name string) (*TagFile, error) {
   400  	if tf, ok := b.tagfiles[name]; ok {
   401  		return tf, nil
   402  	}
   403  	return nil, fmt.Errorf("Unable to find tagfile %s", name)
   404  }
   405  
   406  /*
   407    Lists all the current tag files the bag is tracking.
   408    These are the tag files that the bag has actually parsed.
   409    The bag may have any number of unparsed (and perhaps unreadable)
   410    tag files as well. For those, see UnparsedTagFiles()
   411  */
   412  func (b *Bag) ListTagFiles() []string {
   413  	names := make([]string, len(b.tagfiles))
   414  	i := 0
   415  	for k := range b.tagfiles {
   416  		names[i] = k
   417  		i++
   418  	}
   419  	return names
   420  }
   421  
   422  // Returns a list of unparsed tag files, which includes any file
   423  // not a manifest, not in the data directory, and not among the
   424  // tag files passed into ReadBag().
   425  func (b *Bag) UnparsedTagFiles() ([]string, error) {
   426  	var files []string
   427  
   428  	// WalkDir function to collect files in the bag..
   429  	visit := func(pathToFile string, info os.FileInfo, err error) error {
   430  		if err != nil {
   431  			return err
   432  		}
   433  
   434  		relativePath, err := filepath.Rel(b.Path(), pathToFile)
   435  		if err != nil {
   436  			return err
   437  		}
   438  
   439  		isPayload := strings.HasPrefix(pathToFile, b.payload.Name())
   440  		isManifest := strings.HasPrefix(relativePath, "tagmanifest-") ||
   441  			strings.HasPrefix(relativePath, "manifest-")
   442  		_, isParsedTagFile := b.tagfiles[relativePath]
   443  
   444  		if !info.IsDir() && !isPayload && !isParsedTagFile && !isManifest {
   445  			if relativePath != "." {
   446  				files = append(files, relativePath)
   447  			}
   448  		}
   449  		return err
   450  	}
   451  
   452  	if err := FS.Walk(b.Path(), visit); err != nil {
   453  		return nil, err
   454  	}
   455  
   456  	return files, nil
   457  }
   458  
   459  /*
   460   Convienence method to return the bag-info.txt tag file if it exists.  Since
   461   this is optional it will not be created by default and will return an error
   462   if you have not defined or added it yourself via Bag.AddTagfile
   463  */
   464  func (b *Bag) BagInfo() (*TagFile, error) {
   465  	tf, err := b.TagFile("bag-info.txt")
   466  	if err != nil {
   467  		return nil, err
   468  	}
   469  	return tf, nil
   470  }
   471  
   472  // Returns the manifest with the specified algorithm and type,
   473  // or nil. For example, GetManifest(PayloadManifest, "sha256")
   474  // returns either a reference to manifest-sha256.txt or nil.
   475  // GetManifest(TagManifest, "md5") returns a reference to
   476  // tagmanifest-md5.txt or nil.
   477  func (b *Bag) GetManifest(manifestType, algorithm string) *Manifest {
   478  	for _, m := range b.Manifests {
   479  		if m.Type() == manifestType && m.Algorithm() == algorithm {
   480  			return m
   481  		}
   482  	}
   483  	return nil
   484  }
   485  
   486  // Returns the manifests of the specified type,
   487  // or an empty slice. For example, GetManifests(PayloadManifest)
   488  // returns all of the payload manifests.
   489  func (b *Bag) GetManifests(manifestType string) []*Manifest {
   490  	manifests := make([]*Manifest, 0)
   491  	for _, m := range b.Manifests {
   492  		if m.Type() == manifestType {
   493  			manifests = append(manifests, m)
   494  		}
   495  	}
   496  	return manifests
   497  }
   498  
   499  // TODO create methods for managing fetch file.
   500  
   501  // METHODS FOR MANAGING OR RETURNING INFORMATION ABOUT THE BAG ITSELF
   502  
   503  // Returns the full path of the bag including it's own directory.
   504  func (b *Bag) Path() string {
   505  	return b.pathToFile
   506  }
   507  
   508  /*
   509   This method writes all the relevant tag and manifest files to finish off the
   510   bag.
   511  */
   512  func (b *Bag) Save() (errs []error) {
   513  
   514  	errors := b.savePayloadManifests()
   515  	if len(errors) > 0 {
   516  		errs = append(errs, errors...)
   517  	}
   518  
   519  	errors = b.calculateChecksumsForManagedTagFiles()
   520  	if len(errors) > 0 {
   521  		errs = append(errs, errors...)
   522  	}
   523  
   524  	errors = b.calculateChecksumsForCustomTagFiles()
   525  	if len(errors) > 0 {
   526  		errs = append(errs, errors...)
   527  	}
   528  
   529  	errors = b.saveTagManifests()
   530  	if len(errors) > 0 {
   531  		errs = append(errs, errors...)
   532  	}
   533  
   534  	return errs
   535  }
   536  
   537  func (b *Bag) savePayloadManifests() (errs []error) {
   538  	// Write the payload manifests first because we may
   539  	// need to include their checksums in the tagmanifests.
   540  	payloadManifests := b.GetManifests(PayloadManifest)
   541  	for i := range payloadManifests {
   542  		manifest := payloadManifests[i]
   543  		if err := manifest.Create(); err != nil {
   544  			errs = append(errs, err)
   545  		}
   546  	}
   547  	return errs
   548  }
   549  
   550  func (b *Bag) calculateChecksumsForManagedTagFiles() (errs []error) {
   551  	tagManifests := b.GetManifests(TagManifest)
   552  	for _, tf := range b.tagfiles {
   553  		if err := FS.MkdirAll(filepath.Dir(tf.Name()), 0766); err != nil {
   554  			errs = append(errs, err)
   555  		}
   556  		if err := tf.Create(); err != nil {
   557  			errs = append(errs, err)
   558  		}
   559  		// Add tag file checksums to tag manifests
   560  		for i := range tagManifests {
   561  			manifest := tagManifests[i]
   562  			checksum, err := FileChecksum(tf.Name(), manifest.hashFunc())
   563  			if err != nil {
   564  				errors := []error{
   565  					fmt.Errorf("Error calculating %s checksum for file %s: %v",
   566  						manifest.Algorithm(), tf.Name(), err),
   567  				}
   568  				return errors
   569  			}
   570  			relativeFilePath := strings.Replace(tf.Name(), b.pathToFile+"/", "", 1)
   571  			manifest.Data[relativeFilePath] = checksum
   572  		}
   573  	}
   574  	return errs
   575  }
   576  
   577  func (b *Bag) calculateChecksumsForCustomTagFiles() (errs []error) {
   578  	// Calculate checksums that go into the tag manifests.
   579  	nonPayloadFiles, err := b.UnparsedTagFiles()
   580  	if err != nil {
   581  		errs = append(errs, err)
   582  	}
   583  	payloadManifests := b.GetManifests(PayloadManifest)
   584  	tagManifests := b.GetManifests(TagManifest)
   585  	for _, m := range payloadManifests {
   586  		nonPayloadFiles = append(nonPayloadFiles, m.Name())
   587  	}
   588  	for _, file := range nonPayloadFiles {
   589  		relativeFilePath := strings.Replace(file, b.pathToFile+"/", "", 1)
   590  		if _, exclude := b.excludeFromTagManifests[relativeFilePath]; exclude {
   591  			continue
   592  		}
   593  		// Use relative path in manifest, abs path when calculating checksum.
   594  		absPathToFile := file
   595  		if !strings.HasPrefix(file, b.pathToFile) {
   596  			absPathToFile = filepath.Join(b.pathToFile, file)
   597  		}
   598  		for i := range tagManifests {
   599  			manifest := tagManifests[i]
   600  			checksum, err := FileChecksum(absPathToFile, manifest.hashFunc())
   601  			if err != nil {
   602  				errors := []error{
   603  					fmt.Errorf("Error calculating %s checksum for file %s: %v",
   604  						manifest.Algorithm(), file, err),
   605  				}
   606  				return errors
   607  			}
   608  			manifest.Data[relativeFilePath] = checksum
   609  		}
   610  	}
   611  	return errs
   612  }
   613  
   614  func (b *Bag) saveTagManifests() (errs []error) {
   615  	tagManifests := b.GetManifests(TagManifest)
   616  	for i := range tagManifests {
   617  		manifest := tagManifests[i]
   618  		if err := manifest.Create(); err != nil {
   619  			errs = append(errs, err)
   620  		}
   621  	}
   622  	return errs
   623  }
   624  
   625  /*
   626   Walks the bag directory and subdirectories and returns the
   627   filepaths found inside and any errors skipping files in the
   628   payload directory.
   629  */
   630  func (b *Bag) ListFiles() ([]string, error) {
   631  
   632  	var files []string
   633  
   634  	// WalkDir function to collect files in the bag..
   635  	visit := func(pathToFile string, info os.FileInfo, err error) error {
   636  		if err != nil {
   637  			return err
   638  		}
   639  
   640  		if !info.IsDir() {
   641  			fp, err := filepath.Rel(b.Path(), pathToFile)
   642  			if err != nil {
   643  				return err
   644  			}
   645  			if fp != "." {
   646  				files = append(files, fp)
   647  			}
   648  		}
   649  		return err
   650  	}
   651  
   652  	if err := FS.Walk(b.Path(), visit); err != nil {
   653  		return nil, err
   654  	}
   655  
   656  	return files, nil
   657  }