github.com/pachyderm/pachyderm@v1.13.4/src/client/admin/v1_7/hashtree/hashtree.proto (about)

     1  // Data structures for serializing hash trees, which Pachyderm uses to track
     2  // the files present in each commit and determine when to start jobs.
     3  
     4  syntax = "proto3";
     5  
     6  package hashtree_1_7;
     7  option go_package = "github.com/pachyderm/pachyderm/src/client/admin/v1_7/hashtree";
     8  
     9  import "client/admin/v1_7/pfs/pfs.proto";
    10  
    11  // FileNodeProto describes a file. Files are always leaf nodes of the tree.
    12  message FileNodeProto {
    13    // Objects references the objects in the object store that hold this file's
    14    // content. NOTE(review): numbers 1-3 look retired; confirm before reuse.
    15    repeated pfs_1_7.Object objects = 4;
    16  }
    17  
    18  // DirectoryNodeProto holds the directory-specific data of a tree node.
    19  message DirectoryNodeProto {
    20    // Names of this directory's children, relative to the directory itself: if
    21    // "/foo/bar" lists the child "baz", there is a file at "/foo/bar/baz".
    22    //
    23    // The list is kept in alphabetical order so that checking whether a new file
    24    // would overwrite an existing one is fast.
    25    repeated string children = 3;
    26    pfs_1_7.Object header = 4;  // NOTE(review): undocumented; confirm semantics.
    27    pfs_1_7.Object footer = 5;  // NOTE(review): undocumented; confirm semantics.
    28  }
    29  
    30  // NodeProto is a single node of the file tree: either a file or a directory.
    31  message NodeProto {
    32    // Name is the name (not the full path) of the file/directory (e.g. /lib).
    33    string name = 1;
    34  
    35    // Hash is a hash of the node's name and contents (the contents include the
    36    // BlockRefs of a file and the Children of a directory). Comparing hashes
    37    // reveals whether the name or contents changed between versions.
    38    bytes hash = 2;
    39  
    40    // subtree_size is the size of the subtree rooted at this node; for a
    41    // directory it includes all children. (Presumably bytes; TODO confirm.)
    42    int64 subtree_size = 3;
    43  
    44    // Exactly one of the two fields below must be set; the one that is set
    45    // determines the node's type. They are not a oneof, so this is unenforced.
    46    FileNodeProto file_node = 4;
    47    DirectoryNodeProto dir_node = 5;
    48  }
    49  
    50  // HashTreeProto is a tree describing the complete file contents of a
    51  // pachyderm repo at a given commit (based on a Merkle Tree). One HashTree is
    52  // stored for every PFS commit.
    53  message HashTreeProto {
    54    // Version is an arbitrary version number, set by the corresponding library
    55    // in hashtree.go. It guards against deserialization errors on old trees if
    56    // the hash function used to build these trees is ever changed.
    57    // The current version is 1.
    58    int32 version = 1;
    59  
    60    // Fs maps each node's path to the NodeProto holding that node's details.
    61    // See "Potential Optimizations" at the end of this file for a compression
    62    // scheme that could be useful if this map gets too large.
    63    //
    64    // Note that a key must end in "/" if and only if its value has .dir_node
    65    // set (i.e. iff the path points to a directory).
    66    map<string, NodeProto> fs = 2;
    67  }
    68  
    69  /// Potential Optimizations
    70  //
    71  // Currently, we serialize HashTree.fs, i.e. the map from paths to nodes, as a
    72  // protobuf Map. This keeps our code simple, but may be inefficient for certain
    73  // repositories. Consider a repository that breaks up a large file with many
    74  // JSON records into many small files containing one record:
    75  //
    76  // /file/r00000
    77  // /file/r00001
    78  // ...
    79  // /file/r99999
    80  //
    81  // The current serialization format stores the complete path of each file, which
    82  // means that in this example, the string "/file/" is serialized 100,000 times
    83  // in every commit. An alternative approach would be to make the keys a repeated
    84  // field, and "delta-encode" the paths. In this example, that would mean
    85  // encoding a repeated string field with the elements:
    86  //
    87  // /
    88  // file/
    89  // r00000
    90  // r00001
    91  // r00002
    92  // ...
    93  // r99999
    94  //
    95  // (Note that "file/" followed by "r00000" implies "file/r00000" because
    96  // "file/" ends in a slash, but "r00000" followed by "r00001" does not imply
    97  // "r00000r00001" because "r00000" does not end in a slash).
    98  //
    99  // If there are many small files with a shared prefix, this might save
   100  // nontrivial space in the object store:
   101  //   (common path length) * (#files) * (#commits)
   102  //
   103  // This would mean that there is some explicit deserialization code that turns
   104  // the stored protobuf (which is hard to manipulate) into a separate Go object.
   105  //
   106  // One more example: a repo with three top-level directories: "foo/", "bar/"
   107  // and "baz/". This would be encoded as:
   108  //
   109  //    /
   110  //    foo/
   111  //    file_in_foo.json
   112  //    another_file_in_foo.json
   113  //    ../bar/
   114  //    file_in_bar.json
   115  //    ../baz/
   116  //    file_in_baz.json