// Data structures for serializing hash trees, which Pachyderm uses to track
// the files present in each commit and determine when to start jobs.

syntax = "proto3";

package hashtree_1_7;
option go_package = "github.com/pachyderm/pachyderm/src/client/admin/v1_7/hashtree";

import "client/admin/v1_7/pfs/pfs.proto";

// FileNodeProto is a node corresponding to a file (which is also a leaf node).
message FileNodeProto {
  // Objects references the objects in the object store which contain the
  // content of the file.
  repeated pfs_1_7.Object objects = 4;
}

// DirectoryNodeProto is a node corresponding to a directory.
message DirectoryNodeProto {
  // Children of this directory. Note that paths are relative, so if "/foo/bar"
  // has a child "baz", that means that there is a file at "/foo/bar/baz".
  //
  // 'Children' is ordered alphabetically, to quickly check if a new file is
  // overwriting an existing one.
  repeated string children = 3;

  // Header and footer objects attached to this directory.
  // NOTE(review): their exact semantics are not visible in this file; confirm
  // against the hashtree library before relying on them.
  pfs_1_7.Object header = 4;
  pfs_1_7.Object footer = 5;
}

// NodeProto is a node in the file tree (either a file or a directory)
message NodeProto {
  // Name is the name (not path) of the file/directory (e.g. /lib).
  string name = 1;

  // Hash is a hash of the node's name and contents (which includes the
  // BlockRefs of a file and the Children of a directory). This can be used to
  // detect if the name or contents have changed between versions.
  // NOTE(review): "BlockRefs" appears to predate FileNodeProto.objects;
  // confirm which file contents are actually hashed.
  bytes hash = 2;

  // subtree_size is the size of the subtree under this node; i.e. if this is a
  // directory, subtree_size includes all children.
  int64 subtree_size = 3;

  // Exactly one of the following fields must be set. The type of this node will
  // be determined by which field is set.
  FileNodeProto file_node = 4;
  DirectoryNodeProto dir_node = 5;
}

// HashTreeProto is a tree corresponding to the complete file contents of a
// pachyderm repo at a given commit (based on a Merkle Tree). We store one
// HashTree for every PFS commit.
message HashTreeProto {
  // Version is an arbitrary version number, set by the corresponding library
  // in hashtree.go. This ensures that if the hash function used to create
  // these trees is changed, we won't run into errors when deserializing old
  // trees. The current version is 1.
  int32 version = 1;

  // Fs maps each node's path to the NodeProto with that node's details.
  // See "Potential Optimizations" at the end for a compression scheme that
  // could be useful if this map gets too large.
  //
  // Note that the key must end in "/" if and only if the value has .dir_node
  // set (i.e. iff the path points to a directory).
  map<string, NodeProto> fs = 2;
}

/// Potential Optimizations
//
// Currently, we serialize HashTree.fs, i.e. the map from paths to nodes, as a
// protobuf Map. This keeps our code simple, but may be inefficient for certain
// repositories. Consider a repository that breaks up a large file with many
// JSON records into many small files containing one record:
//
//   /file/r00000
//   /file/r00001
//   ...
//   /file/r99999
//
// The current serialization format stores the complete path of each file, which
// means that in this example, the string "/file/" is serialized 100,000 times
// in every commit. An alternative approach would be to make the keys a repeated
// field, and "delta-encode" the paths. In this example, that would mean
// encoding a repeated string field with the elements:
//
//   /
//   file/
//   r00000
//   r00001
//   r00002
//   ...
//   r99999
//
// (Note that "file/" followed by "r00000" implies "file/r00000" because
// "file/" ends in a slash, but "r00000" followed by "r00001" does not imply
// "r00000r00001" because "r00000" does not end in a slash).
//
// If there are many small files with a shared prefix, this might save
// nontrivial space in the object store:
//   (common path length) * (#files) * (#commits)
//
// This would mean that there is some explicit deserialization code that turns
// the stored protobuf (which is hard to manipulate) into a separate Go object.
//
// One more example: a repo with three top-level directories: "foo/", "bar/"
// and "baz/". This would be encoded as:
//
//   /
//   foo/
//   file_in_foo.json
//   another_file_in_foo.json
//   ../bar/
//   file_in_bar.json
//   ../baz/
//   file_in_baz.json