package fileset

import (
	"context"
	"io"
	"strings"

	"github.com/gogo/protobuf/proto"
	"github.com/pachyderm/pachyderm/src/server/pkg/storage/chunk"
	"github.com/pachyderm/pachyderm/src/server/pkg/storage/fileset/index"
)

// MergeReader is an abstraction for reading merged filesets.
// A file's content is ordered based on the lexicographical order of
// the tagged content, so the output file content is produced by
// performing a merge of the tagged content.
type MergeReader struct {
	chunks   *chunk.Storage // chunk storage used to read file content
	fileSets []FileSet      // file sets to merge; priority increases with slice position
}

// newMergeReader returns a MergeReader over fileSets that reads file
// content through chunks.
func newMergeReader(chunks *chunk.Storage, fileSets []FileSet) *MergeReader {
	return &MergeReader{
		chunks:   chunks,
		fileSets: fileSets,
	}
}

// Iterate iterates over the files in the merge reader.
// If the optional deletive flag is true, only the deletive entries of the
// file sets are iterated (see iterateDeletive).
func (mr *MergeReader) Iterate(ctx context.Context, cb func(File) error, deletive ...bool) error {
	if len(deletive) > 0 && deletive[0] {
		return mr.iterateDeletive(ctx, cb)
	}
	return mr.iterate(ctx, cb)
}

// iterate merges the deletive and additive entries of every file set and
// invokes cb once per merged output file.
func (mr *MergeReader) iterate(ctx context.Context, cb func(File) error) error {
	// Two streams per file set: the deletive entries (NewIterator with the
	// extra true argument) followed by the additive entries. Priority
	// values increase with position in ss, so earlier file sets — and,
	// within a file set, the deletive stream — get lower priorities.
	var ss []stream
	for _, fs := range mr.fileSets {
		ss = append(ss, &fileStream{
			iterator: NewIterator(ctx, fs, true),
			priority: len(ss),
			deletive: true,
		})
		ss = append(ss, &fileStream{
			iterator: NewIterator(ctx, fs),
			priority: len(ss),
		})
	}
	pq := newPriorityQueue(ss)
	// skipPrefix / skipPriority implement directory-level skipping: after a
	// directory entry is seen, entries under it that come from streams with
	// a priority below skipPriority are dropped.
	var skipPrefix string
	var skipPriority int
	return pq.iterate(func(ss []stream, _ ...string) error {
		// ss holds every stream whose current entry shares the same path.
		var fss []*fileStream
		for _, s := range ss {
			fss = append(fss, s.(*fileStream))
		}
		if skipPrefix != "" && strings.HasPrefix(fss[0].key(), skipPrefix) {
			// Truncate at the first stream whose priority predates the
			// directory entry that established the skip.
			// NOTE(review): this assumes streams with priority < skipPriority
			// form a contiguous tail of fss — confirm against the ordering
			// guaranteed by priorityQueue.iterate.
			for i, fs := range fss {
				if fs.streamPriority() < skipPriority {
					fss = fss[:i]
					break
				}
			}
		}
		if len(fss) == 0 {
			return nil
		}
		// A directory entry establishes a new skip prefix; directories
		// themselves produce no callback here.
		if IsDir(fss[0].key()) {
			skipPrefix = fss[0].key()
			skipPriority = fss[0].streamPriority()
			return nil
		}
		if len(fss) == 1 {
			// A lone deletive entry has nothing left to emit.
			if fss[0].deletive {
				return nil
			}
			return cb(newFileReader(ctx, mr.chunks, fss[0].file.Index()))
		}
		idx := mergeFile(fss)
		// Handle a full delete: the merge left no parts, so emit nothing.
		if len(idx.File.Parts) == 0 {
			return nil
		}
		return cb(newMergeFileReader(ctx, mr.chunks, idx))
	})
}

// mergeFile merges the indexes of the given file streams (all referring to
// the same path) into a single index.
func mergeFile(fss []*fileStream) *index.Index {
	mergeIdx := &index.Index{
		Path: fss[0].file.Index().Path,
		File: &index.File{},
	}
	var ps []*partStream
	for _, fs := range fss {
		idx := fs.file.Index()
		// A deletive entry with no parts is a full delete: stop collecting
		// here, which drops the remaining (overridden) streams.
		if fs.deletive && idx.File.Parts == nil {
			break
		}
		ps = append(ps, &partStream{
			parts:    idx.File.Parts,
			deletive: fs.deletive,
		})
	}
	// Merge the parts based on the lexicographical ordering of the tags.
	mergeIdx.File.Parts = mergeParts(ps)
	return mergeIdx
}

// mergeParts merges the part lists in pss by tag. The streams are
// prioritized in reverse slice order: pss[len(pss)-1] gets priority 0.
func mergeParts(pss []*partStream) []*index.Part {
	if len(pss) == 0 {
		return nil
	}
	var ss []stream
	for i := len(pss) - 1; i >= 0; i-- {
		pss[i].priority = len(ss)
		ss = append(ss, pss[i])
	}
	pq := newPriorityQueue(ss)
	var mergedParts []*index.Part
	// The callback never returns an error, so iterate's result is ignored.
	// NOTE(review): if pq.iterate can fail for internal reasons, that error
	// is silently dropped here — confirm against the priorityQueue code.
	pq.iterate(func(ss []stream, _ ...string) error {
		for i := 0; i < len(ss); i++ {
			ps := ss[i].(*partStream)
			// A deletive stream removes this tag from every stream that
			// follows it in ss.
			if ps.deletive {
				return nil
			}
			mergedParts = mergePart(mergedParts, ps.part)
		}
		return nil
	})
	return mergedParts
}

// mergePart appends part to parts, coalescing it into the last part when
// both carry the same tag: data refs are concatenated and sizes summed.
func mergePart(parts []*index.Part, part *index.Part) []*index.Part {
	if len(parts) == 0 {
		return []*index.Part{part}
	}
	lastPart := parts[len(parts)-1]
	if lastPart.Tag == part.Tag {
		if part.DataRefs != nil {
			lastPart.DataRefs = append(lastPart.DataRefs, part.DataRefs...)
		}
		lastPart.SizeBytes += part.SizeBytes
		return parts
	}
	return append(parts, part)
}

// iterateDeletive iterates over only the deletive entries of the file
// sets, merging entries that share a path and calling cb for each.
func (mr *MergeReader) iterateDeletive(ctx context.Context, cb func(File) error) error {
	var ss []stream
	for _, fs := range mr.fileSets {
		ss = append(ss, &fileStream{
			iterator: NewIterator(ctx, fs, true),
			priority: len(ss),
		})
	}
	pq := newPriorityQueue(ss)
	return pq.iterate(func(ss []stream, _ ...string) error {
		var idxs []*index.Index
		for _, s := range ss {
			idxs = append(idxs, s.(*fileStream).file.Index())
		}
		idx := mergeDeletes(idxs)
		return cb(newFileReader(ctx, mr.chunks, idx))
	})
}

// mergeDeletes merges the deletive indexes for a single path.
func mergeDeletes(idxs []*index.Index) *index.Index {
	mergeIdx := &index.Index{
		Path: idxs[0].Path,
		File: &index.File{},
	}
	var ps []*partStream
	for _, idx := range idxs {
		// Handle full delete: an index with no parts deletes the whole
		// file, so the merged delete carries no parts either.
		if idx.File.Parts == nil {
			return mergeIdx
		}
		ps = append(ps, &partStream{
			parts: idx.File.Parts,
		})
	}
	// Merge the parts based on the lexicographical ordering of the tags.
	mergeIdx.File.Parts = mergeParts(ps)
	return mergeIdx
}

// MergeFileReader is an abstraction for reading a merged file.
type MergeFileReader struct {
	ctx    context.Context
	chunks *chunk.Storage
	idx    *index.Index
}

// newMergeFileReader returns a MergeFileReader for the merged index idx.
func newMergeFileReader(ctx context.Context, chunks *chunk.Storage, idx *index.Index) *MergeFileReader {
	return &MergeFileReader{
		ctx:    ctx,
		chunks: chunks,
		idx:    idx,
	}
}

// Index returns the index for the merged file.
// A deep clone is returned, so callers cannot mutate the reader's copy.
func (mfr *MergeFileReader) Index() *index.Index {
	return proto.Clone(mfr.idx).(*index.Index)
}

// Content returns the content of the merged file.
209 func (mfr *MergeFileReader) Content(w io.Writer) error { 210 dataRefs := getDataRefs(mfr.idx.File.Parts) 211 r := mfr.chunks.NewReader(mfr.ctx, dataRefs) 212 return r.Get(w) 213 } 214 215 type fileStream struct { 216 iterator *Iterator 217 file File 218 priority int 219 deletive bool 220 } 221 222 func (fs *fileStream) next() error { 223 var err error 224 fs.file, err = fs.iterator.Next() 225 return err 226 } 227 228 func (fs *fileStream) key() string { 229 return fs.file.Index().Path 230 } 231 232 func (fs *fileStream) streamPriority() int { 233 return fs.priority 234 } 235 236 type partStream struct { 237 parts []*index.Part 238 part *index.Part 239 priority int 240 deletive bool 241 } 242 243 func (ps *partStream) next() error { 244 if len(ps.parts) == 0 { 245 return io.EOF 246 } 247 ps.part = ps.parts[0] 248 ps.parts = ps.parts[1:] 249 return nil 250 } 251 252 func (ps *partStream) key() string { 253 return ps.part.Tag 254 } 255 256 func (ps *partStream) streamPriority() int { 257 return ps.priority 258 }