github.com/pachyderm/pachyderm@v1.13.4/src/server/pkg/storage/fileset/merge.go (about)

     1  package fileset
     2  
     3  import (
     4  	"context"
     5  	"io"
     6  	"strings"
     7  
     8  	"github.com/gogo/protobuf/proto"
     9  	"github.com/pachyderm/pachyderm/src/server/pkg/storage/chunk"
    10  	"github.com/pachyderm/pachyderm/src/server/pkg/storage/fileset/index"
    11  )
    12  
// MergeReader is an abstraction for reading merged filesets.
// A file's content is ordered based on the lexicographical order of
// the tagged content, so the output file content is produced by
// performing a merge of the tagged content.
type MergeReader struct {
	// chunks is the chunk storage used to read file content.
	chunks   *chunk.Storage
	// fileSets are the file sets to merge; a stream's priority is derived
	// from its position in this slice (see iterate / iterateDeletive).
	fileSets []FileSet
}
    21  
    22  func newMergeReader(chunks *chunk.Storage, fileSets []FileSet) *MergeReader {
    23  	return &MergeReader{
    24  		chunks:   chunks,
    25  		fileSets: fileSets,
    26  	}
    27  }
    28  
    29  // Iterate iterates over the files in the merge reader.
    30  func (mr *MergeReader) Iterate(ctx context.Context, cb func(File) error, deletive ...bool) error {
    31  	if len(deletive) > 0 && deletive[0] {
    32  		return mr.iterateDeletive(ctx, cb)
    33  	}
    34  	return mr.iterate(ctx, cb)
    35  }
    36  
// iterate performs an ordered merge of the file content across all file
// sets. For each file set it sets up both a deletive stream and an additive
// stream, so deletions recorded in one file set can suppress content from
// others, then walks all streams path-by-path via a priority queue.
func (mr *MergeReader) iterate(ctx context.Context, cb func(File) error) error {
	var ss []stream
	for _, fs := range mr.fileSets {
		// Deletive stream first, then the additive stream; each stream's
		// priority is its position in ss.
		ss = append(ss, &fileStream{
			iterator: NewIterator(ctx, fs, true),
			priority: len(ss),
			deletive: true,
		})
		ss = append(ss, &fileStream{
			iterator: NewIterator(ctx, fs),
			priority: len(ss),
		})
	}
	pq := newPriorityQueue(ss)
	// skipPrefix records the most recently seen directory path; entries under
	// it from streams with priority below skipPriority are dropped.
	var skipPrefix string
	var skipPriority int
	return pq.iterate(func(ss []stream, _ ...string) error {
		var fss []*fileStream
		for _, s := range ss {
			fss = append(fss, s.(*fileStream))
		}
		// Drop streams shadowed by the current directory prefix.
		// NOTE(review): the truncation assumes fss is ordered so that streams
		// with priority >= skipPriority come first — confirm against the
		// priority queue's ordering guarantees.
		if skipPrefix != "" && strings.HasPrefix(fss[0].key(), skipPrefix) {
			for i, fs := range fss {
				if fs.streamPriority() < skipPriority {
					fss = fss[:i]
					break
				}
			}
		}
		if len(fss) == 0 {
			return nil
		}
		// A directory entry establishes a new skip prefix for its subtree.
		if IsDir(fss[0].key()) {
			skipPrefix = fss[0].key()
			skipPriority = fss[0].streamPriority()
			return nil
		}
		if len(fss) == 1 {
			// A lone deletive entry has nothing to delete from; no output.
			if fss[0].deletive {
				return nil
			}
			return cb(newFileReader(ctx, mr.chunks, fss[0].file.Index()))
		}
		idx := mergeFile(fss)
		// Handle a full delete.
		if len(idx.File.Parts) == 0 {
			return nil
		}
		return cb(newMergeFileReader(ctx, mr.chunks, idx))
	})
}
    88  
    89  func mergeFile(fss []*fileStream) *index.Index {
    90  	mergeIdx := &index.Index{
    91  		Path: fss[0].file.Index().Path,
    92  		File: &index.File{},
    93  	}
    94  	var ps []*partStream
    95  	for _, fs := range fss {
    96  		idx := fs.file.Index()
    97  		if fs.deletive && idx.File.Parts == nil {
    98  			break
    99  		}
   100  		ps = append(ps, &partStream{
   101  			parts:    idx.File.Parts,
   102  			deletive: fs.deletive,
   103  		})
   104  	}
   105  	// Merge the parts based on the lexicograhical ordering of the tags.
   106  	mergeIdx.File.Parts = mergeParts(ps)
   107  	return mergeIdx
   108  }
   109  
// mergeParts merges the parts of the given part streams based on the
// lexicographical ordering of the part tags. A deletive part suppresses the
// remaining parts with the same tag.
func mergeParts(pss []*partStream) []*index.Part {
	if len(pss) == 0 {
		return nil
	}
	var ss []stream
	// Enqueue in reverse so that the first entry in pss ends up with the
	// highest priority value.
	for i := len(pss) - 1; i >= 0; i-- {
		pss[i].priority = len(ss)
		ss = append(ss, pss[i])
	}
	pq := newPriorityQueue(ss)
	var mergedParts []*index.Part
	// NOTE(review): the error returned by pq.iterate is discarded. The
	// callback below only ever returns nil, so this is safe unless
	// pq.iterate can fail internally — confirm.
	pq.iterate(func(ss []stream, _ ...string) error {
		for i := 0; i < len(ss); i++ {
			ps := ss[i].(*partStream)
			// A deletive part shadows the remaining parts with this tag.
			if ps.deletive {
				return nil
			}
			mergedParts = mergePart(mergedParts, ps.part)
		}
		return nil
	})
	return mergedParts
}
   133  
   134  func mergePart(parts []*index.Part, part *index.Part) []*index.Part {
   135  	if len(parts) == 0 {
   136  		return []*index.Part{part}
   137  	}
   138  	lastPart := parts[len(parts)-1]
   139  	if lastPart.Tag == part.Tag {
   140  		if part.DataRefs != nil {
   141  			lastPart.DataRefs = append(lastPart.DataRefs, part.DataRefs...)
   142  		}
   143  		lastPart.SizeBytes += part.SizeBytes
   144  		return parts
   145  	}
   146  	return append(parts, part)
   147  }
   148  
   149  func (mr *MergeReader) iterateDeletive(ctx context.Context, cb func(File) error) error {
   150  	var ss []stream
   151  	for _, fs := range mr.fileSets {
   152  		ss = append(ss, &fileStream{
   153  			iterator: NewIterator(ctx, fs, true),
   154  			priority: len(ss),
   155  		})
   156  	}
   157  	pq := newPriorityQueue(ss)
   158  	return pq.iterate(func(ss []stream, _ ...string) error {
   159  		var idxs []*index.Index
   160  		for _, s := range ss {
   161  			idxs = append(idxs, s.(*fileStream).file.Index())
   162  		}
   163  		idx := mergeDeletes(idxs)
   164  		return cb(newFileReader(ctx, mr.chunks, idx))
   165  	})
   166  }
   167  
   168  func mergeDeletes(idxs []*index.Index) *index.Index {
   169  	mergeIdx := &index.Index{
   170  		Path: idxs[0].Path,
   171  		File: &index.File{},
   172  	}
   173  	var ps []*partStream
   174  	for _, idx := range idxs {
   175  		// Handle full delete.
   176  		if idx.File.Parts == nil {
   177  			return mergeIdx
   178  		}
   179  		ps = append(ps, &partStream{
   180  			parts: idx.File.Parts,
   181  		})
   182  	}
   183  	// Merge the parts based on the lexicograhical ordering of the tags.
   184  	mergeIdx.File.Parts = mergeParts(ps)
   185  	return mergeIdx
   186  }
   187  
// MergeFileReader is an abstraction for reading a merged file.
type MergeFileReader struct {
	// ctx is the context captured at construction and used for chunk reads.
	// NOTE(review): storing a context in a struct is discouraged by Go
	// convention — presumably kept here to match the File interface.
	ctx    context.Context
	// chunks is the chunk storage the file content is read from.
	chunks *chunk.Storage
	// idx is the merged index describing the file's parts.
	idx    *index.Index
}
   194  
   195  func newMergeFileReader(ctx context.Context, chunks *chunk.Storage, idx *index.Index) *MergeFileReader {
   196  	return &MergeFileReader{
   197  		ctx:    ctx,
   198  		chunks: chunks,
   199  		idx:    idx,
   200  	}
   201  }
   202  
   203  // Index returns the index for the merged file.
   204  func (mfr *MergeFileReader) Index() *index.Index {
   205  	return proto.Clone(mfr.idx).(*index.Index)
   206  }
   207  
   208  // Content returns the content of the merged file.
   209  func (mfr *MergeFileReader) Content(w io.Writer) error {
   210  	dataRefs := getDataRefs(mfr.idx.File.Parts)
   211  	r := mfr.chunks.NewReader(mfr.ctx, dataRefs)
   212  	return r.Get(w)
   213  }
   214  
// fileStream adapts an Iterator to the stream interface used by the
// priority queue, tracking the current file and the stream's priority.
type fileStream struct {
	// iterator yields the files of one file set in order.
	iterator *Iterator
	// file is the current file, set by next().
	file     File
	// priority is this stream's fixed priority (position at setup time).
	priority int
	// deletive marks this as a stream of deletive entries.
	deletive bool
}
   221  
   222  func (fs *fileStream) next() error {
   223  	var err error
   224  	fs.file, err = fs.iterator.Next()
   225  	return err
   226  }
   227  
// key returns the path of the current file; streams are merged by this key.
func (fs *fileStream) key() string {
	return fs.file.Index().Path
}
   231  
// streamPriority returns the stream's fixed priority.
func (fs *fileStream) streamPriority() int {
	return fs.priority
}
   235  
// partStream adapts a slice of file parts to the stream interface used by
// the priority queue, yielding parts one at a time keyed by tag.
type partStream struct {
	// parts is the remaining, not-yet-yielded parts.
	parts    []*index.Part
	// part is the current part, set by next().
	part     *index.Part
	// priority is this stream's fixed priority (assigned in mergeParts).
	priority int
	// deletive marks this as a stream of deletive parts.
	deletive bool
}
   242  
   243  func (ps *partStream) next() error {
   244  	if len(ps.parts) == 0 {
   245  		return io.EOF
   246  	}
   247  	ps.part = ps.parts[0]
   248  	ps.parts = ps.parts[1:]
   249  	return nil
   250  }
   251  
// key returns the tag of the current part; streams are merged by this key.
func (ps *partStream) key() string {
	return ps.part.Tag
}
   255  
// streamPriority returns the stream's fixed priority.
func (ps *partStream) streamPriority() int {
	return ps.priority
}