github.com/pachyderm/pachyderm@v1.13.4/src/server/worker/datum/iterator.go (about)

     1  package datum
     2  
     3  import (
     4  	"bytes"
     5  	"io"
     6  	"sort"
     7  
     8  	glob "github.com/pachyderm/ohmyglob"
     9  
    10  	"github.com/pachyderm/pachyderm/src/client"
    11  	"github.com/pachyderm/pachyderm/src/client/pfs"
    12  	"github.com/pachyderm/pachyderm/src/client/pkg/errors"
    13  	"github.com/pachyderm/pachyderm/src/client/pps"
    14  	"github.com/pachyderm/pachyderm/src/server/pkg/path"
    15  	"github.com/pachyderm/pachyderm/src/server/worker/common"
    16  
    17  	"github.com/cevaris/ordered_map"
    18  )
    19  
    20  // Iterator is an interface which allows you to iterate through the datums
    21  // for a job. A datum iterator keeps track of which datum it is on, which can be Reset()
    22  // The intended use is by using this pattern `for di.Next() { ... datum := di.Datum() ... }`
    23  // Note that since you start the loop by a call to Next(), the datum iterator's location starts at -1
    24  type Iterator interface {
    25  	Reset()
    26  	Len() int
    27  	Next() bool
    28  	Datum() []*common.Input
    29  	DatumN(int) []*common.Input
    30  }
    31  
    32  type pfsIterator struct {
    33  	inputs   []*common.Input
    34  	location int
    35  }
    36  
    37  func newPFSIterator(pachClient *client.APIClient, input *pps.PFSInput) (Iterator, error) {
    38  	result := &pfsIterator{}
    39  	// make sure it gets initialized properly (location = -1)
    40  	result.Reset()
    41  	if input.Commit == "" {
    42  		// this can happen if a pipeline with multiple inputs has been triggered
    43  		// before all commits have inputs
    44  		return result, nil
    45  	}
    46  	fs, err := pachClient.GlobFileStream(pachClient.Ctx(), &pfs.GlobFileRequest{
    47  		Commit:  client.NewCommit(input.Repo, input.Commit),
    48  		Pattern: input.Glob,
    49  	})
    50  	if err != nil {
    51  		return nil, err
    52  	}
    53  	for {
    54  		fileInfo, err := fs.Recv()
    55  		if errors.Is(err, io.EOF) {
    56  			break
    57  		} else if err != nil {
    58  			return nil, err
    59  		}
    60  		g, err := glob.Compile(path.Clean(input.Glob), '/')
    61  		if err != nil {
    62  			return nil, err
    63  		}
    64  		joinOn := g.Replace(fileInfo.File.Path, input.JoinOn)
    65  		groupBy := g.Replace(fileInfo.File.Path, input.GroupBy)
    66  		result.inputs = append(result.inputs, &common.Input{
    67  			FileInfo:   fileInfo,
    68  			JoinOn:     joinOn,
    69  			GroupBy:    groupBy,
    70  			Name:       input.Name,
    71  			Lazy:       input.Lazy,
    72  			Branch:     input.Branch,
    73  			EmptyFiles: input.EmptyFiles,
    74  			S3:         input.S3,
    75  		})
    76  	}
    77  	// We sort the inputs so that the order is deterministic. Note that it's
    78  	// not possible for 2 inputs to have the same path so this is guaranteed to
    79  	// produce a deterministic order.
    80  	sort.Slice(result.inputs, func(i, j int) bool {
    81  		return bytes.Compare(result.inputs[i].FileInfo.Hash, result.inputs[j].FileInfo.Hash) < 0
    82  	})
    83  	return result, nil
    84  }
    85  
    86  func (d *pfsIterator) Reset() {
    87  	d.location = -1
    88  }
    89  
    90  func (d *pfsIterator) Len() int {
    91  	return len(d.inputs)
    92  }
    93  
    94  func (d *pfsIterator) Datum() []*common.Input {
    95  	return []*common.Input{d.inputs[d.location]}
    96  }
    97  
    98  func (d *pfsIterator) DatumN(n int) []*common.Input {
    99  	return []*common.Input{d.inputs[n]}
   100  }
   101  
   102  func (d *pfsIterator) Next() bool {
   103  	if d.location < len(d.inputs) {
   104  		d.location++
   105  	}
   106  	return d.location < len(d.inputs)
   107  }
   108  
   109  type listIterator struct {
   110  	inputs   []*common.Input
   111  	location int
   112  }
   113  
   114  func newListIterator(pachClient *client.APIClient, inputs []*common.Input) (Iterator, error) {
   115  	result := &listIterator{}
   116  	// make sure it gets initialized properly
   117  	result.Reset()
   118  	result.inputs = inputs
   119  	return result, nil
   120  }
   121  
   122  func (d *listIterator) Reset() {
   123  	d.location = -1
   124  }
   125  
   126  func (d *listIterator) Len() int {
   127  	return len(d.inputs)
   128  }
   129  
   130  func (d *listIterator) Datum() []*common.Input {
   131  	return []*common.Input{d.inputs[d.location]}
   132  }
   133  
   134  func (d *listIterator) DatumN(n int) []*common.Input {
   135  	return []*common.Input{d.inputs[n]}
   136  }
   137  
   138  func (d *listIterator) Next() bool {
   139  	if d.location < len(d.inputs) {
   140  		d.location++
   141  	}
   142  	return d.location < len(d.inputs)
   143  }
   144  
   145  type unionIterator struct {
   146  	iterators []Iterator
   147  	unionIdx  int
   148  	location  int
   149  }
   150  
   151  func newUnionIterator(pachClient *client.APIClient, union []*pps.Input) (Iterator, error) {
   152  	result := &unionIterator{}
   153  	defer result.Reset()
   154  	for _, input := range union {
   155  		datumIterator, err := NewIterator(pachClient, input)
   156  		if err != nil {
   157  			return nil, err
   158  		}
   159  		result.iterators = append(result.iterators, datumIterator)
   160  	}
   161  	return result, nil
   162  }
   163  
   164  func (d *unionIterator) Reset() {
   165  	for _, input := range d.iterators {
   166  		input.Reset()
   167  	}
   168  	d.unionIdx = 0
   169  	d.location = -1
   170  }
   171  
   172  func (d *unionIterator) Len() int {
   173  	result := 0
   174  	for _, datumIterator := range d.iterators {
   175  		result += datumIterator.Len()
   176  	}
   177  	return result
   178  }
   179  
   180  func (d *unionIterator) Next() bool {
   181  	if d.unionIdx >= len(d.iterators) {
   182  		return false
   183  	}
   184  	if !d.iterators[d.unionIdx].Next() {
   185  		d.unionIdx++
   186  		return d.Next()
   187  	}
   188  	d.location++
   189  	return true
   190  }
   191  
   192  func (d *unionIterator) Datum() []*common.Input {
   193  	return d.iterators[d.unionIdx].Datum()
   194  }
   195  
   196  func (d *unionIterator) DatumN(n int) []*common.Input {
   197  	for _, datumIterator := range d.iterators {
   198  		if n < datumIterator.Len() {
   199  			return datumIterator.DatumN(n)
   200  		}
   201  		n -= datumIterator.Len()
   202  	}
   203  	panic("index out of bounds")
   204  }
   205  
   206  type crossIterator struct {
   207  	iterators     []Iterator
   208  	started, done bool
   209  	location      int
   210  }
   211  
   212  func newCrossIterator(pachClient *client.APIClient, cross []*pps.Input) (Iterator, error) {
   213  	result := &crossIterator{}
   214  	defer result.Reset() // Call Next() on all inner iterators once
   215  	for _, iterator := range cross {
   216  		datumIterator, err := NewIterator(pachClient, iterator)
   217  		if err != nil {
   218  			return nil, err
   219  		}
   220  		result.iterators = append(result.iterators, datumIterator)
   221  	}
   222  	result.location = -1
   223  	return result, nil
   224  }
   225  
   226  func newCrossListIterator(pachClient *client.APIClient, cross [][]*common.Input) (Iterator, error) {
   227  	result := &crossIterator{}
   228  	defer result.Reset()
   229  	for _, iterator := range cross {
   230  		datumIterator, err := newListIterator(pachClient, iterator)
   231  		if err != nil {
   232  			return nil, err
   233  		}
   234  		result.iterators = append(result.iterators, datumIterator)
   235  	}
   236  	result.location = -1
   237  	return result, nil
   238  }
   239  
   240  func (d *crossIterator) Reset() {
   241  	inhabited := len(d.iterators) > 0
   242  	for _, iterators := range d.iterators {
   243  		iterators.Reset()
   244  		if !iterators.Next() {
   245  			inhabited = false
   246  		}
   247  	}
   248  	if !inhabited {
   249  		d.iterators = nil
   250  	}
   251  	d.location = -1
   252  	d.started = !inhabited
   253  	d.done = d.started
   254  }
   255  
   256  func (d *crossIterator) Len() int {
   257  	if len(d.iterators) == 0 {
   258  		return 0
   259  	}
   260  	result := d.iterators[0].Len()
   261  	for i := 1; i < len(d.iterators); i++ {
   262  		result *= d.iterators[i].Len()
   263  	}
   264  	return result
   265  }
   266  
   267  func (d *crossIterator) Next() bool {
   268  	if !d.started {
   269  		d.started = true
   270  		d.location++
   271  		// First call to Next() does nothing, as Reset() calls Next() on all inner
   272  		// datums once already
   273  		return true
   274  	}
   275  	if d.done {
   276  		return false
   277  	}
   278  	for _, input := range d.iterators {
   279  		// if we're at the end of the "row"
   280  		if !input.Next() {
   281  			// we reset the "row"
   282  			input.Reset()
   283  			// and start it back up
   284  			input.Next()
   285  			// after resetting this "row", start iterating through the next "row"
   286  		} else {
   287  			d.location++
   288  			return true
   289  		}
   290  	}
   291  	d.done = true
   292  	return false
   293  }
   294  
   295  func (d *crossIterator) Datum() []*common.Input {
   296  	var result []*common.Input
   297  	for _, datumIterator := range d.iterators {
   298  		result = append(result, datumIterator.Datum()...)
   299  	}
   300  	sortInputs(result)
   301  	return result
   302  }
   303  
   304  func (d *crossIterator) DatumN(n int) []*common.Input {
   305  	if n >= d.Len() {
   306  		panic("index out of bounds")
   307  	}
   308  	var result []*common.Input
   309  	for _, datumIterator := range d.iterators {
   310  		result = append(result, datumIterator.DatumN(n%datumIterator.Len())...)
   311  		n /= datumIterator.Len()
   312  	}
   313  	sortInputs(result)
   314  	return result
   315  }
   316  
   317  type groupIterator struct {
   318  	datums   [][]*common.Input
   319  	location int
   320  }
   321  
   322  func newGroupIterator(pachClient *client.APIClient, group []*pps.Input) (Iterator, error) {
   323  	groupMap := make(map[string][]*common.Input)
   324  	keys := make([]string, 0, len(group))
   325  	result := &groupIterator{}
   326  	defer result.Reset()
   327  
   328  	// okay, so we have a slice of pps Inputs
   329  	for _, input := range group {
   330  		// turn our inputs into iterators
   331  		datumIterator, err := NewIterator(pachClient, input)
   332  		if err != nil {
   333  			return nil, err
   334  		}
   335  		// iterate through each iterator to get the individual datums
   336  		for datumIterator.Next() {
   337  			datum := datumIterator.Datum()
   338  			for _, datumInput := range datum {
   339  				// put the datums in an map keyed by GroupBy
   340  				groupDatum, ok := groupMap[datumInput.GroupBy]
   341  				if !ok || groupDatum == nil {
   342  					// make sure we keep track of new keys
   343  					keys = append(keys, datumInput.GroupBy)
   344  				}
   345  				groupMap[datumInput.GroupBy] = append(groupDatum, datumInput)
   346  			}
   347  		}
   348  	}
   349  	// sort everything by the group_by
   350  	sort.Strings(keys)
   351  
   352  	// put each equivalence class into its own datum
   353  	for _, key := range keys {
   354  		result.datums = append(result.datums, groupMap[key])
   355  	}
   356  	return result, nil
   357  }
   358  
   359  func (d *groupIterator) Reset() {
   360  	d.location = -1
   361  }
   362  
   363  func (d *groupIterator) Len() int {
   364  	return len(d.datums)
   365  }
   366  
   367  func (d *groupIterator) Next() bool {
   368  	if d.location < len(d.datums) {
   369  		d.location++
   370  	}
   371  	return d.location < len(d.datums)
   372  }
   373  
   374  func (d *groupIterator) Datum() []*common.Input {
   375  	return d.datums[d.location]
   376  }
   377  
   378  func (d *groupIterator) DatumN(n int) []*common.Input {
   379  	d.location = n
   380  	return d.Datum()
   381  }
   382  
   383  type joinIterator struct {
   384  	datums   [][]*common.Input
   385  	location int
   386  }
   387  
   388  func newJoinIterator(pachClient *client.APIClient, join []*pps.Input) (Iterator, error) {
   389  	result := &joinIterator{}
   390  	om := ordered_map.NewOrderedMap()
   391  
   392  	for i, input := range join {
   393  		datumIterator, err := NewIterator(pachClient, input)
   394  		if err != nil {
   395  			return nil, err
   396  		}
   397  		for datumIterator.Next() {
   398  			x := datumIterator.Datum()
   399  			for _, k := range x {
   400  				tupleI, ok := om.Get(k.JoinOn)
   401  				var tuple [][]*common.Input
   402  				if !ok {
   403  					tuple = make([][]*common.Input, len(join))
   404  				} else {
   405  					tuple = tupleI.([][]*common.Input)
   406  				}
   407  				tuple[i] = append(tuple[i], k)
   408  				om.Set(k.JoinOn, tuple)
   409  			}
   410  		}
   411  	}
   412  
   413  	iter := om.IterFunc()
   414  	for kv, ok := iter(); ok; kv, ok = iter() {
   415  		tuple := kv.Value.([][]*common.Input)
   416  		missing := false
   417  		var filteredTuple [][]*common.Input
   418  		for i, inputs := range tuple {
   419  			if len(inputs) == 0 {
   420  				missing = true
   421  				continue
   422  			}
   423  			if join[i].Pfs != nil && join[i].Pfs.OuterJoin {
   424  				filteredTuple = append(filteredTuple, inputs)
   425  			}
   426  		}
   427  		if missing {
   428  			tuple = filteredTuple
   429  		}
   430  		cross, err := newCrossListIterator(pachClient, tuple)
   431  		if err != nil {
   432  			return nil, err
   433  		}
   434  		for cross.Next() {
   435  			result.datums = append(result.datums, cross.Datum())
   436  		}
   437  	}
   438  	result.location = -1
   439  	return result, nil
   440  }
   441  
   442  func (d *joinIterator) Reset() {
   443  	d.location = -1
   444  }
   445  
   446  func (d *joinIterator) Len() int {
   447  	return len(d.datums)
   448  }
   449  
   450  func (d *joinIterator) Next() bool {
   451  	if d.location < len(d.datums) {
   452  		d.location++
   453  	}
   454  	return d.location < len(d.datums)
   455  }
   456  
   457  func (d *joinIterator) Datum() []*common.Input {
   458  	var result []*common.Input
   459  	result = append(result, d.datums[d.location]...)
   460  	sortInputs(result)
   461  	return result
   462  }
   463  
   464  func (d *joinIterator) DatumN(n int) []*common.Input {
   465  	d.location = n
   466  	return d.Datum()
   467  }
   468  
   469  type gitIterator struct {
   470  	inputs   []*common.Input
   471  	location int
   472  }
   473  
   474  func newGitIterator(pachClient *client.APIClient, input *pps.GitInput) (Iterator, error) {
   475  	result := &gitIterator{}
   476  	defer result.Reset()
   477  	if input.Commit == "" {
   478  		// this can happen if a pipeline with multiple inputs has been triggered
   479  		// before all commits have inputs
   480  		return result, nil
   481  	}
   482  	fileInfo, err := pachClient.InspectFile(input.Name, input.Commit, "/commit.json")
   483  	if err != nil {
   484  		return nil, err
   485  	}
   486  	result.inputs = append(
   487  		result.inputs,
   488  		&common.Input{
   489  			FileInfo: fileInfo,
   490  			Name:     input.Name,
   491  			Branch:   input.Branch,
   492  			GitURL:   input.URL,
   493  		},
   494  	)
   495  	return result, nil
   496  }
   497  
   498  func (d *gitIterator) Reset() {
   499  	d.location = -1
   500  }
   501  
   502  func (d *gitIterator) Len() int {
   503  	return len(d.inputs)
   504  }
   505  
   506  func (d *gitIterator) Datum() []*common.Input {
   507  	return []*common.Input{d.inputs[d.location]}
   508  }
   509  
   510  func (d *gitIterator) Next() bool {
   511  	if d.location < len(d.inputs) {
   512  		d.location++
   513  	}
   514  	return d.location < len(d.inputs)
   515  }
   516  
   517  func (d *gitIterator) DatumN(n int) []*common.Input {
   518  	if n < d.location {
   519  		d.Reset()
   520  	}
   521  	for d.location != n {
   522  		d.Next()
   523  	}
   524  	return d.Datum()
   525  }
   526  
   527  func newCronIterator(pachClient *client.APIClient, input *pps.CronInput) (Iterator, error) {
   528  	return newPFSIterator(pachClient, &pps.PFSInput{
   529  		Name:   input.Name,
   530  		Repo:   input.Repo,
   531  		Branch: "master",
   532  		Commit: input.Commit,
   533  		Glob:   "/*",
   534  	})
   535  }
   536  
   537  // NewIterator creates an Iterator for an input.
   538  func NewIterator(pachClient *client.APIClient, input *pps.Input) (Iterator, error) {
   539  	switch {
   540  	case input.Pfs != nil:
   541  		return newPFSIterator(pachClient, input.Pfs)
   542  	case input.Union != nil:
   543  		return newUnionIterator(pachClient, input.Union)
   544  	case input.Cross != nil:
   545  		return newCrossIterator(pachClient, input.Cross)
   546  	case input.Join != nil:
   547  		return newJoinIterator(pachClient, input.Join)
   548  	case input.Group != nil:
   549  		return newGroupIterator(pachClient, input.Group)
   550  	case input.Cron != nil:
   551  		return newCronIterator(pachClient, input.Cron)
   552  	case input.Git != nil:
   553  		return newGitIterator(pachClient, input.Git)
   554  	}
   555  	return nil, errors.Errorf("unrecognized input type: %v", input)
   556  }
   557  
   558  func sortInputs(inputs []*common.Input) {
   559  	sort.Slice(inputs, func(i, j int) bool {
   560  		return inputs[i].Name < inputs[j].Name
   561  	})
   562  }