github.com/pachyderm/pachyderm@v1.13.4/src/server/worker/datum/iterator.go (about) 1 package datum 2 3 import ( 4 "bytes" 5 "io" 6 "sort" 7 8 glob "github.com/pachyderm/ohmyglob" 9 10 "github.com/pachyderm/pachyderm/src/client" 11 "github.com/pachyderm/pachyderm/src/client/pfs" 12 "github.com/pachyderm/pachyderm/src/client/pkg/errors" 13 "github.com/pachyderm/pachyderm/src/client/pps" 14 "github.com/pachyderm/pachyderm/src/server/pkg/path" 15 "github.com/pachyderm/pachyderm/src/server/worker/common" 16 17 "github.com/cevaris/ordered_map" 18 ) 19 20 // Iterator is an interface which allows you to iterate through the datums 21 // for a job. A datum iterator keeps track of which datum it is on, which can be Reset() 22 // The intended use is by using this pattern `for di.Next() { ... datum := di.Datum() ... }` 23 // Note that since you start the loop by a call to Next(), the datum iterator's location starts at -1 24 type Iterator interface { 25 Reset() 26 Len() int 27 Next() bool 28 Datum() []*common.Input 29 DatumN(int) []*common.Input 30 } 31 32 type pfsIterator struct { 33 inputs []*common.Input 34 location int 35 } 36 37 func newPFSIterator(pachClient *client.APIClient, input *pps.PFSInput) (Iterator, error) { 38 result := &pfsIterator{} 39 // make sure it gets initialized properly (location = -1) 40 result.Reset() 41 if input.Commit == "" { 42 // this can happen if a pipeline with multiple inputs has been triggered 43 // before all commits have inputs 44 return result, nil 45 } 46 fs, err := pachClient.GlobFileStream(pachClient.Ctx(), &pfs.GlobFileRequest{ 47 Commit: client.NewCommit(input.Repo, input.Commit), 48 Pattern: input.Glob, 49 }) 50 if err != nil { 51 return nil, err 52 } 53 for { 54 fileInfo, err := fs.Recv() 55 if errors.Is(err, io.EOF) { 56 break 57 } else if err != nil { 58 return nil, err 59 } 60 g, err := glob.Compile(path.Clean(input.Glob), '/') 61 if err != nil { 62 return nil, err 63 } 64 joinOn := g.Replace(fileInfo.File.Path, input.JoinOn) 65 groupBy := g.Replace(fileInfo.File.Path, input.GroupBy) 66 result.inputs = append(result.inputs, &common.Input{ 67 FileInfo: fileInfo, 68 JoinOn: joinOn, 69 GroupBy: groupBy, 70 Name: input.Name, 71 Lazy: input.Lazy, 72 Branch: input.Branch, 73 EmptyFiles: input.EmptyFiles, 74 S3: input.S3, 75 }) 76 } 77 // We sort the inputs so that the order is deterministic. Note that it's 78 // not possible for 2 inputs to have the same path so this is guaranteed to 79 // produce a deterministic order. 80 sort.Slice(result.inputs, func(i, j int) bool { 81 return bytes.Compare(result.inputs[i].FileInfo.Hash, result.inputs[j].FileInfo.Hash) < 0 82 }) 83 return result, nil 84 } 85 86 func (d *pfsIterator) Reset() { 87 d.location = -1 88 } 89 90 func (d *pfsIterator) Len() int { 91 return len(d.inputs) 92 } 93 94 func (d *pfsIterator) Datum() []*common.Input { 95 return []*common.Input{d.inputs[d.location]} 96 } 97 98 func (d *pfsIterator) DatumN(n int) []*common.Input { 99 return []*common.Input{d.inputs[n]} 100 } 101 102 func (d *pfsIterator) Next() bool { 103 if d.location < len(d.inputs) { 104 d.location++ 105 } 106 return d.location < len(d.inputs) 107 } 108 109 type listIterator struct { 110 inputs []*common.Input 111 location int 112 } 113 114 func newListIterator(pachClient *client.APIClient, inputs []*common.Input) (Iterator, error) { 115 result := &listIterator{} 116 // make sure it gets initialized properly 117 result.Reset() 118 result.inputs = inputs 119 return result, nil 120 } 121 122 func (d *listIterator) Reset() { 123 d.location = -1 124 } 125 126 func (d *listIterator) Len() int { 127 return len(d.inputs) 128 } 129 130 func (d *listIterator) Datum() []*common.Input { 131 return []*common.Input{d.inputs[d.location]} 132 } 133 134 func (d *listIterator) DatumN(n int) []*common.Input { 135 return []*common.Input{d.inputs[n]} 136 } 137 138 func (d *listIterator) Next() bool { 139 if d.location < len(d.inputs) { 140 d.location++ 141 } 142 return d.location < len(d.inputs) 143 } 144 145 type unionIterator struct { 146 iterators []Iterator 147 unionIdx int 148 location int 149 } 150 151 func newUnionIterator(pachClient *client.APIClient, union []*pps.Input) (Iterator, error) { 152 result := &unionIterator{} 153 defer result.Reset() 154 for _, input := range union { 155 datumIterator, err := NewIterator(pachClient, input) 156 if err != nil { 157 return nil, err 158 } 159 result.iterators = append(result.iterators, datumIterator) 160 } 161 return result, nil 162 } 163 164 func (d *unionIterator) Reset() { 165 for _, input := range d.iterators { 166 input.Reset() 167 } 168 d.unionIdx = 0 169 d.location = -1 170 } 171 172 func (d *unionIterator) Len() int { 173 result := 0 174 for _, datumIterator := range d.iterators { 175 result += datumIterator.Len() 176 } 177 return result 178 } 179 180 func (d *unionIterator) Next() bool { 181 if d.unionIdx >= len(d.iterators) { 182 return false 183 } 184 if !d.iterators[d.unionIdx].Next() { 185 d.unionIdx++ 186 return d.Next() 187 } 188 d.location++ 189 return true 190 } 191 192 func (d *unionIterator) Datum() []*common.Input { 193 return d.iterators[d.unionIdx].Datum() 194 } 195 196 func (d *unionIterator) DatumN(n int) []*common.Input { 197 for _, datumIterator := range d.iterators { 198 if n < datumIterator.Len() { 199 return datumIterator.DatumN(n) 200 } 201 n -= datumIterator.Len() 202 } 203 panic("index out of bounds") 204 } 205 206 type crossIterator struct { 207 iterators []Iterator 208 started, done bool 209 location int 210 } 211 212 func newCrossIterator(pachClient *client.APIClient, cross []*pps.Input) (Iterator, error) { 213 result := &crossIterator{} 214 defer result.Reset() // Call Next() on all inner iterators once 215 for _, iterator := range cross { 216 datumIterator, err := NewIterator(pachClient, iterator) 217 if err != nil { 218 return nil, err 219 } 220 result.iterators = append(result.iterators, datumIterator) 221 } 222 result.location = -1 223 return result, nil 224 } 225 226 func newCrossListIterator(pachClient *client.APIClient, cross [][]*common.Input) (Iterator, error) { 227 result := &crossIterator{} 228 defer result.Reset() 229 for _, iterator := range cross { 230 datumIterator, err := newListIterator(pachClient, iterator) 231 if err != nil { 232 return nil, err 233 } 234 result.iterators = append(result.iterators, datumIterator) 235 } 236 result.location = -1 237 return result, nil 238 } 239 240 func (d *crossIterator) Reset() { 241 inhabited := len(d.iterators) > 0 242 for _, iterators := range d.iterators { 243 iterators.Reset() 244 if !iterators.Next() { 245 inhabited = false 246 } 247 } 248 if !inhabited { 249 d.iterators = nil 250 } 251 d.location = -1 252 d.started = !inhabited 253 d.done = d.started 254 } 255 256 func (d *crossIterator) Len() int { 257 if len(d.iterators) == 0 { 258 return 0 259 } 260 result := d.iterators[0].Len() 261 for i := 1; i < len(d.iterators); i++ { 262 result *= d.iterators[i].Len() 263 } 264 return result 265 } 266 267 func (d *crossIterator) Next() bool { 268 if !d.started { 269 d.started = true 270 d.location++ 271 // First call to Next() does nothing, as Reset() calls Next() on all inner 272 // datums once already 273 return true 274 } 275 if d.done { 276 return false 277 } 278 for _, input := range d.iterators { 279 // if we're at the end of the "row" 280 if !input.Next() { 281 // we reset the "row" 282 input.Reset() 283 // and start it back up 284 input.Next() 285 // after resetting this "row", start iterating through the next "row" 286 } else { 287 d.location++ 288 return true 289 } 290 } 291 d.done = true 292 return false 293 } 294 295 func (d *crossIterator) Datum() []*common.Input { 296 var result []*common.Input 297 for _, datumIterator := range d.iterators { 298 result = append(result, datumIterator.Datum()...) 299 } 300 sortInputs(result) 301 return result 302 } 303 304 func (d *crossIterator) DatumN(n int) []*common.Input { 305 if n >= d.Len() { 306 panic("index out of bounds") 307 } 308 var result []*common.Input 309 for _, datumIterator := range d.iterators { 310 result = append(result, datumIterator.DatumN(n%datumIterator.Len())...) 311 n /= datumIterator.Len() 312 } 313 sortInputs(result) 314 return result 315 } 316 317 type groupIterator struct { 318 datums [][]*common.Input 319 location int 320 } 321 322 func newGroupIterator(pachClient *client.APIClient, group []*pps.Input) (Iterator, error) { 323 groupMap := make(map[string][]*common.Input) 324 keys := make([]string, 0, len(group)) 325 result := &groupIterator{} 326 defer result.Reset() 327 328 // okay, so we have a slice of pps Inputs 329 for _, input := range group { 330 // turn our inputs into iterators 331 datumIterator, err := NewIterator(pachClient, input) 332 if err != nil { 333 return nil, err 334 } 335 // iterate through each iterator to get the individual datums 336 for datumIterator.Next() { 337 datum := datumIterator.Datum() 338 for _, datumInput := range datum { 339 // put the datums in an map keyed by GroupBy 340 groupDatum, ok := groupMap[datumInput.GroupBy] 341 if !ok || groupDatum == nil { 342 // make sure we keep track of new keys 343 keys = append(keys, datumInput.GroupBy) 344 } 345 groupMap[datumInput.GroupBy] = append(groupDatum, datumInput) 346 } 347 } 348 } 349 // sort everything by the group_by 350 sort.Strings(keys) 351 352 // put each equivalence class into its own datum 353 for _, key := range keys { 354 result.datums = append(result.datums, groupMap[key]) 355 } 356 return result, nil 357 } 358 359 func (d *groupIterator) Reset() { 360 d.location = -1 361 } 362 363 func (d *groupIterator) Len() int { 364 return len(d.datums) 365 } 366 367 func (d *groupIterator) Next() bool { 368 if d.location < len(d.datums) { 369 d.location++ 370 } 371 return d.location < len(d.datums) 372 } 373 374 func (d *groupIterator) Datum() []*common.Input { 375 return d.datums[d.location] 376 } 377 378 func (d *groupIterator) DatumN(n int) []*common.Input { 379 d.location = n 380 return d.Datum() 381 } 382 383 type joinIterator struct { 384 datums [][]*common.Input 385 location int 386 } 387 388 func newJoinIterator(pachClient *client.APIClient, join []*pps.Input) (Iterator, error) { 389 result := &joinIterator{} 390 om := ordered_map.NewOrderedMap() 391 392 for i, input := range join { 393 datumIterator, err := NewIterator(pachClient, input) 394 if err != nil { 395 return nil, err 396 } 397 for datumIterator.Next() { 398 x := datumIterator.Datum() 399 for _, k := range x { 400 tupleI, ok := om.Get(k.JoinOn) 401 var tuple [][]*common.Input 402 if !ok { 403 tuple = make([][]*common.Input, len(join)) 404 } else { 405 tuple = tupleI.([][]*common.Input) 406 } 407 tuple[i] = append(tuple[i], k) 408 om.Set(k.JoinOn, tuple) 409 } 410 } 411 } 412 413 iter := om.IterFunc() 414 for kv, ok := iter(); ok; kv, ok = iter() { 415 tuple := kv.Value.([][]*common.Input) 416 missing := false 417 var filteredTuple [][]*common.Input 418 for i, inputs := range tuple { 419 if len(inputs) == 0 { 420 missing = true 421 continue 422 } 423 if join[i].Pfs != nil && join[i].Pfs.OuterJoin { 424 filteredTuple = append(filteredTuple, inputs) 425 } 426 } 427 if missing { 428 tuple = filteredTuple 429 } 430 cross, err := newCrossListIterator(pachClient, tuple) 431 if err != nil { 432 return nil, err 433 } 434 for cross.Next() { 435 result.datums = append(result.datums, cross.Datum()) 436 } 437 } 438 result.location = -1 439 return result, nil 440 } 441 442 func (d *joinIterator) Reset() { 443 d.location = -1 444 } 445 446 func (d *joinIterator) Len() int { 447 return len(d.datums) 448 } 449 450 func (d *joinIterator) Next() bool { 451 if d.location < len(d.datums) { 452 d.location++ 453 } 454 return d.location < len(d.datums) 455 } 456 457 func (d *joinIterator) Datum() []*common.Input { 458 var result []*common.Input 459 result = append(result, d.datums[d.location]...) 460 sortInputs(result) 461 return result 462 } 463 464 func (d *joinIterator) DatumN(n int) []*common.Input { 465 d.location = n 466 return d.Datum() 467 } 468 469 type gitIterator struct { 470 inputs []*common.Input 471 location int 472 } 473 474 func newGitIterator(pachClient *client.APIClient, input *pps.GitInput) (Iterator, error) { 475 result := &gitIterator{} 476 defer result.Reset() 477 if input.Commit == "" { 478 // this can happen if a pipeline with multiple inputs has been triggered 479 // before all commits have inputs 480 return result, nil 481 } 482 fileInfo, err := pachClient.InspectFile(input.Name, input.Commit, "/commit.json") 483 if err != nil { 484 return nil, err 485 } 486 result.inputs = append( 487 result.inputs, 488 &common.Input{ 489 FileInfo: fileInfo, 490 Name: input.Name, 491 Branch: input.Branch, 492 GitURL: input.URL, 493 }, 494 ) 495 return result, nil 496 } 497 498 func (d *gitIterator) Reset() { 499 d.location = -1 500 } 501 502 func (d *gitIterator) Len() int { 503 return len(d.inputs) 504 } 505 506 func (d *gitIterator) Datum() []*common.Input { 507 return []*common.Input{d.inputs[d.location]} 508 } 509 510 func (d *gitIterator) Next() bool { 511 if d.location < len(d.inputs) { 512 d.location++ 513 } 514 return d.location < len(d.inputs) 515 } 516 517 func (d *gitIterator) DatumN(n int) []*common.Input { 518 if n < d.location { 519 d.Reset() 520 } 521 for d.location != n { 522 d.Next() 523 } 524 return d.Datum() 525 } 526 527 func newCronIterator(pachClient *client.APIClient, input *pps.CronInput) (Iterator, error) { 528 return newPFSIterator(pachClient, &pps.PFSInput{ 529 Name: input.Name, 530 Repo: input.Repo, 531 Branch: "master", 532 Commit: input.Commit, 533 Glob: "/*", 534 }) 535 } 536 537 // NewIterator creates an Iterator for an input. 538 func NewIterator(pachClient *client.APIClient, input *pps.Input) (Iterator, error) { 539 switch { 540 case input.Pfs != nil: 541 return newPFSIterator(pachClient, input.Pfs) 542 case input.Union != nil: 543 return newUnionIterator(pachClient, input.Union) 544 case input.Cross != nil: 545 return newCrossIterator(pachClient, input.Cross) 546 case input.Join != nil: 547 return newJoinIterator(pachClient, input.Join) 548 case input.Group != nil: 549 return newGroupIterator(pachClient, input.Group) 550 case input.Cron != nil: 551 return newCronIterator(pachClient, input.Cron) 552 case input.Git != nil: 553 return newGitIterator(pachClient, input.Git) 554 } 555 return nil, errors.Errorf("unrecognized input type: %v", input) 556 } 557 558 func sortInputs(inputs []*common.Input) { 559 sort.Slice(inputs, func(i, j int) bool { 560 return inputs[i].Name < inputs[j].Name 561 }) 562 }