package client

import (
	"crypto/sha256"
	"encoding/hex"
	"io"
	"time"

	"github.com/pachyderm/pachyderm/src/client/pfs"
	"github.com/pachyderm/pachyderm/src/client/pkg/errors"
	"github.com/pachyderm/pachyderm/src/client/pkg/grpcutil"
	"github.com/pachyderm/pachyderm/src/client/pps"
	"github.com/pachyderm/pachyderm/src/server/pkg/errutil"

	"github.com/gogo/protobuf/types"
)

const (
	// PPSEtcdPrefixEnv is the environment variable that specifies the etcd
	// prefix that PPS uses.
	PPSEtcdPrefixEnv = "PPS_ETCD_PREFIX"
	// PPSWorkerIPEnv is the environment variable that a worker can use to
	// see its own IP. The IP address is made available through the
	// Kubernetes downward API.
	PPSWorkerIPEnv = "PPS_WORKER_IP"
	// PPSPodNameEnv is the environment variable that a pod can use to
	// see its own name. The pod name is made available through the
	// Kubernetes downward API.
	PPSPodNameEnv = "PPS_POD_NAME"
	// PPSPipelineNameEnv is the env var that sets the name of the pipeline
	// that the workers are running.
	PPSPipelineNameEnv = "PPS_PIPELINE_NAME"
	// PPSJobIDEnv is the env var that sets the ID of the job that the
	// workers are running (if the workers belong to an orphan job, rather than a
	// pipeline).
	PPSJobIDEnv = "PPS_JOB_ID"
	// PPSSpecCommitEnv is the env var that carries the pipeline's spec
	// commit to the workers.
	// NOTE(review): the original comment here ("the namespace in which
	// pachyderm is deployed") looks like a copy-paste error — confirm
	// against the worker code that reads this variable.
	PPSSpecCommitEnv = "PPS_SPEC_COMMIT"
	// PPSInputPrefix is the prefix of the path where datums are downloaded
	// to. A datum of an input named `XXX` is downloaded to `/pfs/XXX/`.
	PPSInputPrefix = "/pfs"
	// PPSScratchSpace is where pps workers store data while it's waiting to be
	// processed.
	PPSScratchSpace = ".scratch"
	// PPSWorkerPortEnv is environment variable name for the port that workers
	// use for their gRPC server
	PPSWorkerPortEnv = "PPS_WORKER_GRPC_PORT"
	// PPSWorkerVolume is the name of the volume in which workers store
	// data.
	PPSWorkerVolume = "pachyderm-worker"
	// PPSWorkerUserContainerName is the name of the container that runs
	// the user code to process data.
	PPSWorkerUserContainerName = "user"
	// PPSWorkerSidecarContainerName is the name of the sidecar container
	// that runs alongside of each worker container.
	PPSWorkerSidecarContainerName = "storage"
	// GCGenerationKey is the etcd key that stores a counter that the
	// GC utility increments when it runs, so as to invalidate all cache.
	GCGenerationKey = "gc-generation"
	// JobIDEnv is an env var that is added to the environment of user pipeline
	// code and indicates the id of the job currently being run.
	JobIDEnv = "PACH_JOB_ID"
	// OutputCommitIDEnv is an env var that is added to the environment of user
	// pipeline code and indicates the id of the output commit.
	OutputCommitIDEnv = "PACH_OUTPUT_COMMIT_ID"
	// PeerPortEnv is the env var that sets a custom peer port
	PeerPortEnv = "PEER_PORT"

	// ReprocessSpecUntilSuccess is used in the pipeline.ReprocessSpec field. It
	// is the default behavior of reattempting failed datums in each job.
	ReprocessSpecUntilSuccess = "until_success"
	// ReprocessSpecEveryJob is used in the pipeline.ReprocessSpec field. With
	// this, a pipeline will reprocess every datum in every job, regardless of if
	// it succeeded or failed previously.
	ReprocessSpecEveryJob = "every_job"
)

// NewJob creates a pps.Job with the given ID.
func NewJob(jobID string) *pps.Job {
	return &pps.Job{ID: jobID}
}

// DatumTagPrefix hashes a pipeline salt to a string of a fixed size (4 hex
// characters) for use as the prefix for datum output trees. This prefix
// allows us to do garbage collection correctly.
func DatumTagPrefix(salt string) string {
	// We need to hash the salt because UUIDs are not necessarily
	// random in every bit.
	h := sha256.New()
	h.Write([]byte(salt))
	return hex.EncodeToString(h.Sum(nil))[:4]
}

// NewPFSInput returns a new PFS input. It only includes required options.
func NewPFSInput(repo string, glob string) *pps.Input {
	return &pps.Input{
		Pfs: &pps.PFSInput{
			Repo: repo,
			Glob: glob,
		},
	}
}

// NewPFSInputOpts returns a new PFS input. It includes all options.
func NewPFSInputOpts(name string, repo string, branch string, glob string, joinOn string, groupBy string, outerJoin bool, lazy bool, trigger *pfs.Trigger) *pps.Input {
	return &pps.Input{
		Pfs: &pps.PFSInput{
			Name:      name,
			Repo:      repo,
			Branch:    branch,
			Glob:      glob,
			JoinOn:    joinOn,
			OuterJoin: outerJoin,
			GroupBy:   groupBy,
			Lazy:      lazy,
			Trigger:   trigger,
		},
	}
}

// NewS3PFSInput returns a new PFS input with 'S3' set.
func NewS3PFSInput(name string, repo string, branch string) *pps.Input {
	return &pps.Input{
		Pfs: &pps.PFSInput{
			Name:   name,
			Repo:   repo,
			Branch: branch,
			S3:     true,
		},
	}
}

// NewCrossInput returns an input which is the cross product of other inputs.
// That means that all combination of datums will be seen by the job /
// pipeline.
func NewCrossInput(input ...*pps.Input) *pps.Input {
	return &pps.Input{
		Cross: input,
	}
}

// NewJoinInput returns an input which is the join of other inputs.
// That means that all combination of datums which match on `joinOn` will be seen by the job /
// pipeline.
func NewJoinInput(input ...*pps.Input) *pps.Input {
	return &pps.Input{
		Join: input,
	}
}

// NewUnionInput returns an input which is the union of other inputs.
That 152 // means that all datums from any of the inputs will be seen individually by 153 // the job / pipeline. 154 func NewUnionInput(input ...*pps.Input) *pps.Input { 155 return &pps.Input{ 156 Union: input, 157 } 158 } 159 160 // NewGroupInput returns an input which groups the inputs by the GroupBy pattern. 161 // That means that it will return a datum for each group of input datums matching 162 // a particular GroupBy pattern 163 func NewGroupInput(input ...*pps.Input) *pps.Input { 164 return &pps.Input{ 165 Group: input, 166 } 167 } 168 169 // NewCronInput returns an input which will trigger based on a timed schedule. 170 // It uses cron syntax to specify the schedule. The input will be exposed to 171 // jobs as `/pfs/<name>/<timestamp>`. The timestamp uses the RFC 3339 format, 172 // e.g. `2006-01-02T15:04:05Z07:00`. It only takes required options. 173 func NewCronInput(name string, spec string) *pps.Input { 174 return &pps.Input{ 175 Cron: &pps.CronInput{ 176 Name: name, 177 Spec: spec, 178 }, 179 } 180 } 181 182 // NewCronInputOpts returns an input which will trigger based on a timed schedule. 183 // It uses cron syntax to specify the schedule. The input will be exposed to 184 // jobs as `/pfs/<name>/<timestamp>`. The timestamp uses the RFC 3339 format, 185 // e.g. `2006-01-02T15:04:05Z07:00`. It includes all the options. 186 func NewCronInputOpts(name string, repo string, spec string, overwrite bool) *pps.Input { 187 return &pps.Input{ 188 Cron: &pps.CronInput{ 189 Name: name, 190 Repo: repo, 191 Spec: spec, 192 Overwrite: overwrite, 193 }, 194 } 195 } 196 197 // NewJobInput creates a pps.JobInput. 198 func NewJobInput(repoName string, commitID string, glob string) *pps.JobInput { 199 return &pps.JobInput{ 200 Commit: NewCommit(repoName, commitID), 201 Glob: glob, 202 } 203 } 204 205 // NewPipeline creates a pps.Pipeline. 
206 func NewPipeline(pipelineName string) *pps.Pipeline { 207 return &pps.Pipeline{Name: pipelineName} 208 } 209 210 // CreateJob creates and runs a job in PPS. 211 // This function is mostly useful internally, users should generally run work 212 // by creating pipelines as well. 213 func (c APIClient) CreateJob(pipeline string, outputCommit, statsCommit *pfs.Commit) (*pps.Job, error) { 214 job, err := c.PpsAPIClient.CreateJob( 215 c.Ctx(), 216 &pps.CreateJobRequest{ 217 Pipeline: NewPipeline(pipeline), 218 OutputCommit: outputCommit, 219 StatsCommit: statsCommit, 220 }, 221 ) 222 return job, grpcutil.ScrubGRPC(err) 223 } 224 225 // InspectJob returns info about a specific job. 226 // blockState will cause the call to block until the job reaches a terminal state (failure or success). 227 // full indicates that the full job info should be returned. 228 func (c APIClient) InspectJob(jobID string, blockState bool, full ...bool) (*pps.JobInfo, error) { 229 req := &pps.InspectJobRequest{ 230 Job: NewJob(jobID), 231 BlockState: blockState, 232 } 233 if len(full) > 0 { 234 req.Full = full[0] 235 } 236 jobInfo, err := c.PpsAPIClient.InspectJob(c.Ctx(), req) 237 return jobInfo, grpcutil.ScrubGRPC(err) 238 } 239 240 // InspectJobOutputCommit returns info about a job that created a commit. 241 // blockState will cause the call to block until the job reaches a terminal state (failure or success). 242 func (c APIClient) InspectJobOutputCommit(repoName, commitID string, blockState bool) (*pps.JobInfo, error) { 243 jobInfo, err := c.PpsAPIClient.InspectJob( 244 c.Ctx(), 245 &pps.InspectJobRequest{ 246 OutputCommit: NewCommit(repoName, commitID), 247 BlockState: blockState, 248 }) 249 return jobInfo, grpcutil.ScrubGRPC(err) 250 } 251 252 // ListJob returns info about all jobs. 
// If pipelineName is non empty then only jobs that were started by the named pipeline will be returned
// If inputCommit is non-nil then only jobs which took the specific commits as inputs will be returned.
// The order of the inputCommits doesn't matter.
// If outputCommit is non-nil then only the job which created that commit as output will be returned.
// 'history' controls whether jobs from historical versions of pipelines are returned, it has the following semantics:
//  0: Return jobs from the current version of the pipeline or pipelines.
//  1: Return the above and jobs from the next most recent version
//  2: etc.
// -1: Return jobs from all historical versions.
// 'includePipelineInfo' controls whether the JobInfo passed to 'f' includes
// details from the pipeline spec (e.g. the transform). Leaving this 'false'
// can improve performance.
func (c APIClient) ListJob(pipelineName string, inputCommit []*pfs.Commit, outputCommit *pfs.Commit, history int64, includePipelineInfo bool) ([]*pps.JobInfo, error) {
	var result []*pps.JobInfo
	if err := c.ListJobF(pipelineName, inputCommit, outputCommit, history,
		includePipelineInfo, func(ji *pps.JobInfo) error {
			result = append(result, ji)
			return nil
		}); err != nil {
		return nil, err
	}
	return result, nil
}

// ListJobF is a previous version of ListJobFilterF, returning info about all jobs
// and calling f on each JobInfo. It is equivalent to ListJobFilterF with an
// empty jq filter.
func (c APIClient) ListJobF(pipelineName string, inputCommit []*pfs.Commit,
	outputCommit *pfs.Commit, history int64, includePipelineInfo bool,
	f func(*pps.JobInfo) error) error {
	return c.ListJobFilterF(pipelineName, inputCommit, outputCommit, history, includePipelineInfo, "", f)
}

// ListJobFilterF returns info about all jobs, calling f with each JobInfo.
// If f returns an error, iteration of jobs will stop and ListJobFilterF will
// return that error, unless the error is errutil.ErrBreak in which case it
// will return nil.
// If pipelineName is non empty then only jobs that were started by the named pipeline will be returned
// If inputCommit is non-nil then only jobs which took the specific commits as inputs will be returned.
// The order of the inputCommits doesn't matter.
// If outputCommit is non-nil then only the job which created that commit as output will be returned.
// 'history' controls whether jobs from historical versions of pipelines are returned, it has the following semantics:
//  0: Return jobs from the current version of the pipeline or pipelines.
//  1: Return the above and jobs from the next most recent version
//  2: etc.
// -1: Return jobs from all historical versions.
// 'includePipelineInfo' controls whether the JobInfo passed to 'f' includes
// details from the pipeline spec--setting this to 'false' can improve
// performance.
301 func (c APIClient) ListJobFilterF(pipelineName string, inputCommit []*pfs.Commit, 302 outputCommit *pfs.Commit, history int64, includePipelineInfo bool, jqFilter string, 303 f func(*pps.JobInfo) error) error { 304 var pipeline *pps.Pipeline 305 if pipelineName != "" { 306 pipeline = NewPipeline(pipelineName) 307 } 308 client, err := c.PpsAPIClient.ListJobStream( 309 c.Ctx(), 310 &pps.ListJobRequest{ 311 Pipeline: pipeline, 312 InputCommit: inputCommit, 313 OutputCommit: outputCommit, 314 History: history, 315 Full: includePipelineInfo, 316 JqFilter: jqFilter, 317 }) 318 if err != nil { 319 return grpcutil.ScrubGRPC(err) 320 } 321 for { 322 ji, err := client.Recv() 323 if errors.Is(err, io.EOF) { 324 return nil 325 } else if err != nil { 326 return grpcutil.ScrubGRPC(err) 327 } 328 if err := f(ji); err != nil { 329 if errors.Is(err, errutil.ErrBreak) { 330 return nil 331 } 332 return err 333 } 334 } 335 } 336 337 // FlushJob calls f with all the jobs which were triggered by commits. 338 // If toPipelines is non-nil then only the jobs between commits and those 339 // pipelines in the DAG will be returned. 340 func (c APIClient) FlushJob(commits []*pfs.Commit, toPipelines []string, f func(*pps.JobInfo) error) error { 341 req := &pps.FlushJobRequest{ 342 Commits: commits, 343 } 344 for _, pipeline := range toPipelines { 345 req.ToPipelines = append(req.ToPipelines, NewPipeline(pipeline)) 346 } 347 client, err := c.PpsAPIClient.FlushJob(c.Ctx(), req) 348 if err != nil { 349 return grpcutil.ScrubGRPC(err) 350 } 351 for { 352 jobInfo, err := client.Recv() 353 if err != nil { 354 if errors.Is(err, io.EOF) { 355 return nil 356 } 357 return grpcutil.ScrubGRPC(err) 358 } 359 if err := f(jobInfo); err != nil { 360 return err 361 } 362 } 363 } 364 365 // FlushJobAll returns all the jobs which were triggered by commits. 366 // If toPipelines is non-nil then only the jobs between commits and those 367 // pipelines in the DAG will be returned. 
368 func (c APIClient) FlushJobAll(commits []*pfs.Commit, toPipelines []string) ([]*pps.JobInfo, error) { 369 var result []*pps.JobInfo 370 if err := c.FlushJob(commits, toPipelines, func(ji *pps.JobInfo) error { 371 result = append(result, ji) 372 return nil 373 }); err != nil { 374 return nil, err 375 } 376 return result, nil 377 } 378 379 // DeleteJob deletes a job. 380 func (c APIClient) DeleteJob(jobID string) error { 381 _, err := c.PpsAPIClient.DeleteJob( 382 c.Ctx(), 383 &pps.DeleteJobRequest{ 384 Job: NewJob(jobID), 385 }, 386 ) 387 return grpcutil.ScrubGRPC(err) 388 } 389 390 // StopJob stops a job. 391 func (c APIClient) StopJob(jobID string) error { 392 _, err := c.PpsAPIClient.StopJob( 393 c.Ctx(), 394 &pps.StopJobRequest{ 395 Job: NewJob(jobID), 396 }, 397 ) 398 return grpcutil.ScrubGRPC(err) 399 } 400 401 // RestartDatum restarts a datum that's being processed as part of a job. 402 // datumFilter is a slice of strings which are matched against either the Path 403 // or Hash of the datum, the order of the strings in datumFilter is irrelevant. 404 func (c APIClient) RestartDatum(jobID string, datumFilter []string) error { 405 _, err := c.PpsAPIClient.RestartDatum( 406 c.Ctx(), 407 &pps.RestartDatumRequest{ 408 Job: NewJob(jobID), 409 DataFilters: datumFilter, 410 }, 411 ) 412 return grpcutil.ScrubGRPC(err) 413 } 414 415 // ListDatum returns info about datums in a Job 416 func (c APIClient) ListDatum(jobID string, pageSize, page int64) (*pps.ListDatumResponse, error) { 417 return c.listDatum(NewJob(jobID), nil, pageSize, page) 418 } 419 420 // ListDatumInput returns info about datums for a pipeline with input. The 421 // pipeline doesn't need to exist. 
422 func (c APIClient) ListDatumInput(input *pps.Input, pageSize, page int64) (*pps.ListDatumResponse, error) { 423 return c.listDatum(nil, input, pageSize, page) 424 } 425 426 func (c APIClient) listDatum(job *pps.Job, input *pps.Input, pageSize, page int64) (*pps.ListDatumResponse, error) { 427 client, err := c.PpsAPIClient.ListDatumStream( 428 c.Ctx(), 429 &pps.ListDatumRequest{ 430 Input: input, 431 PageSize: pageSize, 432 Page: page, 433 Job: job, 434 }, 435 ) 436 if err != nil { 437 return nil, grpcutil.ScrubGRPC(err) 438 } 439 resp := &pps.ListDatumResponse{} 440 first := true 441 for { 442 r, err := client.Recv() 443 if errors.Is(err, io.EOF) { 444 break 445 } else if err != nil { 446 return nil, grpcutil.ScrubGRPC(err) 447 } 448 if first { 449 resp.TotalPages = r.TotalPages 450 resp.Page = r.Page 451 first = false 452 } 453 resp.DatumInfos = append(resp.DatumInfos, r.DatumInfo) 454 } 455 return resp, nil 456 } 457 458 // ListDatumOption represents an optional modification to a ListDatum request 459 type ListDatumOption func(*pps.ListDatumRequest) error 460 461 // WithStatusOnly causes a ListDatum request to only retrieve status information for datums, 462 // which can improve performance 463 func WithStatusOnly() ListDatumOption { 464 return func(req *pps.ListDatumRequest) error { 465 req.StatusOnly = true 466 return nil 467 } 468 } 469 470 // ListDatumF returns info about datums in a Job, calling f with each datum info. 471 func (c APIClient) ListDatumF(jobID string, pageSize int64, page int64, f func(di *pps.DatumInfo) error, options ...ListDatumOption) error { 472 return c.listDatumF(NewJob(jobID), nil, pageSize, page, f, options...) 473 } 474 475 // ListDatumInputF returns info about datums for a pipeline with input, calling 476 // f with each datum info. The pipeline doesn't need to exist. 
477 func (c APIClient) ListDatumInputF(input *pps.Input, pageSize, page int64, f func(di *pps.DatumInfo) error) error { 478 return c.listDatumF(nil, input, pageSize, page, f) 479 } 480 481 func (c APIClient) listDatumF(job *pps.Job, input *pps.Input, pageSize, page int64, f func(di *pps.DatumInfo) error, options ...ListDatumOption) error { 482 req := &pps.ListDatumRequest{ 483 Input: input, 484 PageSize: pageSize, 485 Page: page, 486 Job: job, 487 } 488 for _, opt := range options { 489 if err := opt(req); err != nil { 490 return err 491 } 492 } 493 client, err := c.PpsAPIClient.ListDatumStream(c.Ctx(), req) 494 if err != nil { 495 return grpcutil.ScrubGRPC(err) 496 } 497 for { 498 resp, err := client.Recv() 499 if errors.Is(err, io.EOF) { 500 return nil 501 } else if err != nil { 502 return grpcutil.ScrubGRPC(err) 503 } 504 if err := f(resp.DatumInfo); err != nil { 505 if errors.Is(err, errutil.ErrBreak) { 506 return nil 507 } 508 return err 509 } 510 } 511 } 512 513 // InspectDatum returns info about a single datum 514 func (c APIClient) InspectDatum(jobID string, datumID string) (*pps.DatumInfo, error) { 515 datumInfo, err := c.PpsAPIClient.InspectDatum( 516 c.Ctx(), 517 &pps.InspectDatumRequest{ 518 Datum: &pps.Datum{ 519 ID: datumID, 520 Job: NewJob(jobID), 521 }, 522 }, 523 ) 524 if err != nil { 525 return nil, grpcutil.ScrubGRPC(err) 526 } 527 return datumInfo, nil 528 } 529 530 // LogsIter iterates through log messages returned from pps.GetLogs. Logs can 531 // be fetched with 'Next()'. The log message received can be examined with 532 // 'Message()', and any errors can be examined with 'Err()'. 
533 type LogsIter struct { 534 logsClient pps.API_GetLogsClient 535 msg *pps.LogMessage 536 err error 537 } 538 539 // Next retrieves the next relevant log message from pachd 540 func (l *LogsIter) Next() bool { 541 if l.err != nil { 542 l.msg = nil 543 return false 544 } 545 l.msg, l.err = l.logsClient.Recv() 546 return l.err == nil 547 } 548 549 // Message returns the most recently retrieve log message (as an annotated log 550 // line, in the form of a pps.LogMessage) 551 func (l *LogsIter) Message() *pps.LogMessage { 552 return l.msg 553 } 554 555 // Err retrieves any errors encountered in the course of calling 'Next()'. 556 func (l *LogsIter) Err() error { 557 if errors.Is(l.err, io.EOF) { 558 return nil 559 } 560 return grpcutil.ScrubGRPC(l.err) 561 } 562 563 // GetLogs gets logs from a job (logs includes stdout and stderr). 'pipelineName', 564 // 'jobID', 'data', and 'datumID', are all filters. To forego any filter, 565 // simply pass an empty value, though one of 'pipelineName' and 'jobID' 566 // must be set. Responses are written to 'messages' 567 func (c APIClient) GetLogs( 568 pipelineName string, 569 jobID string, 570 data []string, 571 datumID string, 572 master bool, 573 follow bool, 574 since time.Duration, 575 ) *LogsIter { 576 return c.getLogs(pipelineName, jobID, data, datumID, master, follow, since, false) 577 } 578 579 // GetLogsLoki gets logs from a job (logs includes stdout and stderr). 'pipelineName', 580 // 'jobID', 'data', and 'datumID', are all filters. To forego any filter, 581 // simply pass an empty value, though one of 'pipelineName' and 'jobID' 582 // must be set. 
Responses are written to 'messages' 583 func (c APIClient) GetLogsLoki( 584 pipelineName string, 585 jobID string, 586 data []string, 587 datumID string, 588 master bool, 589 follow bool, 590 since time.Duration, 591 ) *LogsIter { 592 return c.getLogs(pipelineName, jobID, data, datumID, master, follow, since, true) 593 } 594 595 func (c APIClient) getLogs( 596 pipelineName string, 597 jobID string, 598 data []string, 599 datumID string, 600 master bool, 601 follow bool, 602 since time.Duration, 603 useLoki bool, 604 ) *LogsIter { 605 request := pps.GetLogsRequest{ 606 Master: master, 607 Follow: follow, 608 UseLokiBackend: useLoki, 609 Since: types.DurationProto(since), 610 } 611 if pipelineName != "" { 612 request.Pipeline = NewPipeline(pipelineName) 613 } 614 if jobID != "" { 615 request.Job = NewJob(jobID) 616 } 617 request.DataFilters = data 618 if datumID != "" { 619 request.Datum = &pps.Datum{ 620 Job: NewJob(jobID), 621 ID: datumID, 622 } 623 } 624 resp := &LogsIter{} 625 resp.logsClient, resp.err = c.PpsAPIClient.GetLogs(c.Ctx(), &request) 626 resp.err = grpcutil.ScrubGRPC(resp.err) 627 return resp 628 } 629 630 // CreatePipeline creates a new pipeline, pipelines are the main computation 631 // object in PPS they create a flow of data from a set of input Repos to an 632 // output Repo (which has the same name as the pipeline). Whenever new data is 633 // committed to one of the input repos the pipelines will create jobs to bring 634 // the output Repo up to data. 635 // image is the Docker image to run the jobs in. 636 // cmd is the command passed to the Docker run invocation. 637 // NOTE as with Docker cmd is not run inside a shell that means that things 638 // like wildcard globbing (*), pipes (|) and file redirects (> and >>) will not 639 // work. To get that behavior you should have your command be a shell of your 640 // choice and pass a shell script to stdin. 641 // stdin is a slice of lines that are sent to your command on stdin. 
Lines need 642 // not end in newline characters. 643 // parallelism is how many copies of your container should run in parallel. You 644 // may pass 0 for parallelism in which case PPS will set the parallelism based 645 // on available resources. 646 // input specifies a set of Repos that will be visible to the jobs during runtime. 647 // commits to these repos will cause the pipeline to create new jobs to process them. 648 // update indicates that you want to update an existing pipeline 649 func (c APIClient) CreatePipeline( 650 name string, 651 image string, 652 cmd []string, 653 stdin []string, 654 parallelismSpec *pps.ParallelismSpec, 655 input *pps.Input, 656 outputBranch string, 657 update bool, 658 ) error { 659 _, err := c.PpsAPIClient.CreatePipeline( 660 c.Ctx(), 661 &pps.CreatePipelineRequest{ 662 Pipeline: NewPipeline(name), 663 Transform: &pps.Transform{ 664 Image: image, 665 Cmd: cmd, 666 Stdin: stdin, 667 }, 668 ParallelismSpec: parallelismSpec, 669 Input: input, 670 OutputBranch: outputBranch, 671 Update: update, 672 }, 673 ) 674 return grpcutil.ScrubGRPC(err) 675 } 676 677 // InspectPipeline returns info about a specific pipeline. 678 func (c APIClient) InspectPipeline(pipelineName string) (*pps.PipelineInfo, error) { 679 pipelineInfo, err := c.PpsAPIClient.InspectPipeline( 680 c.Ctx(), 681 &pps.InspectPipelineRequest{ 682 Pipeline: NewPipeline(pipelineName), 683 }, 684 ) 685 return pipelineInfo, grpcutil.ScrubGRPC(err) 686 } 687 688 // ListPipeline returns info about all pipelines. 689 func (c APIClient) ListPipeline() ([]*pps.PipelineInfo, error) { 690 pipelineInfos, err := c.PpsAPIClient.ListPipeline( 691 c.Ctx(), 692 &pps.ListPipelineRequest{}, 693 ) 694 if err != nil { 695 return nil, grpcutil.ScrubGRPC(err) 696 } 697 return pipelineInfos.PipelineInfo, nil 698 } 699 700 // ListPipelineHistory returns historical information about pipelines. 
701 // `pipeline` specifies which pipeline to return history about, if it's equal 702 // to "" then ListPipelineHistory returns historical information about all 703 // pipelines. 704 // `history` specifies how many historical revisions to return: 705 // 0: Return the current version of the pipeline or pipelines. 706 // 1: Return the above and the next most recent version 707 // 2: etc. 708 //-1: Return all historical versions. 709 func (c APIClient) ListPipelineHistory(pipeline string, history int64) ([]*pps.PipelineInfo, error) { 710 var _pipeline *pps.Pipeline 711 if pipeline != "" { 712 _pipeline = NewPipeline(pipeline) 713 } 714 pipelineInfos, err := c.PpsAPIClient.ListPipeline( 715 c.Ctx(), 716 &pps.ListPipelineRequest{ 717 Pipeline: _pipeline, 718 History: history, 719 }, 720 ) 721 if err != nil { 722 return nil, grpcutil.ScrubGRPC(err) 723 } 724 return pipelineInfos.PipelineInfo, nil 725 } 726 727 // DeletePipeline deletes a pipeline along with its output Repo. 728 func (c APIClient) DeletePipeline(name string, force bool, splitTransaction ...bool) error { 729 req := &pps.DeletePipelineRequest{ 730 Pipeline: NewPipeline(name), 731 Force: force, 732 } 733 if len(splitTransaction) > 0 { 734 req.SplitTransaction = splitTransaction[0] 735 } 736 _, err := c.PpsAPIClient.DeletePipeline( 737 c.Ctx(), 738 req, 739 ) 740 return grpcutil.ScrubGRPC(err) 741 } 742 743 // StartPipeline restarts a stopped pipeline. 744 func (c APIClient) StartPipeline(name string) error { 745 _, err := c.PpsAPIClient.StartPipeline( 746 c.Ctx(), 747 &pps.StartPipelineRequest{ 748 Pipeline: NewPipeline(name), 749 }, 750 ) 751 return grpcutil.ScrubGRPC(err) 752 } 753 754 // StopPipeline prevents a pipeline from processing things, it can be restarted 755 // with StartPipeline. 
756 func (c APIClient) StopPipeline(name string) error { 757 _, err := c.PpsAPIClient.StopPipeline( 758 c.Ctx(), 759 &pps.StopPipelineRequest{ 760 Pipeline: NewPipeline(name), 761 }, 762 ) 763 return grpcutil.ScrubGRPC(err) 764 } 765 766 // RunPipeline runs a pipeline. It can be passed a list of commit provenance. 767 // This will trigger a new job provenant on those commits, effectively running the pipeline on the data in those commits. 768 func (c APIClient) RunPipeline(name string, provenance []*pfs.CommitProvenance, jobID string) error { 769 _, err := c.PpsAPIClient.RunPipeline( 770 c.Ctx(), 771 &pps.RunPipelineRequest{ 772 Pipeline: NewPipeline(name), 773 Provenance: provenance, 774 JobID: jobID, 775 }, 776 ) 777 return grpcutil.ScrubGRPC(err) 778 } 779 780 // RunCron runs a pipeline. It can be passed a list of commit provenance. 781 // This will trigger a new job provenant on those commits, effectively running the pipeline on the data in those commits. 782 func (c APIClient) RunCron(name string) error { 783 _, err := c.PpsAPIClient.RunCron( 784 c.Ctx(), 785 &pps.RunCronRequest{ 786 Pipeline: NewPipeline(name), 787 }, 788 ) 789 return grpcutil.ScrubGRPC(err) 790 } 791 792 // CreateSecret creates a secret on the cluster. 793 func (c APIClient) CreateSecret(file []byte) error { 794 _, err := c.PpsAPIClient.CreateSecret( 795 c.Ctx(), 796 &pps.CreateSecretRequest{ 797 File: file, 798 }, 799 ) 800 return grpcutil.ScrubGRPC(err) 801 } 802 803 // DeleteSecret deletes a secret from the cluster. 804 func (c APIClient) DeleteSecret(secret string) error { 805 _, err := c.PpsAPIClient.DeleteSecret( 806 c.Ctx(), 807 &pps.DeleteSecretRequest{ 808 Secret: &pps.Secret{Name: secret}, 809 }, 810 ) 811 return grpcutil.ScrubGRPC(err) 812 } 813 814 // InspectSecret returns info about a specific secret. 
815 func (c APIClient) InspectSecret(secret string) (*pps.SecretInfo, error) { 816 secretInfo, err := c.PpsAPIClient.InspectSecret( 817 c.Ctx(), 818 &pps.InspectSecretRequest{ 819 Secret: &pps.Secret{Name: secret}, 820 }, 821 ) 822 return secretInfo, grpcutil.ScrubGRPC(err) 823 } 824 825 // ListSecret returns info about all Pachyderm secrets. 826 func (c APIClient) ListSecret() ([]*pps.SecretInfo, error) { 827 secretInfos, err := c.PpsAPIClient.ListSecret( 828 c.Ctx(), 829 &types.Empty{}, 830 ) 831 if err != nil { 832 return nil, grpcutil.ScrubGRPC(err) 833 } 834 return secretInfos.SecretInfo, nil 835 } 836 837 // CreatePipelineService creates a new pipeline service. 838 func (c APIClient) CreatePipelineService( 839 name string, 840 image string, 841 cmd []string, 842 stdin []string, 843 parallelismSpec *pps.ParallelismSpec, 844 input *pps.Input, 845 update bool, 846 internalPort int32, 847 externalPort int32, 848 annotations map[string]string, 849 ) error { 850 _, err := c.PpsAPIClient.CreatePipeline( 851 c.Ctx(), 852 &pps.CreatePipelineRequest{ 853 Pipeline: NewPipeline(name), 854 Metadata: &pps.Metadata{ 855 Annotations: annotations, 856 }, 857 Transform: &pps.Transform{ 858 Image: image, 859 Cmd: cmd, 860 Stdin: stdin, 861 }, 862 ParallelismSpec: parallelismSpec, 863 Input: input, 864 Update: update, 865 Service: &pps.Service{ 866 InternalPort: internalPort, 867 ExternalPort: externalPort, 868 }, 869 }, 870 ) 871 return grpcutil.ScrubGRPC(err) 872 } 873 874 // GarbageCollect garbage collects unused data. Currently GC needs to be run 875 // while no data is being added or removed (which, among other things, implies 876 // that there shouldn't be jobs actively running). Pfs Garbage collection uses 877 // bloom filters to keep track of live objects because it can store more 878 // objects than can be indexed in memory. 
This means that there is a chance for 879 // unreferenced objects to not be GCed, this chance increases as the number of 880 // objects in the system increases. You can tradeoff using more memory to get a 881 // lower chance of collisions, the default value is 10 MB and collisions should 882 // be unlikely until you have 10 million objects. 883 func (c APIClient) GarbageCollect(memoryBytes int64) error { 884 _, err := c.PpsAPIClient.GarbageCollect( 885 c.Ctx(), 886 &pps.GarbageCollectRequest{MemoryBytes: memoryBytes}, 887 ) 888 return grpcutil.ScrubGRPC(err) 889 } 890 891 // GetDatumTotalTime sums the timing stats from a DatumInfo 892 func GetDatumTotalTime(s *pps.ProcessStats) time.Duration { 893 totalDuration := time.Duration(0) 894 duration, _ := types.DurationFromProto(s.DownloadTime) 895 totalDuration += duration 896 duration, _ = types.DurationFromProto(s.ProcessTime) 897 totalDuration += duration 898 duration, _ = types.DurationFromProto(s.UploadTime) 899 totalDuration += duration 900 return totalDuration 901 }