package worker

import (
	"context"
	"fmt"
	"os"
	"path"
	"time"

	etcd "github.com/coreos/etcd/clientv3"
	docker "github.com/fsouza/go-dockerclient"
	"golang.org/x/sync/errgroup"

	"github.com/pachyderm/pachyderm/src/client"
	"github.com/pachyderm/pachyderm/src/client/auth"
	"github.com/pachyderm/pachyderm/src/client/pkg/errors"
	"github.com/pachyderm/pachyderm/src/client/pps"
	"github.com/pachyderm/pachyderm/src/server/pkg/backoff"
	"github.com/pachyderm/pachyderm/src/server/pkg/dlock"
	"github.com/pachyderm/pachyderm/src/server/pkg/ppsutil"
	"github.com/pachyderm/pachyderm/src/server/pkg/watch"
	"github.com/pachyderm/pachyderm/src/server/pkg/work"
	"github.com/pachyderm/pachyderm/src/server/worker/driver"
	"github.com/pachyderm/pachyderm/src/server/worker/logs"
	"github.com/pachyderm/pachyderm/src/server/worker/pipeline/service"
	"github.com/pachyderm/pachyderm/src/server/worker/pipeline/spout"
	"github.com/pachyderm/pachyderm/src/server/worker/pipeline/transform"
	"github.com/pachyderm/pachyderm/src/server/worker/server"
	"github.com/pachyderm/pachyderm/src/server/worker/stats"
)

const (
	// masterLockPath is the etcd key segment under which the distributed
	// master lock for a pipeline's workers is taken (see Worker.master).
	masterLockPath = "_master_worker_lock"
)

// Worker bundles everything a single worker process runs: the gRPC API
// server, the shared driver, and the status handle for the task currently
// being processed (used by the API server to report on and cancel work).
type Worker struct {
	APIServer *server.APIServer // Provides rpcs for other nodes in the cluster
	driver    driver.Driver     // Provides common functions used by worker code
	status    *transform.Status // An interface for inspecting and canceling the actively running task
}

// NewWorker constructs a Worker object that provides all worker functionality:
//  1. a master goroutine that attempts to obtain the master lock for the pipeline workers and direct jobs
//  2. a worker goroutine that gets tasks from the master and processes them
//  3. an api server that serves requests for status or cross-worker communication
//  4. a driver that provides common functionality between the above components
func NewWorker(
	pachClient *client.APIClient,
	etcdClient *etcd.Client,
	etcdPrefix string,
	pipelineInfo *pps.PipelineInfo,
	workerName string,
	namespace string,
	hashtreePath string,
	rootPath string,
) (*Worker, error) {
	stats.InitPrometheus()

	// Detect whether the host docker socket is available; if not, the
	// image-inspection step below is skipped entirely.
	hasDocker := true
	if _, err := os.Stat("/var/run/docker.sock"); err != nil {
		hasDocker = false
	}

	driver, err := driver.NewDriver(
		pipelineInfo,
		pachClient,
		etcdClient,
		etcdPrefix,
		hashtreePath,
		rootPath,
		namespace,
	)
	if err != nil {
		return nil, err
	}

	// Fill in unset transform fields (user, working dir, cmd) from the
	// pipeline image's docker config, when the image is inspectable.
	if pipelineInfo.Transform.Image != "" && hasDocker {
		docker, err := docker.NewClientFromEnv()
		if err != nil {
			return nil, err
		}
		image, err := docker.InspectImage(pipelineInfo.Transform.Image)
		if err != nil {
			return nil, errors.Wrapf(err, "error inspecting image %s", pipelineInfo.Transform.Image)
		}
		if pipelineInfo.Transform.User == "" {
			pipelineInfo.Transform.User = image.Config.User
		}
		if pipelineInfo.Transform.WorkingDir == "" {
			pipelineInfo.Transform.WorkingDir = image.Config.WorkingDir
		}
		if pipelineInfo.Transform.Cmd == nil {
			if len(image.Config.Entrypoint) == 0 {
				// NOTE(review): the error returned by FailPipeline is
				// dropped and construction continues with an empty Cmd —
				// confirm this best-effort behavior is intentional.
				ppsutil.FailPipeline(pachClient.Ctx(), etcdClient, driver.Pipelines(),
					pipelineInfo.Pipeline.Name,
					"nothing to run: no transform.cmd and no entrypoint")
			}
			pipelineInfo.Transform.Cmd = image.Config.Entrypoint
		}
	}

	worker := &Worker{
		driver: driver,
		status: &transform.Status{},
	}

	worker.APIServer = server.NewAPIServer(driver, worker.status, workerName)

	// These goroutines run for the life of the process; they retry
	// internally and are canceled via the pachClient's context.
	go worker.master(etcdClient, etcdPrefix)
	go worker.worker()
	return worker, nil
}

// worker runs the worker-side loop: it watches job entries in etcd to evict
// hashtree cache entries for deleted/terminal jobs, and runs subtasks handed
// out by the master. Both goroutines are restarted together (with a constant
// 200ms backoff) if either fails, until the client context is canceled.
func (w *Worker) worker() {
	ctx := w.driver.PachClient().Ctx()
	logger := logs.NewStatlessLogger(w.driver.PipelineInfo())

	backoff.RetryUntilCancel(ctx, func() error {
		// The errgroup context cancels the sibling goroutine when either
		// the watch or the task worker returns an error.
		eg, ctx := errgroup.WithContext(ctx)
		driver := w.driver.WithContext(ctx)

		// Clean the driver hashtree cache for any jobs that are deleted
		eg.Go(func() error {
			return driver.Jobs().ReadOnly(ctx).WatchF(func(e *watch.Event) error {
				var key string
				jobInfo := &pps.EtcdJobInfo{}
				if err := e.Unmarshal(&key, jobInfo); err != nil {
					return err
				}

				// Evict caches once a job is gone or in a terminal state.
				if e.Type == watch.EventDelete || (e.Type == watch.EventPut && ppsutil.IsTerminal(jobInfo.State)) {
					driver.ChunkCaches().RemoveCache(key)
					driver.ChunkStatsCaches().RemoveCache(key)
				}
				return nil
			})
		})

		// Run any worker tasks that the master creates
		eg.Go(func() error {
			return driver.NewTaskWorker().Run(
				ctx,
				func(ctx context.Context, subtask *work.Task) error {
					// Rebind the driver to the per-subtask context so that
					// canceling the subtask cancels its operations.
					driver := w.driver.WithContext(ctx)
					return transform.Worker(driver, logger, subtask, w.status)
				},
			)
		})

		return eg.Wait()
	}, backoff.NewConstantBackOff(200*time.Millisecond), func(err error, d time.Duration) error {
		// Include a stack trace in the log when the error carries one.
		if st, ok := err.(errors.StackTracer); ok {
			logger.Logf("worker failed, retrying in %v:\n%s\n%+v", d, err, st.StackTrace())
		} else {
			logger.Logf("worker failed, retrying in %v:\n%s", d, err)
		}
		// Returning nil tells backoff to keep retrying.
		return nil
	})
}

// master competes for the pipeline's distributed master lock in etcd; the
// winner runs the spawner loop (runSpawner) that directs jobs for this
// pipeline. Auth rejections fail the pipeline outright; all other errors are
// logged and retried indefinitely.
func (w *Worker) master(etcdClient *etcd.Client, etcdPrefix string) {
	pipelineInfo := w.driver.PipelineInfo()
	logger := logs.NewMasterLogger(pipelineInfo)
	// The lock is scoped by pipeline name and salt, so workers from a
	// different incarnation of the pipeline do not contend for it.
	lockPath := path.Join(etcdPrefix, masterLockPath, pipelineInfo.Pipeline.Name, pipelineInfo.Salt)
	masterLock := dlock.NewDLock(etcdClient, lockPath)

	b := backoff.NewInfiniteBackOff()
	// Setting a high backoff so that when this master fails, the other
	// workers are more likely to become the master.
	// Also, we've observed race conditions where StopPipeline would cause
	// a master to restart before it's deleted. PPS would then get confused
	// by the restart and create the workers again, because the restart would
	// bring the pipeline state from PAUSED to RUNNING. By setting a high
	// retry interval, the master would be deleted before it gets a chance
	// to restart.
	b.InitialInterval = 10 * time.Second
	backoff.RetryNotify(func() error {
		// We use pachClient.Ctx here because it contains auth information.
		ctx, cancel := context.WithCancel(w.driver.PachClient().Ctx())
		defer cancel() // make sure that everything this loop might spawn gets cleaned up
		ctx, err := masterLock.Lock(ctx)
		if err != nil {
			return err
		}
		defer masterLock.Unlock(ctx)

		// Create a new driver that uses a new cancelable pachClient
		return runSpawner(w.driver.WithContext(ctx), logger)
	}, b, func(err error, d time.Duration) error {
		if auth.IsErrNotAuthorized(err) {
			// An auth rejection will not resolve by retrying, so mark the
			// pipeline failed instead of looping forever.
			logger.Logf("failing %q due to auth rejection", pipelineInfo.Pipeline.Name)
			return ppsutil.FailPipeline(
				w.driver.PachClient().Ctx(),
				etcdClient,
				w.driver.Pipelines(),
				pipelineInfo.Pipeline.Name,
				"worker master could not access output repo to watch for new commits",
			)
		}
		logger.Logf("master: error running the master process, retrying in %v: %v", d, err)
		// Returning nil tells backoff to keep retrying.
		return nil
	})
}

// spawnerFunc is the signature shared by the per-pipeline-type spawner
// entry points (service.Run, spout.Run, transform.Run).
type spawnerFunc func(driver.Driver, logs.TaggedLogger) error

// runSpawner runs the spawner for a given pipeline. This switches between
// several underlying functions based on the configuration in pipelineInfo
// (e.g. if it is a service, a spout, or a transform pipeline).
func runSpawner(driver driver.Driver, logger logs.TaggedLogger) error {
	pipelineType, runFn := func() (string, spawnerFunc) {
		switch {
		case driver.PipelineInfo().Service != nil:
			return "service", service.Run
		case driver.PipelineInfo().Spout != nil:
			return "spout", spout.Run
		default:
			return "transform", transform.Run
		}
	}()

	return logger.LogStep(fmt.Sprintf("%v spawner process", pipelineType), func() error {
		return runFn(driver, logger)
	})
}