github.com/pachyderm/pachyderm@v1.13.4/src/server/worker/worker.go (about)

     1  package worker
     2  
     3  import (
     4  	"context"
     5  	"fmt"
     6  	"os"
     7  	"path"
     8  	"time"
     9  
    10  	etcd "github.com/coreos/etcd/clientv3"
    11  	docker "github.com/fsouza/go-dockerclient"
    12  	"golang.org/x/sync/errgroup"
    13  
    14  	"github.com/pachyderm/pachyderm/src/client"
    15  	"github.com/pachyderm/pachyderm/src/client/auth"
    16  	"github.com/pachyderm/pachyderm/src/client/pkg/errors"
    17  	"github.com/pachyderm/pachyderm/src/client/pps"
    18  	"github.com/pachyderm/pachyderm/src/server/pkg/backoff"
    19  	"github.com/pachyderm/pachyderm/src/server/pkg/dlock"
    20  	"github.com/pachyderm/pachyderm/src/server/pkg/ppsutil"
    21  	"github.com/pachyderm/pachyderm/src/server/pkg/watch"
    22  	"github.com/pachyderm/pachyderm/src/server/pkg/work"
    23  	"github.com/pachyderm/pachyderm/src/server/worker/driver"
    24  	"github.com/pachyderm/pachyderm/src/server/worker/logs"
    25  	"github.com/pachyderm/pachyderm/src/server/worker/pipeline/service"
    26  	"github.com/pachyderm/pachyderm/src/server/worker/pipeline/spout"
    27  	"github.com/pachyderm/pachyderm/src/server/worker/pipeline/transform"
    28  	"github.com/pachyderm/pachyderm/src/server/worker/server"
    29  	"github.com/pachyderm/pachyderm/src/server/worker/stats"
    30  )
    31  
const (
	// masterLockPath is the etcd path segment under which the per-pipeline
	// master lock is stored; the full path also includes the pipeline name
	// and salt (see Worker.master).
	masterLockPath = "_master_worker_lock"
)
    35  
// Worker represents a single worker process in a pipeline: it bundles the
// gRPC API server, the shared driver, and the status of the currently
// running task.
type Worker struct {
	APIServer *server.APIServer // Provides rpcs for other nodes in the cluster
	driver    driver.Driver     // Provides common functions used by worker code
	status    *transform.Status // An interface for inspecting and canceling the actively running task
}
    42  
    43  // NewWorker constructs a Worker object that provides all worker functionality:
    44  //  1. a master goroutine that attempts to obtain the master lock for the pipeline workers and direct jobs
    45  //  2. a worker goroutine that gets tasks from the master and processes them
    46  //  3. an api server that serves requests for status or cross-worker communication
    47  //  4. a driver that provides common functionality between the above components
    48  func NewWorker(
    49  	pachClient *client.APIClient,
    50  	etcdClient *etcd.Client,
    51  	etcdPrefix string,
    52  	pipelineInfo *pps.PipelineInfo,
    53  	workerName string,
    54  	namespace string,
    55  	hashtreePath string,
    56  	rootPath string,
    57  ) (*Worker, error) {
    58  	stats.InitPrometheus()
    59  
    60  	hasDocker := true
    61  	if _, err := os.Stat("/var/run/docker.sock"); err != nil {
    62  		hasDocker = false
    63  	}
    64  
    65  	driver, err := driver.NewDriver(
    66  		pipelineInfo,
    67  		pachClient,
    68  		etcdClient,
    69  		etcdPrefix,
    70  		hashtreePath,
    71  		rootPath,
    72  		namespace,
    73  	)
    74  	if err != nil {
    75  		return nil, err
    76  	}
    77  
    78  	if pipelineInfo.Transform.Image != "" && hasDocker {
    79  		docker, err := docker.NewClientFromEnv()
    80  		if err != nil {
    81  			return nil, err
    82  		}
    83  		image, err := docker.InspectImage(pipelineInfo.Transform.Image)
    84  		if err != nil {
    85  			return nil, errors.Wrapf(err, "error inspecting image %s", pipelineInfo.Transform.Image)
    86  		}
    87  		if pipelineInfo.Transform.User == "" {
    88  			pipelineInfo.Transform.User = image.Config.User
    89  		}
    90  		if pipelineInfo.Transform.WorkingDir == "" {
    91  			pipelineInfo.Transform.WorkingDir = image.Config.WorkingDir
    92  		}
    93  		if pipelineInfo.Transform.Cmd == nil {
    94  			if len(image.Config.Entrypoint) == 0 {
    95  				ppsutil.FailPipeline(pachClient.Ctx(), etcdClient, driver.Pipelines(),
    96  					pipelineInfo.Pipeline.Name,
    97  					"nothing to run: no transform.cmd and no entrypoint")
    98  			}
    99  			pipelineInfo.Transform.Cmd = image.Config.Entrypoint
   100  		}
   101  	}
   102  
   103  	worker := &Worker{
   104  		driver: driver,
   105  		status: &transform.Status{},
   106  	}
   107  
   108  	worker.APIServer = server.NewAPIServer(driver, worker.status, workerName)
   109  
   110  	go worker.master(etcdClient, etcdPrefix)
   111  	go worker.worker()
   112  	return worker, nil
   113  }
   114  
// worker runs the long-lived per-node worker loop.  It starts two goroutines
// under an errgroup:
//  1. a watcher on the jobs collection that evicts the hashtree chunk caches
//     of any job that is deleted or reaches a terminal state, and
//  2. a task worker that processes the subtasks created by the master.
// On any error it retries after a constant 200ms backoff until the driver's
// pach client context is canceled.
func (w *Worker) worker() {
	ctx := w.driver.PachClient().Ctx()
	logger := logs.NewStatlessLogger(w.driver.PipelineInfo())

	backoff.RetryUntilCancel(ctx, func() error {
		// ctx is deliberately shadowed: both goroutines below are canceled
		// as soon as either one returns an error.
		eg, ctx := errgroup.WithContext(ctx)
		driver := w.driver.WithContext(ctx)

		// Clean the driver hashtree cache for any jobs that are deleted
		eg.Go(func() error {
			return driver.Jobs().ReadOnly(ctx).WatchF(func(e *watch.Event) error {
				var key string
				jobInfo := &pps.EtcdJobInfo{}
				if err := e.Unmarshal(&key, jobInfo); err != nil {
					return err
				}

				// A deleted or terminal job can no longer use its cached
				// hashtrees, so free both the data and stats caches.
				if e.Type == watch.EventDelete || (e.Type == watch.EventPut && ppsutil.IsTerminal(jobInfo.State)) {
					driver.ChunkCaches().RemoveCache(key)
					driver.ChunkStatsCaches().RemoveCache(key)
				}
				return nil
			})
		})

		// Run any worker tasks that the master creates
		eg.Go(func() error {
			return driver.NewTaskWorker().Run(
				ctx,
				func(ctx context.Context, subtask *work.Task) error {
					// Rebind the driver to the subtask's context so the
					// subtask can be canceled independently of the loop.
					driver := w.driver.WithContext(ctx)
					return transform.Worker(driver, logger, subtask, w.status)
				},
			)
		})

		return eg.Wait()
	}, backoff.NewConstantBackOff(200*time.Millisecond), func(err error, d time.Duration) error {
		// Log a stack trace when the error carries one; either way, keep
		// retrying (returning nil continues the retry loop).
		if st, ok := err.(errors.StackTracer); ok {
			logger.Logf("worker failed, retrying in %v:\n%s\n%+v", d, err, st.StackTrace())
		} else {
			logger.Logf("worker failed, retrying in %v:\n%s", d, err)
		}
		return nil
	})
}
   161  
// master runs the long-lived master election loop.  Each attempt acquires
// the distributed master lock for this pipeline (the lock path includes the
// pipeline name and salt), runs the spawner while holding it, and releases
// the lock when the spawner returns.
func (w *Worker) master(etcdClient *etcd.Client, etcdPrefix string) {
	pipelineInfo := w.driver.PipelineInfo()
	logger := logs.NewMasterLogger(pipelineInfo)
	lockPath := path.Join(etcdPrefix, masterLockPath, pipelineInfo.Pipeline.Name, pipelineInfo.Salt)
	masterLock := dlock.NewDLock(etcdClient, lockPath)

	b := backoff.NewInfiniteBackOff()
	// Setting a high backoff so that when this master fails, the other
	// workers are more likely to become the master.
	// Also, we've observed race conditions where StopPipeline would cause
	// a master to restart before it's deleted.  PPS would then get confused
	// by the restart and create the workers again, because the restart would
	// bring the pipeline state from PAUSED to RUNNING.  By setting a high
	// retry interval, the master would be deleted before it gets a chance
	// to restart.
	b.InitialInterval = 10 * time.Second
	backoff.RetryNotify(func() error {
		// We use pachClient.Ctx here because it contains auth information.
		ctx, cancel := context.WithCancel(w.driver.PachClient().Ctx())
		defer cancel() // make sure that everything this loop might spawn gets cleaned up
		// NOTE(review): Lock returns a derived context — presumably canceled
		// if the lock session is lost; confirm against dlock's contract.
		ctx, err := masterLock.Lock(ctx)
		if err != nil {
			return err
		}
		defer masterLock.Unlock(ctx)

		// Create a new driver that uses a new cancelable pachClient
		return runSpawner(w.driver.WithContext(ctx), logger)
	}, b, func(err error, d time.Duration) error {
		// An auth rejection is treated as permanent: fail the pipeline
		// rather than retrying forever.
		if auth.IsErrNotAuthorized(err) {
			logger.Logf("failing %q due to auth rejection", pipelineInfo.Pipeline.Name)
			return ppsutil.FailPipeline(
				w.driver.PachClient().Ctx(),
				etcdClient,
				w.driver.Pipelines(),
				pipelineInfo.Pipeline.Name,
				"worker master could not access output repo to watch for new commits",
			)
		}
		// Returning nil keeps the retry loop going.
		logger.Logf("master: error running the master process, retrying in %v: %v", d, err)
		return nil
	})
}
   205  
   206  type spawnerFunc func(driver.Driver, logs.TaggedLogger) error
   207  
   208  // Run runs the spawner for a given pipeline.  This switches between several
   209  // underlying functions based on the configuration in pipelineInfo (e.g. if
   210  // it is a service, a spout, or a transform pipeline).
   211  func runSpawner(driver driver.Driver, logger logs.TaggedLogger) error {
   212  	pipelineType, runFn := func() (string, spawnerFunc) {
   213  		switch {
   214  		case driver.PipelineInfo().Service != nil:
   215  			return "service", service.Run
   216  		case driver.PipelineInfo().Spout != nil:
   217  			return "spout", spout.Run
   218  		default:
   219  			return "transform", transform.Run
   220  		}
   221  	}()
   222  
   223  	return logger.LogStep(fmt.Sprintf("%v spawner process", pipelineType), func() error {
   224  		return runFn(driver, logger)
   225  	})
   226  }