github.com/proppy/fleet@v0.1.4/agent/agent.go (about)

     1  package agent
     2  
     3  import (
     4  	"fmt"
     5  	"strings"
     6  	"time"
     7  
     8  	log "github.com/coreos/fleet/third_party/github.com/golang/glog"
     9  
    10  	"github.com/coreos/fleet/event"
    11  	"github.com/coreos/fleet/job"
    12  	"github.com/coreos/fleet/machine"
    13  	"github.com/coreos/fleet/registry"
    14  	"github.com/coreos/fleet/systemd"
    15  )
    16  
const (
	// DefaultTTL is the TTL applied to all state pushed to the Registry.
	DefaultTTL = "30s"

	// refreshInterval is a divisor, not a duration: heartbeats fire at
	// ttl / refreshInterval, i.e. TTLs are refreshed at 1/2 the TTL length.
	refreshInterval = 2
)
    24  
// The Agent owns all of the coordination between the Registry, the local
// Machine, and the local SystemdManager.
//
// NOTE: field order matters — New constructs an Agent with a positional
// struct literal, so reordering fields here silently changes its meaning.
type Agent struct {
	registry      *registry.Registry     // cluster-wide state store
	events        *event.EventBus        // bus the Agent subscribes to in Run
	machine       *machine.Machine       // the local machine this Agent represents
	ttl           time.Duration          // TTL used for all Registry presence data
	systemdPrefix string                 // prefix applied to managed systemd units

	state   *AgentState             // local bookkeeping: offers, bids, peers, conflicts
	systemd *systemd.SystemdManager // manages the units backing Jobs

	// channel used to shutdown any open connections/channels the Agent holds;
	// nil until Run is called, closed by Stop.
	stop chan bool
}
    40  
    41  func New(registry *registry.Registry, events *event.EventBus, machine *machine.Machine, ttl, unitPrefix string) (*Agent, error) {
    42  	ttldur, err := time.ParseDuration(ttl)
    43  	if err != nil {
    44  		return nil, err
    45  	}
    46  
    47  	state := NewState()
    48  	mgr := systemd.NewSystemdManager(machine, unitPrefix)
    49  
    50  	return &Agent{registry, events, machine, ttldur, unitPrefix, state, mgr, nil}, nil
    51  }
    52  
    53  // Access Agent's machine field
    54  func (a *Agent) Machine() *machine.Machine {
    55  	return a.machine
    56  }
    57  
    58  // Trigger all async processes the Agent intends to run
    59  func (a *Agent) Run() {
    60  	a.stop = make(chan bool)
    61  
    62  	handler := NewEventHandler(a)
    63  	a.events.AddListener("agent", a.machine, handler)
    64  
    65  	go a.systemd.Publish(a.events, a.stop)
    66  	go a.Heartbeat(a.ttl, a.stop)
    67  
    68  	// Block until we receive a stop signal
    69  	<-a.stop
    70  
    71  	a.events.RemoveListener("agent", a.machine)
    72  }
    73  
// Stop halts all async processes the Agent is running by closing the
// stop channel created in Run; Run then unblocks and cleans up.
func (a *Agent) Stop() {
	log.V(1).Info("Stopping Agent")
	close(a.stop)
}
    79  
    80  // Clear any presence data from the Registry
    81  func (a *Agent) Purge() {
    82  	log.V(1).Info("Removing Agent from Registry")
    83  	bootId := a.machine.State().BootId
    84  	err := a.registry.RemoveMachineState(bootId)
    85  	if err != nil {
    86  		log.Errorf("Failed to remove Machine %s from Registry: %s", bootId, err.Error())
    87  	}
    88  
    89  	for _, j := range a.registry.GetAllJobsByMachine(bootId) {
    90  		log.V(1).Infof("Clearing JobState(%s) from Registry", j.Name)
    91  		a.registry.RemoveJobState(j.Name)
    92  
    93  		offer := job.NewOfferFromJob(j)
    94  		log.V(2).Infof("Publishing JobOffer(%s)", offer.Job.Name)
    95  		a.registry.CreateJobOffer(offer)
    96  		log.Infof("Published JobOffer(%s)", offer.Job.Name)
    97  	}
    98  }
    99  
   100  // Periodically report to the Registry at an interval equal to
   101  // half of the provided ttl. Stop reporting when the provided
   102  // channel is closed.
   103  func (a *Agent) Heartbeat(ttl time.Duration, stop chan bool) {
   104  
   105  	// Explicitly heartbeat immediately to push state to the
   106  	// Registry as quickly as possible
   107  	a.machine.RefreshState()
   108  	a.registry.SetMachineState(a.machine.State(), a.ttl)
   109  
   110  	interval := ttl / refreshInterval
   111  	for true {
   112  		select {
   113  		case <-stop:
   114  			log.V(2).Info("MachineHeartbeat exiting due to stop signal")
   115  			return
   116  		case <-time.Tick(interval):
   117  			log.V(2).Info("MachineHeartbeat tick")
   118  			a.machine.RefreshState()
   119  			err := a.registry.SetMachineState(a.machine.State(), a.ttl)
   120  			if err != nil {
   121  				log.Errorf("MachineHeartbeat failed: %v", err)
   122  			}
   123  		}
   124  	}
   125  }
   126  
   127  // Instruct the Agent to start the provided Job
   128  func (a *Agent) StartJob(j *job.Job) {
   129  	a.state.TrackJobConflicts(j.Name, j.Payload.Conflicts())
   130  
   131  	log.Infof("Starting Job(%s)", j.Name)
   132  	a.systemd.StartJob(j)
   133  }
   134  
   135  // Inform the Registry that a Job must be rescheduled
   136  func (a *Agent) RescheduleJob(j *job.Job) {
   137  	log.V(2).Infof("Stopping Job(%s)", j.Name)
   138  	a.registry.UnscheduleJob(j.Name)
   139  
   140  	offer := job.NewOfferFromJob(*j)
   141  	log.V(2).Infof("Publishing JobOffer(%s)", offer.Job.Name)
   142  	a.registry.CreateJobOffer(offer)
   143  	log.Infof("Published JobOffer(%s)", offer.Job.Name)
   144  }
   145  
   146  // Instruct the Agent to stop the provided Job and
   147  // all of its peers
   148  func (a *Agent) StopJob(jobName string) {
   149  	log.Infof("Stopping Job(%s)", jobName)
   150  	a.systemd.StopJob(jobName)
   151  	a.ReportJobState(jobName, nil)
   152  
   153  	a.state.Lock()
   154  	reversePeers := a.state.GetJobsByPeer(jobName)
   155  	a.state.DropPeersJob(jobName)
   156  	a.state.DropJobConflicts(jobName)
   157  	a.state.Unlock()
   158  
   159  	for _, peer := range reversePeers {
   160  		log.Infof("Stopping Peer(%s) of Job(%s)", peer, jobName)
   161  		a.registry.StopJob(peer)
   162  	}
   163  }
   164  
   165  // Persist the state of the given Job into the Registry
   166  func (a *Agent) ReportJobState(jobName string, jobState *job.JobState) {
   167  	if jobState == nil {
   168  		err := a.registry.RemoveJobState(jobName)
   169  		if err != nil {
   170  			log.V(1).Infof("Failed to remove JobState from Registry: %s", jobName, err.Error())
   171  		}
   172  	} else {
   173  		a.registry.SaveJobState(jobName, jobState)
   174  	}
   175  }
   176  
   177  // Submit all possible bids for unresolved offers
   178  func (a *Agent) BidForPossibleJobs() {
   179  	a.state.Lock()
   180  	offers := a.state.GetOffersWithoutBids()
   181  	a.state.Unlock()
   182  
   183  	log.V(2).Infof("Checking %d unbade offers", len(offers))
   184  	for i, _ := range offers {
   185  		offer := offers[i]
   186  		log.V(2).Infof("Checking ability to run Job(%s)", offer.Job.Name)
   187  		if a.AbleToRun(&offer.Job) {
   188  			log.V(2).Infof("Able to run Job(%s), submitting bid", offer.Job.Name)
   189  			a.Bid(offer.Job.Name)
   190  		} else {
   191  			log.V(2).Infof("Still unable to run Job(%s)", offer.Job.Name)
   192  		}
   193  	}
   194  }
   195  
   196  // Submit a bid for the given Job
   197  func (a *Agent) Bid(jobName string) {
   198  	log.Infof("Submitting JobBid for Job(%s)", jobName)
   199  
   200  	jb := job.NewBid(jobName, a.machine.State().BootId)
   201  	a.registry.SubmitJobBid(jb)
   202  
   203  	a.state.Lock()
   204  	defer a.state.Unlock()
   205  
   206  	a.state.TrackBid(jb.JobName)
   207  }
   208  
   209  // Instruct the Agent that an offer has been created and must
   210  // be tracked until it is resolved
   211  func (a *Agent) TrackOffer(jo job.JobOffer) {
   212  	a.state.Lock()
   213  	defer a.state.Unlock()
   214  
   215  	log.V(2).Infof("Tracking JobOffer(%s)", jo.Job.Name)
   216  	a.state.TrackOffer(jo)
   217  
   218  	peers := jo.Job.Payload.Peers()
   219  	log.V(2).Infof("Tracking peers of JobOffer(%s): %v", jo.Job.Name, peers)
   220  	a.state.TrackJobPeers(jo.Job.Name, jo.Job.Payload.Peers())
   221  }
   222  
   223  // Instruct the Agent that the given offer has been resolved
   224  // and may be ignored in future conflict calculations
   225  func (a *Agent) OfferResolved(jobName string) {
   226  	a.state.Lock()
   227  	defer a.state.Unlock()
   228  
   229  	log.V(2).Infof("Dropping JobOffer(%s)", jobName)
   230  	a.state.DropOffer(jobName)
   231  
   232  	a.state.DropBid(jobName)
   233  }
   234  
   235  // Pull a Job and its payload from the Registry
   236  func (a *Agent) FetchJob(jobName string) *job.Job {
   237  	log.V(1).Infof("Fetching Job(%s) from Registry", jobName)
   238  	j := a.registry.GetJob(jobName)
   239  	if j == nil {
   240  		log.V(1).Infof("Job not found in Registry")
   241  	}
   242  	return j
   243  }
   244  
   245  // Submit all possible bids for known peers of the provided job
   246  func (a *Agent) BidForPossiblePeers(jobName string) {
   247  	a.state.Lock()
   248  	peers := a.state.GetJobsByPeer(jobName)
   249  	a.state.Unlock()
   250  
   251  	for _, peer := range peers {
   252  		log.V(1).Infof("Found unresolved offer for Peer(%s) of Job(%s)", peer, jobName)
   253  
   254  		peerJob := a.FetchJob(peer)
   255  		if peerJob != nil && a.AbleToRun(peerJob) {
   256  			log.Infof("Submitting bid for Peer(%s) of Job(%s)", peer, jobName)
   257  			a.Bid(peer)
   258  		} else {
   259  			log.V(1).Infof("Unable to bid for Peer(%s) of Job(%s)", peer, jobName)
   260  		}
   261  	}
   262  }
   263  
   264  // Determine if the Agent can run the provided Job
   265  func (a *Agent) AbleToRun(j *job.Job) bool {
   266  	requirements := j.Requirements()
   267  	if len(requirements) == 0 {
   268  		log.V(1).Infof("Job(%s) has no requirements", j.Name)
   269  		return true
   270  	}
   271  
   272  	var reqString string
   273  	for key, slice := range requirements {
   274  		reqString += fmt.Sprintf("%s = [", key)
   275  		for _, val := range slice {
   276  			reqString += fmt.Sprintf("%s, ", val)
   277  		}
   278  		reqString += fmt.Sprint("] ")
   279  	}
   280  
   281  	log.V(1).Infof("Job(%s) has requirements: %s", j.Name, reqString)
   282  
   283  	metadata := extractMachineMetadata(requirements)
   284  	log.V(1).Infof("Job(%s) requires machine metadata: %v", j.Name, metadata)
   285  	if !a.machine.HasMetadata(metadata) {
   286  		log.V(1).Infof("Unable to run Job(%s), local Machine metadata insufficient", j.Name)
   287  		return false
   288  	}
   289  
   290  	bootID, ok := requirements["ConditionMachineBootID"]
   291  	if ok && len(bootID) > 0 && a.machine.State().BootId == bootID[0] {
   292  		log.V(1).Infof("Agent does not pass MachineBootID condition for Job(%s)", j.Name)
   293  		return false
   294  	}
   295  
   296  	peers := j.Payload.Peers()
   297  	if len(peers) > 0 {
   298  		log.V(1).Infof("Asserting required Peers %v of Job(%s) are scheduled locally", peers, j.Name)
   299  		for _, peer := range peers {
   300  			if !a.peerScheduledHere(j.Name, peer) {
   301  				log.V(1).Infof("Required Peer(%s) of Job(%s) is not scheduled locally", peer, j.Name)
   302  				return false
   303  			}
   304  		}
   305  	} else {
   306  		log.V(2).Infof("Job(%s) has no peers to worry about", j.Name)
   307  	}
   308  
   309  	if conflicted, conflictedJobName := a.state.HasConflict(j.Name, j.Payload.Conflicts()); conflicted {
   310  		log.V(1).Infof("Job(%s) has conflict with Job(%s)", j.Name, conflictedJobName)
   311  		return false
   312  	}
   313  
   314  	return true
   315  }
   316  
   317  // Return all machine-related metadata from a job requirements map
   318  func extractMachineMetadata(requirements map[string][]string) map[string][]string {
   319  	metadata := make(map[string][]string)
   320  
   321  	for key, values := range requirements {
   322  		if !strings.HasPrefix(key, "MachineMetadata") {
   323  			continue
   324  		}
   325  
   326  		// Strip off leading 'MachineMetadata'
   327  		key = key[15:]
   328  
   329  		if len(values) == 0 {
   330  			log.V(2).Infof("Machine metadata requirement %s provided no values, ignoring.", key)
   331  			continue
   332  		}
   333  
   334  		metadata[key] = values
   335  	}
   336  
   337  	return metadata
   338  }
   339  
   340  // Determine if all necessary peers of a Job are scheduled to this Agent
   341  func (a *Agent) peerScheduledHere(jobName, peerName string) bool {
   342  	log.V(1).Infof("Looking for target of Peer(%s)", peerName)
   343  
   344  	//FIXME: ideally the machine would use its own knowledge rather than calling GetJobTarget
   345  	if tgt := a.registry.GetJobTarget(peerName); tgt == nil || tgt.BootId != a.machine.State().BootId {
   346  		log.V(1).Infof("Peer(%s) of Job(%s) not scheduled here", peerName, jobName)
   347  		return false
   348  	}
   349  
   350  	log.V(1).Infof("Peer(%s) of Job(%s) scheduled here", peerName, jobName)
   351  	return true
   352  }
   353  
// UnresolvedJobOffers returns every JobOffer in the Registry that has
// not yet been resolved.
func (a *Agent) UnresolvedJobOffers() []job.JobOffer {
	return a.registry.UnresolvedJobOffers()
}