github.com/proppy/fleet@v0.1.4/agent/agent.go (about) 1 package agent 2 3 import ( 4 "fmt" 5 "strings" 6 "time" 7 8 log "github.com/coreos/fleet/third_party/github.com/golang/glog" 9 10 "github.com/coreos/fleet/event" 11 "github.com/coreos/fleet/job" 12 "github.com/coreos/fleet/machine" 13 "github.com/coreos/fleet/registry" 14 "github.com/coreos/fleet/systemd" 15 ) 16 17 const ( 18 // TTL to use with all state pushed to Registry 19 DefaultTTL = "30s" 20 21 // Refresh TTLs at 1/2 the TTL length 22 refreshInterval = 2 23 ) 24 25 // The Agent owns all of the coordination between the Registry, the local 26 // Machine, and the local SystemdManager. 27 type Agent struct { 28 registry *registry.Registry 29 events *event.EventBus 30 machine *machine.Machine 31 ttl time.Duration 32 systemdPrefix string 33 34 state *AgentState 35 systemd *systemd.SystemdManager 36 37 // channel used to shutdown any open connections/channels the Agent holds 38 stop chan bool 39 } 40 41 func New(registry *registry.Registry, events *event.EventBus, machine *machine.Machine, ttl, unitPrefix string) (*Agent, error) { 42 ttldur, err := time.ParseDuration(ttl) 43 if err != nil { 44 return nil, err 45 } 46 47 state := NewState() 48 mgr := systemd.NewSystemdManager(machine, unitPrefix) 49 50 return &Agent{registry, events, machine, ttldur, unitPrefix, state, mgr, nil}, nil 51 } 52 53 // Access Agent's machine field 54 func (a *Agent) Machine() *machine.Machine { 55 return a.machine 56 } 57 58 // Trigger all async processes the Agent intends to run 59 func (a *Agent) Run() { 60 a.stop = make(chan bool) 61 62 handler := NewEventHandler(a) 63 a.events.AddListener("agent", a.machine, handler) 64 65 go a.systemd.Publish(a.events, a.stop) 66 go a.Heartbeat(a.ttl, a.stop) 67 68 // Block until we receive a stop signal 69 <-a.stop 70 71 a.events.RemoveListener("agent", a.machine) 72 } 73 74 // Stop all async processes the Agent is running 75 func (a *Agent) Stop() { 76 log.V(1).Info("Stopping Agent") 77 close(a.stop) 78 } 79 80 // Clear any presence data from the Registry 81 func (a *Agent) Purge() { 82 log.V(1).Info("Removing Agent from Registry") 83 bootId := a.machine.State().BootId 84 err := a.registry.RemoveMachineState(bootId) 85 if err != nil { 86 log.Errorf("Failed to remove Machine %s from Registry: %s", bootId, err.Error()) 87 } 88 89 for _, j := range a.registry.GetAllJobsByMachine(bootId) { 90 log.V(1).Infof("Clearing JobState(%s) from Registry", j.Name) 91 a.registry.RemoveJobState(j.Name) 92 93 offer := job.NewOfferFromJob(j) 94 log.V(2).Infof("Publishing JobOffer(%s)", offer.Job.Name) 95 a.registry.CreateJobOffer(offer) 96 log.Infof("Published JobOffer(%s)", offer.Job.Name) 97 } 98 } 99 100 // Periodically report to the Registry at an interval equal to 101 // half of the provided ttl. Stop reporting when the provided 102 // channel is closed. 103 func (a *Agent) Heartbeat(ttl time.Duration, stop chan bool) { 104 105 // Explicitly heartbeat immediately to push state to the 106 // Registry as quickly as possible 107 a.machine.RefreshState() 108 a.registry.SetMachineState(a.machine.State(), a.ttl) 109 110 interval := ttl / refreshInterval 111 for true { 112 select { 113 case <-stop: 114 log.V(2).Info("MachineHeartbeat exiting due to stop signal") 115 return 116 case <-time.Tick(interval): 117 log.V(2).Info("MachineHeartbeat tick") 118 a.machine.RefreshState() 119 err := a.registry.SetMachineState(a.machine.State(), a.ttl) 120 if err != nil { 121 log.Errorf("MachineHeartbeat failed: %v", err) 122 } 123 } 124 } 125 } 126 127 // Instruct the Agent to start the provided Job 128 func (a *Agent) StartJob(j *job.Job) { 129 a.state.TrackJobConflicts(j.Name, j.Payload.Conflicts()) 130 131 log.Infof("Starting Job(%s)", j.Name) 132 a.systemd.StartJob(j) 133 } 134 135 // Inform the Registry that a Job must be rescheduled 136 func (a *Agent) RescheduleJob(j *job.Job) { 137 log.V(2).Infof("Stopping Job(%s)", j.Name) 138 a.registry.UnscheduleJob(j.Name) 139 140 offer := job.NewOfferFromJob(*j) 141 log.V(2).Infof("Publishing JobOffer(%s)", offer.Job.Name) 142 a.registry.CreateJobOffer(offer) 143 log.Infof("Published JobOffer(%s)", offer.Job.Name) 144 } 145 146 // Instruct the Agent to stop the provided Job and 147 // all of its peers 148 func (a *Agent) StopJob(jobName string) { 149 log.Infof("Stopping Job(%s)", jobName) 150 a.systemd.StopJob(jobName) 151 a.ReportJobState(jobName, nil) 152 153 a.state.Lock() 154 reversePeers := a.state.GetJobsByPeer(jobName) 155 a.state.DropPeersJob(jobName) 156 a.state.DropJobConflicts(jobName) 157 a.state.Unlock() 158 159 for _, peer := range reversePeers { 160 log.Infof("Stopping Peer(%s) of Job(%s)", peer, jobName) 161 a.registry.StopJob(peer) 162 } 163 } 164 165 // Persist the state of the given Job into the Registry 166 func (a *Agent) ReportJobState(jobName string, jobState *job.JobState) { 167 if jobState == nil { 168 err := a.registry.RemoveJobState(jobName) 169 if err != nil { 170 log.V(1).Infof("Failed to remove JobState from Registry: %s", jobName, err.Error()) 171 } 172 } else { 173 a.registry.SaveJobState(jobName, jobState) 174 } 175 } 176 177 // Submit all possible bids for unresolved offers 178 func (a *Agent) BidForPossibleJobs() { 179 a.state.Lock() 180 offers := a.state.GetOffersWithoutBids() 181 a.state.Unlock() 182 183 log.V(2).Infof("Checking %d unbade offers", len(offers)) 184 for i, _ := range offers { 185 offer := offers[i] 186 log.V(2).Infof("Checking ability to run Job(%s)", offer.Job.Name) 187 if a.AbleToRun(&offer.Job) { 188 log.V(2).Infof("Able to run Job(%s), submitting bid", offer.Job.Name) 189 a.Bid(offer.Job.Name) 190 } else { 191 log.V(2).Infof("Still unable to run Job(%s)", offer.Job.Name) 192 } 193 } 194 } 195 196 // Submit a bid for the given Job 197 func (a *Agent) Bid(jobName string) { 198 log.Infof("Submitting JobBid for Job(%s)", jobName) 199 200 jb := job.NewBid(jobName, a.machine.State().BootId) 201 a.registry.SubmitJobBid(jb) 202 203 a.state.Lock() 204 defer a.state.Unlock() 205 206 a.state.TrackBid(jb.JobName) 207 } 208 209 // Instruct the Agent that an offer has been created and must 210 // be tracked until it is resolved 211 func (a *Agent) TrackOffer(jo job.JobOffer) { 212 a.state.Lock() 213 defer a.state.Unlock() 214 215 log.V(2).Infof("Tracking JobOffer(%s)", jo.Job.Name) 216 a.state.TrackOffer(jo) 217 218 peers := jo.Job.Payload.Peers() 219 log.V(2).Infof("Tracking peers of JobOffer(%s): %v", jo.Job.Name, peers) 220 a.state.TrackJobPeers(jo.Job.Name, jo.Job.Payload.Peers()) 221 } 222 223 // Instruct the Agent that the given offer has been resolved 224 // and may be ignored in future conflict calculations 225 func (a *Agent) OfferResolved(jobName string) { 226 a.state.Lock() 227 defer a.state.Unlock() 228 229 log.V(2).Infof("Dropping JobOffer(%s)", jobName) 230 a.state.DropOffer(jobName) 231 232 a.state.DropBid(jobName) 233 } 234 235 // Pull a Job and its payload from the Registry 236 func (a *Agent) FetchJob(jobName string) *job.Job { 237 log.V(1).Infof("Fetching Job(%s) from Registry", jobName) 238 j := a.registry.GetJob(jobName) 239 if j == nil { 240 log.V(1).Infof("Job not found in Registry") 241 } 242 return j 243 } 244 245 // Submit all possible bids for known peers of the provided job 246 func (a *Agent) BidForPossiblePeers(jobName string) { 247 a.state.Lock() 248 peers := a.state.GetJobsByPeer(jobName) 249 a.state.Unlock() 250 251 for _, peer := range peers { 252 log.V(1).Infof("Found unresolved offer for Peer(%s) of Job(%s)", peer, jobName) 253 254 peerJob := a.FetchJob(peer) 255 if peerJob != nil && a.AbleToRun(peerJob) { 256 log.Infof("Submitting bid for Peer(%s) of Job(%s)", peer, jobName) 257 a.Bid(peer) 258 } else { 259 log.V(1).Infof("Unable to bid for Peer(%s) of Job(%s)", peer, jobName) 260 } 261 } 262 } 263 264 // Determine if the Agent can run the provided Job 265 func (a *Agent) AbleToRun(j *job.Job) bool { 266 requirements := j.Requirements() 267 if len(requirements) == 0 { 268 log.V(1).Infof("Job(%s) has no requirements", j.Name) 269 return true 270 } 271 272 var reqString string 273 for key, slice := range requirements { 274 reqString += fmt.Sprintf("%s = [", key) 275 for _, val := range slice { 276 reqString += fmt.Sprintf("%s, ", val) 277 } 278 reqString += fmt.Sprint("] ") 279 } 280 281 log.V(1).Infof("Job(%s) has requirements: %s", j.Name, reqString) 282 283 metadata := extractMachineMetadata(requirements) 284 log.V(1).Infof("Job(%s) requires machine metadata: %v", j.Name, metadata) 285 if !a.machine.HasMetadata(metadata) { 286 log.V(1).Infof("Unable to run Job(%s), local Machine metadata insufficient", j.Name) 287 return false 288 } 289 290 bootID, ok := requirements["ConditionMachineBootID"] 291 if ok && len(bootID) > 0 && a.machine.State().BootId == bootID[0] { 292 log.V(1).Infof("Agent does not pass MachineBootID condition for Job(%s)", j.Name) 293 return false 294 } 295 296 peers := j.Payload.Peers() 297 if len(peers) > 0 { 298 log.V(1).Infof("Asserting required Peers %v of Job(%s) are scheduled locally", peers, j.Name) 299 for _, peer := range peers { 300 if !a.peerScheduledHere(j.Name, peer) { 301 log.V(1).Infof("Required Peer(%s) of Job(%s) is not scheduled locally", peer, j.Name) 302 return false 303 } 304 } 305 } else { 306 log.V(2).Infof("Job(%s) has no peers to worry about", j.Name) 307 } 308 309 if conflicted, conflictedJobName := a.state.HasConflict(j.Name, j.Payload.Conflicts()); conflicted { 310 log.V(1).Infof("Job(%s) has conflict with Job(%s)", j.Name, conflictedJobName) 311 return false 312 } 313 314 return true 315 } 316 317 // Return all machine-related metadata from a job requirements map 318 func extractMachineMetadata(requirements map[string][]string) map[string][]string { 319 metadata := make(map[string][]string) 320 321 for key, values := range requirements { 322 if !strings.HasPrefix(key, "MachineMetadata") { 323 continue 324 } 325 326 // Strip off leading 'MachineMetadata' 327 key = key[15:] 328 329 if len(values) == 0 { 330 log.V(2).Infof("Machine metadata requirement %s provided no values, ignoring.", key) 331 continue 332 } 333 334 metadata[key] = values 335 } 336 337 return metadata 338 } 339 340 // Determine if all necessary peers of a Job are scheduled to this Agent 341 func (a *Agent) peerScheduledHere(jobName, peerName string) bool { 342 log.V(1).Infof("Looking for target of Peer(%s)", peerName) 343 344 //FIXME: ideally the machine would use its own knowledge rather than calling GetJobTarget 345 if tgt := a.registry.GetJobTarget(peerName); tgt == nil || tgt.BootId != a.machine.State().BootId { 346 log.V(1).Infof("Peer(%s) of Job(%s) not scheduled here", peerName, jobName) 347 return false 348 } 349 350 log.V(1).Infof("Peer(%s) of Job(%s) scheduled here", peerName, jobName) 351 return true 352 } 353 354 func (a *Agent) UnresolvedJobOffers() []job.JobOffer { 355 return a.registry.UnresolvedJobOffers() 356 }