github.com/Cloud-Foundations/Dominator@v0.3.4/lib/slavedriver/impl.go (about) 1 package slavedriver 2 3 import ( 4 "container/list" 5 "fmt" 6 "io" 7 "os" 8 "time" 9 10 "github.com/Cloud-Foundations/Dominator/lib/backoffdelay" 11 "github.com/Cloud-Foundations/Dominator/lib/format" 12 "github.com/Cloud-Foundations/Dominator/lib/fsutil" 13 "github.com/Cloud-Foundations/Dominator/lib/json" 14 "github.com/Cloud-Foundations/Dominator/lib/log" 15 "github.com/Cloud-Foundations/Dominator/lib/srpc" 16 ) 17 18 type jsonDatabase struct { 19 filename string 20 } 21 22 func dialWithRetry(network, address string, 23 timeout time.Duration) (*srpc.Client, error) { 24 stopTime := time.Now().Add(timeout) 25 sleeper := backoffdelay.NewExponential(100*time.Millisecond, time.Second, 1) 26 for ; time.Until(stopTime) >= 0; sleeper.Sleep() { 27 client, err := srpc.DialHTTP(network, address, time.Second) 28 if err != nil { 29 continue 30 } 31 if err := client.SetKeepAlivePeriod(time.Second * 30); err != nil { 32 client.Close() 33 return nil, err 34 } 35 return client, nil 36 37 } 38 return nil, fmt.Errorf("timed out connecting to: %s", address) 39 } 40 41 func listSlaves(slaves map[*Slave]struct{}) []SlaveInfo { 42 list := make([]SlaveInfo, 0, len(slaves)) 43 for slave := range slaves { 44 list = append(list, slave.info) 45 } 46 return list 47 } 48 49 func newSlaveDriver(options SlaveDriverOptions, slaveTrader SlaveTrader, 50 clientDialer clientDialerFunc, databaseDriver databaseLoadSaver, 51 logger log.DebugLogger) (*SlaveDriver, error) { 52 if options.MinimumIdleSlaves < 1 { 53 options.MinimumIdleSlaves = 1 54 } 55 if options.MaximumIdleSlaves < 1 { 56 options.MaximumIdleSlaves = 1 57 } 58 if options.MaximumIdleSlaves < options.MinimumIdleSlaves { 59 options.MaximumIdleSlaves = options.MinimumIdleSlaves 60 } 61 destroySlaveChannel := make(chan *Slave, 1) 62 getSlaveChannel := make(chan requestSlaveMessage) 63 getSlavesChannel := make(chan chan<- slaveRoll) 64 releaseSlaveChannel := make(chan *Slave, 1) 65 replaceIdleChannel := make(chan bool) 66 publicDriver := &SlaveDriver{ 67 options: options, 68 destroySlaveChannel: destroySlaveChannel, 69 getSlaveChannel: getSlaveChannel, 70 getSlavesChannel: getSlavesChannel, 71 logger: logger, 72 releaseSlaveChannel: releaseSlaveChannel, 73 replaceIdleChannel: replaceIdleChannel, 74 } 75 driver := &slaveDriver{ 76 options: options, 77 busySlaves: make(map[*Slave]struct{}), 78 clientDialer: clientDialer, 79 destroySlaveChannel: destroySlaveChannel, 80 databaseDriver: databaseDriver, 81 getSlaveChannel: getSlaveChannel, 82 getSlavesChannel: getSlavesChannel, 83 getterList: list.New(), 84 logger: logger, 85 pingResponseChannel: make(chan pingResponseMessage, 1), 86 publicDriver: publicDriver, 87 slaveTrader: slaveTrader, 88 releaseSlaveChannel: releaseSlaveChannel, 89 replaceIdleChannel: replaceIdleChannel, 90 } 91 if err := driver.loadSlaves(); err != nil { 92 driver.slaveTrader.Close() 93 return nil, err 94 } 95 go driver.watchRoll() 96 return publicDriver, nil 97 } 98 99 func (db *jsonDatabase) load() (*slaveRoll, error) { 100 var slaves slaveRoll 101 err := json.ReadFromFile(db.filename, &slaves) 102 if err != nil { 103 if os.IsNotExist(err) { 104 return nil, nil 105 } 106 return nil, err 107 } 108 return &slaves, nil 109 } 110 111 func (db *jsonDatabase) save(slaves slaveRoll) error { 112 return json.WriteToFile(db.filename, fsutil.PublicFilePerms, " ", slaves) 113 } 114 115 func (slave *Slave) acknowledge(logger log.DebugLogger) error { 116 if slave.acknowledgeChannel == nil { 117 return nil 118 } 119 errorChannel := make(chan error, 1) 120 slave.acknowledgeChannel <- errorChannel 121 slave.acknowledgeChannel = nil 122 timer := time.NewTimer(15 * time.Second) 123 select { 124 case err := <-errorChannel: 125 if err != nil { 126 return err 127 } else { 128 logger.Debugf(0, "acknowledged slave: %s\n", slave) 129 return nil 130 } 131 case <-timer.C: 132 return fmt.Errorf("timed out") 133 } 134 } 135 136 func (slave *Slave) getClient() *srpc.Client { 137 return slave.client 138 } 139 140 func (slave *Slave) ping(pingResponseChannel chan<- pingResponseMessage) { 141 errorChannel := make(chan error, 1) 142 timer := time.NewTimer(5 * time.Second) 143 go func() { 144 errorChannel <- slave.client.Ping() 145 slave.driver.logger.Debugf(1, "ping(%s) goroutine returning\n", slave) 146 }() 147 select { 148 case err := <-errorChannel: 149 pingResponseChannel <- pingResponseMessage{ 150 error: err, 151 slave: slave, 152 } 153 case <-timer.C: 154 pingResponseChannel <- pingResponseMessage{ 155 error: fmt.Errorf("timed out"), 156 slave: slave, 157 } 158 } 159 } 160 161 func (driver *SlaveDriver) getSlave(timeout time.Duration) (*Slave, error) { 162 driver.logger.Debugln(0, "getSlave() starting") 163 if timeout < 0 { 164 timeout = time.Hour 165 } 166 slaveChannel := make(chan *Slave) 167 driver.getSlaveChannel <- requestSlaveMessage{ 168 slaveChannel: slaveChannel, 169 timeout: time.Now().Add(timeout), 170 } 171 if slave := <-slaveChannel; slave == nil { 172 return nil, fmt.Errorf("timed out getting slave") 173 } else { 174 return slave, nil 175 } 176 } 177 178 func (driver *slaveDriver) createSlave(responseChannel chan<- *Slave) { 179 driver.logger.Debugln(0, "creating slave") 180 sleeper := backoffdelay.NewExponential(time.Second, time.Minute, 1) 181 for ; ; sleeper.Sleep() { 182 slaveInfo, acknowledgeChannel, err := driver.createSlaveMachine() 183 if err != nil { 184 driver.logger.Println(err) 185 continue 186 } 187 slave := &Slave{ 188 acknowledgeChannel: acknowledgeChannel, 189 clientAddress: fmt.Sprintf("%s:%d", slaveInfo.IpAddress, 190 driver.options.PortNumber), 191 info: slaveInfo, 192 driver: driver.publicDriver, 193 timeToPing: time.Now().Add(time.Minute), 194 } 195 slave.client, err = driver.clientDialer("tcp", slave.clientAddress, 196 time.Minute) 197 if err != nil { 198 e := driver.slaveTrader.DestroySlave(slaveInfo.Identifier) 199 if e != nil { 200 driver.logger.Printf("error destroying: %s: %s\n", 201 slaveInfo.Identifier, e) 202 } 203 driver.logger.Printf("error dialing: %s: %s\n", 204 slave.clientAddress, err) 205 continue 206 } 207 driver.logger.Printf("created slave: %s\n", slaveInfo.Identifier) 208 responseChannel <- slave 209 return 210 } 211 } 212 213 func (driver *slaveDriver) createSlaveMachine() (SlaveInfo, chan<- chan<- error, 214 error) { 215 if creator, ok := driver.slaveTrader.(SlaveTraderAcknowledger); ok { 216 acknowledgeChannel := make(chan chan<- error, 1) 217 slaveInfo, err := creator.CreateSlaveWithAcknowledger( 218 acknowledgeChannel) 219 if err != nil { 220 close(acknowledgeChannel) 221 return SlaveInfo{}, nil, err 222 } 223 return slaveInfo, acknowledgeChannel, err 224 } 225 slaveInfo, err := driver.slaveTrader.CreateSlave() 226 return slaveInfo, nil, err 227 } 228 229 func (driver *slaveDriver) destroySlave(slave *Slave, 230 responseChannel chan<- *Slave) { 231 driver.logger.Printf("destroying slave: %s\n", slave.info.Identifier) 232 startTime := time.Now() 233 err := driver.slaveTrader.DestroySlave(slave.info.Identifier) 234 if err != nil { 235 driver.logger.Printf("error destroying: %s: %s\n", 236 slave.info.Identifier, err) 237 responseChannel <- nil 238 return 239 } 240 if duration := time.Since(startTime); duration > 5*time.Second { 241 driver.logger.Printf("destroyed slave: %s in %s\n", 242 slave.info.Identifier, format.Duration(duration)) 243 } 244 responseChannel <- slave 245 } 246 247 func (driver *slaveDriver) getSlaves() slaveRoll { 248 return slaveRoll{ 249 BusySlaves: listSlaves(driver.busySlaves), 250 IdleSlaves: listSlaves(driver.idleSlaves), 251 Zombies: listSlaves(driver.zombies), 252 } 253 } 254 255 func (driver *slaveDriver) loadSlaves() error { 256 slavesFromDB, err := driver.databaseDriver.load() 257 if err != nil { 258 return err 259 } 260 if slavesFromDB == nil { 261 driver.idleSlaves = make(map[*Slave]struct{}) 262 driver.zombies = make(map[*Slave]struct{}) 263 return nil 264 } 265 slavesFromDB.BusySlaves = append(slavesFromDB.BusySlaves, 266 slavesFromDB.Zombies...) 267 driver.idleSlaves = make(map[*Slave]struct{}, len(slavesFromDB.IdleSlaves)) 268 driver.zombies = make(map[*Slave]struct{}, len(slavesFromDB.BusySlaves)) 269 for _, slaveInfo := range slavesFromDB.BusySlaves { 270 driver.zombies[&Slave{ 271 driver: driver.publicDriver, 272 info: slaveInfo, 273 }] = struct{}{} 274 } 275 for _, slaveInfo := range slavesFromDB.IdleSlaves { 276 slave := &Slave{ 277 clientAddress: fmt.Sprintf("%s:%d", slaveInfo.IpAddress, 278 driver.options.PortNumber), 279 info: slaveInfo, 280 driver: driver.publicDriver, 281 } 282 slave.client, err = driver.clientDialer("tcp", slave.clientAddress, 283 time.Minute) 284 if err != nil { 285 driver.logger.Printf("error dialing: %s: %s\n", slave.clientAddress, 286 err) 287 driver.zombies[slave] = struct{}{} 288 } else { 289 slave.timeToPing = time.Now().Add(time.Minute) 290 driver.idleSlaves[slave] = struct{}{} 291 } 292 } 293 return nil 294 } 295 296 // rollCall manages all the internal state. It should be called from a forever 297 // goroutine. 298 func (driver *slaveDriver) rollCall() { 299 driver.logger.Debugf(0, "rollCall(): %d idle, %d getters\n", 300 len(driver.idleSlaves), driver.getterList.Len()) 301 // First: if there is an idle slave, dispatch to a getter. 302 if len(driver.idleSlaves) > 0 && driver.getterList.Len() > 0 { 303 entry := driver.getterList.Front() 304 request := entry.Value.(requestSlaveMessage) 305 driver.getterList.Remove(entry) 306 if time.Since(request.timeout) > 0 { 307 request.slaveChannel <- nil // Getter wanted to give up by now. 308 close(request.slaveChannel) 309 return 310 } 311 for slave := range driver.idleSlaves { 312 if time.Since(slave.timeToPing) >= 0 || slave.pinging { 313 continue 314 } 315 request.slaveChannel <- slave // Consumed by getter. 316 close(request.slaveChannel) 317 delete(driver.idleSlaves, slave) 318 driver.busySlaves[slave] = struct{}{} 319 driver.writeState = true 320 driver.logger.Debugf(0, "sent slave: %s to getter\n", slave) 321 return 322 } 323 } 324 // Clean up expired getters and set timeout on when to next check. 325 wakeTimeout := time.Hour 326 var nextEntry *list.Element 327 for entry := driver.getterList.Front(); entry != nil; entry = nextEntry { 328 nextEntry = entry.Next() 329 request := entry.Value.(requestSlaveMessage) 330 if timeout := time.Until(request.timeout); timeout <= 0 { 331 request.slaveChannel <- nil // Getter wanted to give up by now. 332 close(request.slaveChannel) 333 driver.getterList.Remove(entry) 334 } else if timeout < wakeTimeout { 335 wakeTimeout = timeout 336 } 337 } 338 if driver.getterList.Len() > 0 || 339 uint(len(driver.idleSlaves)) < driver.options.MinimumIdleSlaves { 340 if driver.createdSlaveChannel == nil { 341 ch := make(chan *Slave, 1) 342 driver.createdSlaveChannel = ch 343 go driver.createSlave(ch) 344 } 345 } 346 if uint(len(driver.idleSlaves)) > driver.options.MaximumIdleSlaves && 347 driver.getterList.Len() < 1 { 348 for slave := range driver.idleSlaves { 349 if uint(len(driver.idleSlaves)) <= 350 driver.options.MaximumIdleSlaves { 351 break 352 } 353 delete(driver.idleSlaves, slave) 354 driver.zombies[slave] = struct{}{} 355 driver.writeState = true 356 } 357 } 358 for slave := range driver.zombies { // Close any connections. 359 if slave.client != nil { 360 if err := slave.client.Close(); err != nil { 361 driver.logger.Printf("error closing Client for slave: %s: %s\n", 362 slave, err) 363 } 364 slave.client = nil 365 } 366 } 367 for slave := range driver.zombies { // Destroy one zombie at a time. 368 if driver.destroyedSlaveChannel == nil { 369 ch := make(chan *Slave, 1) 370 driver.destroyedSlaveChannel = ch 371 go driver.destroySlave(slave, ch) 372 } 373 break 374 } 375 if driver.writeState { 376 if err := driver.databaseDriver.save(driver.getSlaves()); err != nil { 377 driver.logger.Println(err) 378 } else { 379 driver.writeState = false 380 } 381 } 382 for slave := range driver.idleSlaves { 383 if slave.pinging { 384 continue 385 } 386 if timeToPing := slave.timeToPing; time.Since(timeToPing) >= 0 { 387 slave.pinging = true 388 go slave.ping(driver.pingResponseChannel) 389 } else if timeout := time.Until(timeToPing); timeout < wakeTimeout { 390 wakeTimeout = timeout 391 } 392 } 393 if wakeTimeout < 0 { 394 wakeTimeout = 0 395 } 396 wakeTimer := time.NewTimer(wakeTimeout) 397 select { 398 case slave := <-driver.createdSlaveChannel: 399 driver.createdSlaveChannel = nil 400 if err := slave.acknowledge(driver.logger); err != nil { 401 driver.logger.Printf("error acknowledging slave: %s: %s\n", 402 slave, err) 403 break 404 } 405 driver.idleSlaves[slave] = struct{}{} 406 // Write state now to reduce chance of forgetting about this slave. 407 if err := driver.databaseDriver.save(driver.getSlaves()); err != nil { 408 driver.logger.Println(err) 409 driver.writeState = true 410 } else { 411 driver.writeState = false 412 } 413 return // Return now so that new slave can be sent to a getter quickly. 414 case slave := <-driver.destroySlaveChannel: 415 if _, ok := driver.idleSlaves[slave]; ok { 416 panic("destroying idle slave") 417 } 418 if _, ok := driver.zombies[slave]; ok { 419 panic("destroying zombie") 420 } 421 if _, ok := driver.busySlaves[slave]; !ok { 422 panic("destroying unknown slave") 423 } 424 delete(driver.busySlaves, slave) 425 driver.zombies[slave] = struct{}{} 426 driver.writeState = true 427 case slave := <-driver.destroyedSlaveChannel: 428 driver.destroyedSlaveChannel = nil 429 if slave != nil { 430 delete(driver.zombies, slave) 431 driver.writeState = true 432 } 433 case slaveChannel := <-driver.getSlaveChannel: 434 driver.getterList.PushBack(slaveChannel) 435 case slavesChannel := <-driver.getSlavesChannel: 436 slavesChannel <- driver.getSlaves() 437 case pingResponse := <-driver.pingResponseChannel: 438 slave := pingResponse.slave 439 slave.pinging = false 440 if err := pingResponse.error; err == nil { 441 slave.timeToPing = time.Now().Add(time.Minute) 442 driver.logger.Debugf(0, "ping: %s succeeded\n", slave) 443 } else { 444 driver.logger.Printf("error pinging: %s: %s\n", slave, err) 445 delete(driver.idleSlaves, slave) 446 driver.zombies[slave] = struct{}{} 447 driver.writeState = true 448 } 449 case slave := <-driver.releaseSlaveChannel: 450 if _, ok := driver.idleSlaves[slave]; ok { 451 panic("releasing idle slave") 452 } 453 if _, ok := driver.zombies[slave]; ok { 454 panic("releasing zombie") 455 } 456 if _, ok := driver.busySlaves[slave]; !ok { 457 panic("releasing unknown slave") 458 } 459 delete(driver.busySlaves, slave) 460 driver.idleSlaves[slave] = struct{}{} 461 driver.writeState = true 462 slave.timeToPing = time.Now().Add(100 * time.Millisecond) 463 case createIfNeeded := <-driver.replaceIdleChannel: 464 for slave := range driver.idleSlaves { 465 delete(driver.idleSlaves, slave) 466 driver.zombies[slave] = struct{}{} 467 driver.writeState = true 468 } 469 if createIfNeeded && driver.createdSlaveChannel == nil { 470 ch := make(chan *Slave, 1) 471 driver.createdSlaveChannel = ch 472 go driver.createSlave(ch) 473 } 474 case <-wakeTimer.C: 475 } 476 wakeTimer.Stop() 477 select { 478 case <-wakeTimer.C: 479 default: 480 } 481 } 482 483 func (driver *slaveDriver) watchRoll() { 484 for { 485 driver.rollCall() 486 } 487 } 488 489 func (driver *SlaveDriver) writeHtml(writer io.Writer) { 490 slavesChannel := make(chan slaveRoll) 491 driver.getSlavesChannel <- slavesChannel 492 slaves := <-slavesChannel 493 if len(slaves.BusySlaves) < 1 && len(slaves.IdleSlaves) < 1 && 494 len(slaves.Zombies) < 1 { 495 fmt.Fprintf(writer, "No slaves for %s<br>\n", driver.options.Purpose) 496 return 497 } 498 fmt.Fprintf(writer, "Slaves for %s:<br>\n", driver.options.Purpose) 499 for _, slave := range slaves.BusySlaves { 500 fmt.Fprintf(writer, 501 " <a href=\"http://%s:%d/\">%s</a> (busy)<br>\n", 502 slave.IpAddress, driver.options.PortNumber, slave) 503 } 504 for _, slave := range slaves.IdleSlaves { 505 fmt.Fprintf(writer, 506 " <a href=\"http://%s:%d/\">%s</a> (idle)<br>\n", 507 slave.IpAddress, driver.options.PortNumber, slave) 508 } 509 for _, slave := range slaves.Zombies { 510 fmt.Fprintf(writer, 511 " <a href=\"http://%s:%d/\">%s</a> (zombie)<br>\n", 512 slave.IpAddress, driver.options.PortNumber, slave) 513 } 514 }