github.com/axw/juju@v0.0.0-20161005053422-4bd6544d08d4/state/workers/restart.go (about) 1 // Copyright 2016 Canonical Ltd. 2 // Licensed under the AGPLv3, see LICENCE file for details. 3 4 package workers 5 6 import ( 7 "sync" 8 "time" 9 10 "github.com/juju/errors" 11 "github.com/juju/loggo" 12 "github.com/juju/utils/clock" 13 14 "github.com/juju/juju/core/lease" 15 "github.com/juju/juju/worker" 16 "github.com/juju/juju/worker/catacomb" 17 ) 18 19 // RestartConfig holds a RestartWorkers' dependencies and configuration. 20 type RestartConfig struct { 21 Factory Factory 22 Logger loggo.Logger 23 Clock clock.Clock 24 Delay time.Duration 25 } 26 27 // Validate returns an error if config cannot drive a RestartWorkers. 28 func (config RestartConfig) Validate() error { 29 if config.Factory == nil { 30 return errors.NotValidf("nil Factory") 31 } 32 if config.Logger == (loggo.Logger{}) { 33 return errors.NotValidf("uninitialized Logger") 34 } 35 if config.Clock == nil { 36 return errors.NotValidf("nil Clock") 37 } 38 if config.Delay <= 0 { 39 return errors.NotValidf("non-positive Delay") 40 } 41 return nil 42 } 43 44 // NewRestartWorkers returns a worker that will live until Kill()ed, 45 // giving access to a set of sub-workers needed by the state package. 46 // 47 // These workers may die of their own accord at any time, and will be 48 // replaced after the configured delay; all active workers will be 49 // stopped before Wait returns. 
50 func NewRestartWorkers(config RestartConfig) (*RestartWorkers, error) { 51 if err := config.Validate(); err != nil { 52 return nil, errors.Trace(err) 53 } 54 55 dw, err := NewDumbWorkers(DumbConfig{ 56 Factory: config.Factory, 57 Logger: config.Logger, 58 }) 59 if err != nil { 60 return nil, errors.Trace(err) 61 } 62 63 rw := &RestartWorkers{ 64 config: config, 65 workers: dw, 66 } 67 err = catacomb.Invoke(catacomb.Plan{ 68 Site: &rw.catacomb, 69 Work: rw.run, 70 Init: []worker.Worker{dw}, 71 }) 72 if err != nil { 73 return nil, errors.Trace(err) 74 } 75 return rw, nil 76 } 77 78 // RestartWorkers wraps a DumbWorkers and restarts/replaces workers as 79 // they fail. 80 type RestartWorkers struct { 81 config RestartConfig 82 catacomb catacomb.Catacomb 83 84 // mu protects workers. 85 mu sync.Mutex 86 workers *DumbWorkers 87 88 // wg tracks maintainer goroutines. 89 wg sync.WaitGroup 90 } 91 92 // TxnLogWatcher is part of the Workers interface. 93 func (rw *RestartWorkers) TxnLogWatcher() TxnLogWatcher { 94 rw.mu.Lock() 95 defer rw.mu.Unlock() 96 return rw.workers.txnLogWorker 97 } 98 99 // PresenceWatcher is part of the Workers interface. 100 func (rw *RestartWorkers) PresenceWatcher() PresenceWatcher { 101 rw.mu.Lock() 102 defer rw.mu.Unlock() 103 return rw.workers.presenceWorker 104 } 105 106 // LeadershipManager is part of the Workers interface. 107 func (rw *RestartWorkers) LeadershipManager() LeaseManager { 108 return DynamicLeaseManager{&rw.mu, &rw.workers.leadershipWorker} 109 } 110 111 // SingularManager is part of the Workers interface. 112 func (rw *RestartWorkers) SingularManager() LeaseManager { 113 return DynamicLeaseManager{&rw.mu, &rw.workers.singularWorker} 114 } 115 116 // Kill is part of the worker.Worker interface. 117 func (rw *RestartWorkers) Kill() { 118 rw.catacomb.Kill(nil) 119 } 120 121 // Wait is part of the worker.Worker interface. 
func (rw *RestartWorkers) Wait() error {
	return rw.catacomb.Wait()
}

func (rw *RestartWorkers) run() error {

	// It's safe to read rw.workers' fields without the mutex here:
	// the maintainer goroutines that write them have not started yet.
	replacers := []replacer{
		&txnLogWorkerReplacer{
			start:   rw.config.Factory.NewTxnLogWorker,
			current: rw.workers.txnLogWorker,
			target:  &rw.workers.txnLogWorker,
		},
		&presenceWorkerReplacer{
			start:   rw.config.Factory.NewPresenceWorker,
			current: rw.workers.presenceWorker,
			target:  &rw.workers.presenceWorker,
		},
		&leaseWorkerReplacer{
			start:   rw.config.Factory.NewLeadershipWorker,
			current: rw.workers.leadershipWorker,
			target:  &rw.workers.leadershipWorker,
		},
		&leaseWorkerReplacer{
			start:   rw.config.Factory.NewSingularWorker,
			current: rw.workers.singularWorker,
			target:  &rw.workers.singularWorker,
		},
	}

	// begin critical section: cannot touch workers without mutex
	for _, replacer := range replacers {
		rw.wg.Add(1)
		go rw.maintain(replacer)
	}
	<-rw.catacomb.Dying()
	// Once every maintainer has returned, no further writes to
	// rw.workers can happen, so it's safe to stop them all.
	rw.wg.Wait()
	// end critical section: potential workers writes all finished

	return worker.Stop(rw.workers)
}

// maintain drives a replacer. See commentary in func, and docs on
// the replacer interface.
func (rw *RestartWorkers) maintain(replacer replacer) {

	// Signal to the RestartWorkers that we've stopped trying to
	// maintain a worker once we return from this func.
	defer rw.wg.Done()

	// First, wait until the worker actually needs replacement.
	select {
	case <-rw.catacomb.Dying():
		return
	case <-replacer.needed():
	}

	// Then try to create a replacement until we succeed...
	for {
		select {
		case <-rw.catacomb.Dying():
			return
		case <-rw.config.Clock.After(rw.config.Delay):
		}
		if replacer.prepare() {
			break
		}
	}

	// ...at which point it's OK to take the lock for long enough to
	// set the replacement worker.
	rw.mu.Lock()
	replacer.replace()
	rw.mu.Unlock()

	// Finally, signal to the RestartWorkers that we'll maintain the
	// new worker, effectively undoing the deferred Done above...
	// (The Add executes before this call's deferred Done, so the
	// WaitGroup counter never drops to zero while a worker is still
	// being maintained.)
	rw.wg.Add(1)

	// ...and start again from the top.
	go rw.maintain(replacer)
}

// replacer exists to satisfy the very narrow constraints of the
// RestartWorkers.maintain method. The methods will be called
// in the order defined, as annotated:
type replacer interface {

	// needed returns a channel that will be closed when the
	// original worker has failed and needs to be restarted;
	// once this has happened...
	needed() <-chan struct{}

	// ...prepare will then be called repeatedly until it returns
	// true, indicating that it's created a replacement worker; at
	// which point...
	prepare() bool

	// ...the workers mutex will be acquired, and it's safe for the
	// replacer to write the new worker to the target pointer (and
	// update its own internal references so that the next call to
	// needed() returns a channel tied to the new worker's
	// lifetime).
	replace()

	// The actual *implementation* of the various kinds of replacer
	// should not vary -- they'd be great candidates for codegen or
	// even generics(!).
}

// txnLogWorkerReplacer implements replacer. Apart from the types, it
// should be identical to presenceWorkerReplacer and leaseWorkerReplacer.
233 type txnLogWorkerReplacer struct { 234 start func() (TxnLogWorker, error) 235 current TxnLogWorker 236 next TxnLogWorker 237 target *TxnLogWorker 238 } 239 240 func (r *txnLogWorkerReplacer) needed() <-chan struct{} { 241 return worker.Dead(r.current) 242 } 243 244 func (r *txnLogWorkerReplacer) prepare() bool { 245 var err error 246 r.next, err = r.start() 247 return err == nil 248 } 249 250 func (r *txnLogWorkerReplacer) replace() { 251 *r.target = r.next 252 r.current = r.next 253 r.next = nil 254 } 255 256 // presenceWorkerReplacer implements replacer. Apart from the types, it 257 // should be identical to txnLogWorkerReplacer and leaseWorkerReplacer. 258 type presenceWorkerReplacer struct { 259 start func() (PresenceWorker, error) 260 current PresenceWorker 261 next PresenceWorker 262 target *PresenceWorker 263 } 264 265 func (r *presenceWorkerReplacer) needed() <-chan struct{} { 266 return worker.Dead(r.current) 267 } 268 269 func (r *presenceWorkerReplacer) prepare() bool { 270 var err error 271 r.next, err = r.start() 272 return err == nil 273 } 274 275 func (r *presenceWorkerReplacer) replace() { 276 *r.target = r.next 277 r.current = r.next 278 r.next = nil 279 } 280 281 // leaseWorkerReplacer implements replacer. Apart from the types, it 282 // should be identical to presenceWorkerReplacer and txnLogWorkerReplacer. 
283 type leaseWorkerReplacer struct { 284 start func() (LeaseWorker, error) 285 current LeaseWorker 286 next LeaseWorker 287 target *LeaseWorker 288 } 289 290 func (r *leaseWorkerReplacer) needed() <-chan struct{} { 291 return worker.Dead(r.current) 292 } 293 294 func (r *leaseWorkerReplacer) prepare() bool { 295 var err error 296 r.next, err = r.start() 297 return err == nil 298 } 299 300 func (r *leaseWorkerReplacer) replace() { 301 *r.target = r.next 302 r.current = r.next 303 r.next = nil 304 } 305 306 // DynamicLeaseManager is a workers.LeaseManager that calls a given function 307 // to acquire a fresh LeaseManager for each method call. This enables us to 308 // hide the fact that workers returned from RestartManager may become stale. 309 type DynamicLeaseManager struct { 310 mu *sync.Mutex 311 w *LeaseWorker 312 } 313 314 // Claim is part of the lease.Claimer interface. 315 func (d DynamicLeaseManager) Claim(leaseName, holderName string, duration time.Duration) error { 316 return d.Underlying().Claim(leaseName, holderName, duration) 317 } 318 319 // WaitUntilExpired is part of the lease.Claimer interface. 320 func (d DynamicLeaseManager) WaitUntilExpired(leaseName string) error { 321 return d.Underlying().WaitUntilExpired(leaseName) 322 } 323 324 // Token is part of the lease.Checker interface. 325 func (d DynamicLeaseManager) Token(leaseName, holderName string) lease.Token { 326 return d.Underlying().Token(leaseName, holderName) 327 } 328 329 // Underlying returns the current underlying LeaseManager. 330 func (d DynamicLeaseManager) Underlying() LeaseManager { 331 d.mu.Lock() 332 defer d.mu.Unlock() 333 return *d.w 334 }