github.com/cloud-green/juju@v0.0.0-20151002100041-a00291338d3d/worker/singular/mongo_test.go (about) 1 // Copyright 2014 Canonical Ltd. 2 // Licensed under the AGPLv3, see LICENCE file for details. 3 4 package singular_test 5 6 import ( 7 "flag" 8 "fmt" 9 "strings" 10 "time" 11 12 "github.com/juju/loggo" 13 "github.com/juju/replicaset" 14 gitjujutesting "github.com/juju/testing" 15 jc "github.com/juju/testing/checkers" 16 "github.com/juju/utils" 17 gc "gopkg.in/check.v1" 18 "gopkg.in/mgo.v2" 19 20 "github.com/juju/juju/testing" 21 coretesting "github.com/juju/juju/testing" 22 "github.com/juju/juju/worker" 23 "github.com/juju/juju/worker/singular" 24 ) 25 26 var logger = loggo.GetLogger("juju.singular-test") 27 28 type mongoSuite struct { 29 testing.BaseSuite 30 } 31 32 var enableUnreliableTests = flag.Bool("juju.unreliabletests", false, "enable unreliable and slow tests") 33 34 var _ = gc.Suite(&mongoSuite{}) 35 36 func (*mongoSuite) SetUpSuite(c *gc.C) { 37 if !*enableUnreliableTests { 38 c.Skip("skipping unreliable tests") 39 } 40 } 41 42 // start replica set with three mongods 43 // start singular worker on each one. 44 // change worker priorities so the master changes. 45 // check that 46 // a) there is never more than one running at a time 47 // b) the running worker changes when the master changes. 48 49 func (*mongoSuite) TestMongoMastership(c *gc.C) { 50 insts, err := startReplicaSet(3) 51 c.Assert(err, jc.ErrorIsNil) 52 for _, inst := range insts { 53 defer inst.Destroy() 54 } 55 notifyCh := make(chan event, 100) 56 globalState := newGlobalAgentState(len(insts), notifyCh) 57 58 agents := startAgents(c, notifyCh, insts) 59 60 assertAgentsConnect(c, globalState) 61 62 // Wait for one of the agents to start. 63 for globalState.activeId == -1 { 64 globalState.waitEvent(c) 65 } 66 c.Logf("agent %d started; waiting for servers to sync", globalState.activeId) 67 time.Sleep(1 * time.Minute) 68 69 // Try to choose a different agent than the primary to 70 // make master (note we can't just do (activeId+1)%len(insts) 71 // because ids start at 1 not 0) 72 nextId := ((globalState.activeId+1)-1)%len(insts) + 1 73 74 c.Logf("giving agent %d priority to become master", nextId) 75 changeVotes(c, insts, nextId) 76 77 // Wait for the first agent to stop and another agent 78 // to start. Note that because of mongo's vagaries, we 79 // cannot be sure which agent will actually start, even 80 // though we've set our priorities to hope that a 81 // particular mongo instance (nextId) becomes master. 82 oldId := globalState.activeId 83 oldHasStopped := false 84 for { 85 if oldHasStopped && globalState.activeId != -1 { 86 break 87 } 88 got := globalState.waitEvent(c) 89 if got.kind == "stop" && got.id == oldId { 90 oldHasStopped = true 91 } 92 } 93 94 // Kill all the agents and wait for them to quit. 95 for _, a := range agents { 96 if a.Runner == nil { 97 panic("runner is nil") 98 } 99 a.Kill() 100 } 101 102 assertAgentsQuit(c, globalState) 103 } 104 105 func startAgents(c *gc.C, notifyCh chan<- event, insts []*gitjujutesting.MgoInstance) []*agent { 106 agents := make([]*agent, len(insts)) 107 for i, inst := range insts { 108 a := &agent{ 109 // Note: we use ids starting from 1 to match 110 // the replica set ids. 111 notify: ¬ifier{ 112 id: i + 1, 113 ch: notifyCh, 114 }, 115 Runner: newRunner(), 116 hostPort: inst.Addr(), 117 } 118 go func() { 119 err := a.run() 120 a.notify.agentQuit(err) 121 }() 122 agents[i] = a 123 } 124 return agents 125 } 126 127 // assertAgentsConnect waits for all the agents to connect. 128 func assertAgentsConnect(c *gc.C, globalState *globalAgentState) { 129 allConnected := func() bool { 130 for _, connected := range globalState.connected { 131 if !connected { 132 return false 133 } 134 } 135 return true 136 } 137 for !allConnected() { 138 globalState.waitEvent(c) 139 } 140 } 141 142 func assertAgentsQuit(c *gc.C, globalState *globalAgentState) { 143 allQuit := func() bool { 144 for _, quit := range globalState.quit { 145 if !quit { 146 return false 147 } 148 } 149 return true 150 } 151 for !allQuit() { 152 globalState.waitEvent(c) 153 } 154 } 155 156 type agent struct { 157 notify *notifier 158 worker.Runner 159 hostPort string 160 } 161 162 func (a *agent) run() error { 163 a.Runner.StartWorker(fmt.Sprint("mongo-", a.notify.id), a.mongoWorker) 164 return a.Runner.Wait() 165 } 166 167 func (a *agent) mongoWorker() (worker.Worker, error) { 168 dialInfo := gitjujutesting.MgoDialInfo(coretesting.Certs, a.hostPort) 169 session, err := mgo.DialWithInfo(dialInfo) 170 if err != nil { 171 return nil, err 172 } 173 mc := &mongoConn{ 174 localHostPort: a.hostPort, 175 session: session, 176 } 177 runner := worker.NewRunner( 178 connectionIsFatal(mc), 179 func(err0, err1 error) bool { return true }, 180 ) 181 singularRunner, err := singular.New(runner, mc) 182 if err != nil { 183 return nil, fmt.Errorf("cannot start singular runner: %v", err) 184 } 185 a.notify.workerConnected() 186 singularRunner.StartWorker(fmt.Sprint("worker-", a.notify.id), func() (worker.Worker, error) { 187 return worker.NewSimpleWorker(func(stop <-chan struct{}) error { 188 return a.worker(session, stop) 189 }), nil 190 }) 191 return runner, nil 192 } 193 194 func (a *agent) worker(session *mgo.Session, stop <-chan struct{}) error { 195 a.notify.workerStarted() 196 defer a.notify.workerStopped() 197 coll := session.DB("foo").C("bar") 198 for { 199 select { 200 case <-stop: 201 return nil 202 case <-time.After(250 * time.Millisecond): 203 } 204 if err := coll.Insert(struct{}{}); err != nil { 205 return fmt.Errorf("insert error: %v", err) 206 } 207 a.notify.operation() 208 } 209 } 210 211 // globalAgentState keeps track of the global state 212 // of all the running "agents". The state is 213 // updated by the waitEvent method. 214 // The slices (connected, started and quit) hold an entry for each 215 // agent - the entry for the agent with id x is held at index x-1. 216 type globalAgentState struct { 217 numAgents int 218 notifyCh <-chan event 219 220 // connected reports which agents have ever connected. 221 connected []bool 222 223 // started reports which agents have started. 224 started []bool 225 226 // quit reports which agents have quit. 227 quit []bool 228 229 // activeId holds the id of the agent that is 230 // currently performing operations. 231 activeId int 232 } 233 234 // newGlobalAgentState returns a globalAgentState instance that keeps track 235 // of the given number of agents which all send events on notifyCh. 236 func newGlobalAgentState(numAgents int, notifyCh <-chan event) *globalAgentState { 237 return &globalAgentState{ 238 notifyCh: notifyCh, 239 numAgents: numAgents, 240 connected: make([]bool, numAgents), 241 242 started: make([]bool, numAgents), 243 244 quit: make([]bool, numAgents), 245 activeId: -1, 246 } 247 } 248 249 func (g *globalAgentState) String() string { 250 return fmt.Sprintf("{active %d; connected %s; started %s; quit %s}", 251 g.activeId, 252 boolsToStr(g.connected), 253 boolsToStr(g.started), 254 boolsToStr(g.quit), 255 ) 256 } 257 258 func boolsToStr(b []bool) string { 259 d := make([]byte, len(b)) 260 for i, ok := range b { 261 if ok { 262 d[i] = '1' 263 } else { 264 d[i] = '0' 265 } 266 } 267 return string(d) 268 } 269 270 // waitEvent waits for any event to happen and updates g 271 // accordingly. It ensures that expected invariants are 272 // maintained - if an invariant is violated, a fatal error 273 // will be generated using c. 274 func (g *globalAgentState) waitEvent(c *gc.C) event { 275 c.Logf("awaiting event; current state %s", g) 276 277 possible := g.possibleEvents() 278 c.Logf("possible: %q", possible) 279 280 got := expectNotification(c, g.notifyCh, possible) 281 index := got.id - 1 282 switch got.kind { 283 case "connect": 284 g.connected[index] = true 285 case "start": 286 g.started[index] = true 287 case "operation": 288 if g.activeId != -1 && g.activeId != got.id { 289 c.Fatalf("mixed operations from different agents") 290 } 291 g.activeId = got.id 292 case "stop": 293 g.activeId = -1 294 g.started[index] = false 295 case "quit": 296 g.quit[index] = true 297 c.Assert(got.info, gc.IsNil) 298 default: 299 c.Fatalf("unexpected event %q", got) 300 } 301 return got 302 } 303 304 func (g *globalAgentState) possibleEvents() []event { 305 var possible []event 306 for i := 0; i < g.numAgents; i++ { 307 isConnected, isStarted, hasQuit := g.connected[i], g.started[i], g.quit[i] 308 id := i + 1 309 addPossible := func(kind string) { 310 possible = append(possible, event{kind: kind, id: id}) 311 } 312 if !isConnected { 313 addPossible("connect") 314 continue 315 } 316 if isStarted { 317 if g.activeId == -1 || id == g.activeId { 318 // If there's no active worker, then we allow 319 // any worker to run an operation, but 320 // once a worker has successfully run an 321 // operation, it will be an error if any 322 // other worker runs an operation before 323 // the first worker has stopped. 324 addPossible("operation") 325 } 326 // It's always ok for a started worker to stop. 327 addPossible("stop") 328 } else { 329 // connect followed by connect is possible for a worker 330 // that's not master. 331 addPossible("connect") 332 333 // We allow any number of workers to start - it's 334 // ok as long as none of the extra workers actually 335 // manage to complete an operation successfully. 336 addPossible("start") 337 338 if !hasQuit { 339 addPossible("quit") 340 } 341 } 342 } 343 return possible 344 } 345 346 func mkEvent(s string) event { 347 var e event 348 if n, _ := fmt.Sscanf(s, "%s %d", &e.kind, &e.id); n != 2 { 349 panic("invalid event " + s) 350 } 351 return e 352 } 353 354 func mkEvents(ss ...string) []event { 355 events := make([]event, len(ss)) 356 for i, s := range ss { 357 events[i] = mkEvent(s) 358 } 359 return events 360 } 361 362 type event struct { 363 kind string 364 id int 365 info interface{} 366 } 367 368 func (e event) String() string { 369 if e.info != nil { 370 return fmt.Sprintf("%s %d %v", e.kind, e.id, e.info) 371 } else { 372 return fmt.Sprintf("%s %d", e.kind, e.id) 373 } 374 } 375 376 func oneOf(possible ...string) string { 377 return strings.Join(possible, "|") 378 } 379 380 func expectNotification(c *gc.C, notifyCh <-chan event, possible []event) event { 381 select { 382 case e := <-notifyCh: 383 c.Logf("received notification %q", e) 384 for _, p := range possible { 385 if e.kind == p.kind && e.id == p.id { 386 return e 387 } 388 } 389 c.Fatalf("event %q does not match any of %q", e, possible) 390 return e 391 case <-time.After(testing.LongWait): 392 c.Fatalf("timed out waiting for %q", possible) 393 } 394 panic("unreachable") 395 } 396 397 func changeVotes(c *gc.C, insts []*gitjujutesting.MgoInstance, voteId int) { 398 c.Logf("changing voting id to %v", voteId) 399 400 addrs := make([]string, len(insts)) 401 for i, inst := range insts { 402 addrs[i] = inst.Addr() 403 } 404 dialInfo := gitjujutesting.MgoDialInfo(coretesting.Certs, addrs...) 405 406 session, err := mgo.DialWithInfo(dialInfo) 407 c.Assert(err, jc.ErrorIsNil) 408 defer session.Close() 409 410 members, err := replicaset.CurrentMembers(session) 411 c.Assert(err, jc.ErrorIsNil) 412 c.Assert(members, gc.HasLen, len(insts)) 413 for i := range members { 414 member := &members[i] 415 if member.Id == voteId { 416 member.Priority = nil 417 } else { 418 member.Priority = newFloat64(0.1) 419 } 420 } 421 c.Logf("new member set: %#v", members) 422 err = replicaset.Set(session, members) 423 c.Assert(err, jc.ErrorIsNil) 424 425 c.Logf("successfully changed replica set members") 426 } 427 428 type notifier struct { 429 id int 430 ch chan<- event 431 } 432 433 func (n *notifier) sendEvent(kind string, info interface{}) { 434 n.ch <- event{ 435 id: n.id, 436 kind: kind, 437 info: info, 438 } 439 } 440 441 func (n *notifier) workerConnected() { 442 n.sendEvent("connect", nil) 443 } 444 445 func (n *notifier) workerStarted() { 446 n.sendEvent("start", nil) 447 } 448 449 func (n *notifier) workerStopped() { 450 n.sendEvent("stop", nil) 451 } 452 453 func (n *notifier) operation() { 454 n.sendEvent("operation", nil) 455 } 456 457 func (n *notifier) agentQuit(err error) { 458 n.sendEvent("quit", err) 459 } 460 461 type mongoConn struct { 462 localHostPort string 463 session *mgo.Session 464 } 465 466 func (c *mongoConn) Ping() error { 467 return c.session.Ping() 468 } 469 470 func (c *mongoConn) IsMaster() (bool, error) { 471 hostPort, err := replicaset.MasterHostPort(c.session) 472 if err != nil { 473 logger.Errorf("replicaset.MasterHostPort returned error: %v", err) 474 return false, err 475 } 476 logger.Errorf("replicaset.MasterHostPort(%s) returned %s", c.localHostPort, hostPort) 477 logger.Errorf("-> %s IsMaster: %v", c.localHostPort, hostPort == c.localHostPort) 478 return hostPort == c.localHostPort, nil 479 } 480 481 const replicaSetName = "juju" 482 483 // startReplicaSet starts up a replica set with n mongo instances. 484 func startReplicaSet(n int) (_ []*gitjujutesting.MgoInstance, err error) { 485 insts := make([]*gitjujutesting.MgoInstance, 0, n) 486 root, err := newMongoInstance() 487 if err != nil { 488 return nil, err 489 } 490 insts = append(insts, root) 491 defer func() { 492 if err == nil { 493 return 494 } 495 for _, inst := range insts { 496 inst.Destroy() 497 } 498 }() 499 500 dialInfo := root.DialInfo() 501 dialInfo.Direct = true 502 dialInfo.Timeout = 60 * time.Second 503 504 session, err := root.DialDirect() 505 if err != nil { 506 return nil, fmt.Errorf("cannot dial root instance: %v", err) 507 } 508 defer session.Close() 509 510 logger.Infof("dialled root instance") 511 512 if err := replicaset.Initiate(session, root.Addr(), replicaSetName, nil); err != nil { 513 return nil, fmt.Errorf("cannot initiate replica set: %v", err) 514 } 515 var members []replicaset.Member 516 for i := 1; i < n; i++ { 517 inst, err := newMongoInstance() 518 if err != nil { 519 return nil, err 520 } 521 insts = append(insts, inst) 522 members = append(members, replicaset.Member{ 523 Address: inst.Addr(), 524 Priority: newFloat64(0.1), 525 Id: i + 1, 526 }) 527 } 528 attempt := utils.AttemptStrategy{ 529 Total: 60 * time.Second, 530 Delay: 1 * time.Second, 531 } 532 for a := attempt.Start(); a.Next(); { 533 err := replicaset.Add(session, members...) 534 if err == nil { 535 break 536 } 537 logger.Errorf("cannot add members: %v", err) 538 if !a.HasNext() { 539 return nil, fmt.Errorf("timed out trying to add members") 540 } 541 logger.Errorf("retrying") 542 } 543 return insts, err 544 } 545 546 func newMongoInstance() (*gitjujutesting.MgoInstance, error) { 547 inst := &gitjujutesting.MgoInstance{Params: []string{"--replSet", replicaSetName}} 548 if err := inst.Start(testing.Certs); err != nil { 549 return nil, fmt.Errorf("cannot start mongo server: %s", err.Error()) 550 } 551 return inst, nil 552 } 553 554 func newFloat64(f float64) *float64 { 555 return &f 556 } 557 558 // connectionIsFatal returns a function suitable for passing 559 // as the isFatal argument to worker.NewRunner, 560 // that diagnoses an error as fatal if the connection 561 // has failed or if the error is otherwise fatal. 562 // Copied from jujud. 563 func connectionIsFatal(conn singular.Conn) func(err error) bool { 564 return func(err error) bool { 565 if err := conn.Ping(); err != nil { 566 logger.Infof("error pinging %T: %v", conn, err) 567 return true 568 } 569 logger.Infof("error %q is not fatal", err) 570 return false 571 } 572 }