github.com/mattyw/juju@v0.0.0-20140610034352-732aecd63861/worker/singular/mongo_test.go (about) 1 // Copyright 2014 Canonical Ltd. 2 // Licensed under the AGPLv3, see LICENCE file for details. 3 4 package singular_test 5 6 import ( 7 "flag" 8 "fmt" 9 "strings" 10 "time" 11 12 "github.com/juju/loggo" 13 "github.com/juju/utils" 14 "labix.org/v2/mgo" 15 gc "launchpad.net/gocheck" 16 17 "github.com/juju/juju/replicaset" 18 "github.com/juju/juju/testing" 19 "github.com/juju/juju/worker" 20 "github.com/juju/juju/worker/singular" 21 ) 22 23 var logger = loggo.GetLogger("juju.singular-test") 24 25 type mongoSuite struct { 26 testing.BaseSuite 27 } 28 29 var enableUnreliableTests = flag.Bool("juju.unreliabletests", false, "enable unreliable and slow tests") 30 31 var _ = gc.Suite(&mongoSuite{}) 32 33 func (*mongoSuite) SetUpSuite(c *gc.C) { 34 if !*enableUnreliableTests { 35 c.Skip("skipping unreliable tests") 36 } 37 } 38 39 // start replica set with three mongods 40 // start singular worker on each one. 41 // change worker priorities so the master changes. 42 // check that 43 // a) there is never more than one running at a time 44 // b) the running worker changes when the master changes. 45 46 func (*mongoSuite) TestMongoMastership(c *gc.C) { 47 insts, err := startReplicaSet(3) 48 c.Assert(err, gc.IsNil) 49 for _, inst := range insts { 50 defer inst.Destroy() 51 } 52 notifyCh := make(chan event, 100) 53 globalState := newGlobalAgentState(len(insts), notifyCh) 54 55 agents := startAgents(c, notifyCh, insts) 56 57 assertAgentsConnect(c, globalState) 58 59 // Wait for one of the agents to start. 60 for globalState.activeId == -1 { 61 globalState.waitEvent(c) 62 } 63 c.Logf("agent %d started; waiting for servers to sync", globalState.activeId) 64 time.Sleep(1 * time.Minute) 65 66 // Try to choose a different agent than the primary to 67 // make master (note we can't just do (activeId+1)%len(insts) 68 // because ids start at 1 not 0) 69 nextId := ((globalState.activeId+1)-1)%len(insts) + 1 70 71 c.Logf("giving agent %d priority to become master", nextId) 72 changeVotes(c, insts, nextId) 73 74 // Wait for the first agent to stop and another agent 75 // to start. Note that because of mongo's vagaries, we 76 // cannot be sure which agent will actually start, even 77 // though we've set our priorities to hope that a 78 // particular mongo instance (nextId) becomes master. 79 oldId := globalState.activeId 80 oldHasStopped := false 81 for { 82 if oldHasStopped && globalState.activeId != -1 { 83 break 84 } 85 got := globalState.waitEvent(c) 86 if got.kind == "stop" && got.id == oldId { 87 oldHasStopped = true 88 } 89 } 90 91 // Kill all the agents and wait for them to quit. 92 for _, a := range agents { 93 if a.Runner == nil { 94 panic("runner is nil") 95 } 96 a.Kill() 97 } 98 99 assertAgentsQuit(c, globalState) 100 } 101 102 func startAgents(c *gc.C, notifyCh chan<- event, insts []*testing.MgoInstance) []*agent { 103 agents := make([]*agent, len(insts)) 104 for i, inst := range insts { 105 a := &agent{ 106 // Note: we use ids starting from 1 to match 107 // the replica set ids. 108 notify: ¬ifier{ 109 id: i + 1, 110 ch: notifyCh, 111 }, 112 Runner: newRunner(), 113 hostPort: inst.Addr(), 114 } 115 go func() { 116 err := a.run() 117 a.notify.agentQuit(err) 118 }() 119 agents[i] = a 120 } 121 return agents 122 } 123 124 // assertAgentsConnect waits for all the agents to connect. 125 func assertAgentsConnect(c *gc.C, globalState *globalAgentState) { 126 allConnected := func() bool { 127 for _, connected := range globalState.connected { 128 if !connected { 129 return false 130 } 131 } 132 return true 133 } 134 for !allConnected() { 135 globalState.waitEvent(c) 136 } 137 } 138 139 func assertAgentsQuit(c *gc.C, globalState *globalAgentState) { 140 allQuit := func() bool { 141 for _, quit := range globalState.quit { 142 if !quit { 143 return false 144 } 145 } 146 return true 147 } 148 for !allQuit() { 149 globalState.waitEvent(c) 150 } 151 } 152 153 type agent struct { 154 notify *notifier 155 worker.Runner 156 hostPort string 157 } 158 159 func (a *agent) run() error { 160 a.Runner.StartWorker(fmt.Sprint("mongo-", a.notify.id), a.mongoWorker) 161 return a.Runner.Wait() 162 } 163 164 func (a *agent) mongoWorker() (worker.Worker, error) { 165 dialInfo := testing.MgoDialInfo(a.hostPort) 166 session, err := mgo.DialWithInfo(dialInfo) 167 if err != nil { 168 return nil, err 169 } 170 mc := &mongoConn{ 171 localHostPort: a.hostPort, 172 session: session, 173 } 174 runner := worker.NewRunner( 175 connectionIsFatal(mc), 176 func(err0, err1 error) bool { return true }, 177 ) 178 singularRunner, err := singular.New(runner, mc) 179 if err != nil { 180 return nil, fmt.Errorf("cannot start singular runner: %v", err) 181 } 182 a.notify.workerConnected() 183 singularRunner.StartWorker(fmt.Sprint("worker-", a.notify.id), func() (worker.Worker, error) { 184 return worker.NewSimpleWorker(func(stop <-chan struct{}) error { 185 return a.worker(session, stop) 186 }), nil 187 }) 188 return runner, nil 189 } 190 191 func (a *agent) worker(session *mgo.Session, stop <-chan struct{}) error { 192 a.notify.workerStarted() 193 defer a.notify.workerStopped() 194 coll := session.DB("foo").C("bar") 195 for { 196 select { 197 case <-stop: 198 return nil 199 case <-time.After(250 * time.Millisecond): 200 } 201 if err := coll.Insert(struct{}{}); err != nil { 202 return fmt.Errorf("insert error: %v", err) 203 } 204 a.notify.operation() 205 } 206 } 207 208 // globalAgentState keeps track of the global state 209 // of all the running "agents". The state is 210 // updated by the waitEvent method. 211 // The slices (connected, started and quit) hold an entry for each 212 // agent - the entry for the agent with id x is held at index x-1. 213 type globalAgentState struct { 214 numAgents int 215 notifyCh <-chan event 216 217 // connected reports which agents have ever connected. 218 connected []bool 219 220 // started reports which agents have started. 221 started []bool 222 223 // quit reports which agents have quit. 224 quit []bool 225 226 // activeId holds the id of the agent that is 227 // currently performing operations. 228 activeId int 229 } 230 231 // newGlobalAgentState returns a globalAgentState instance that keeps track 232 // of the given number of agents which all send events on notifyCh. 233 func newGlobalAgentState(numAgents int, notifyCh <-chan event) *globalAgentState { 234 return &globalAgentState{ 235 notifyCh: notifyCh, 236 numAgents: numAgents, 237 connected: make([]bool, numAgents), 238 239 started: make([]bool, numAgents), 240 241 quit: make([]bool, numAgents), 242 activeId: -1, 243 } 244 } 245 246 func (g *globalAgentState) String() string { 247 return fmt.Sprintf("{active %d; connected %s; started %s; quit %s}", 248 g.activeId, 249 boolsToStr(g.connected), 250 boolsToStr(g.started), 251 boolsToStr(g.quit), 252 ) 253 } 254 255 func boolsToStr(b []bool) string { 256 d := make([]byte, len(b)) 257 for i, ok := range b { 258 if ok { 259 d[i] = '1' 260 } else { 261 d[i] = '0' 262 } 263 } 264 return string(d) 265 } 266 267 // waitEvent waits for any event to happen and updates g 268 // accordingly. It ensures that expected invariants are 269 // maintained - if an invariant is violated, a fatal error 270 // will be generated using c. 271 func (g *globalAgentState) waitEvent(c *gc.C) event { 272 c.Logf("awaiting event; current state %s", g) 273 274 possible := g.possibleEvents() 275 c.Logf("possible: %q", possible) 276 277 got := expectNotification(c, g.notifyCh, possible) 278 index := got.id - 1 279 switch got.kind { 280 case "connect": 281 g.connected[index] = true 282 case "start": 283 g.started[index] = true 284 case "operation": 285 if g.activeId != -1 && g.activeId != got.id { 286 c.Fatalf("mixed operations from different agents") 287 } 288 g.activeId = got.id 289 case "stop": 290 g.activeId = -1 291 g.started[index] = false 292 case "quit": 293 g.quit[index] = true 294 c.Assert(got.info, gc.IsNil) 295 default: 296 c.Fatalf("unexpected event %q", got) 297 } 298 return got 299 } 300 301 func (g *globalAgentState) possibleEvents() []event { 302 var possible []event 303 for i := 0; i < g.numAgents; i++ { 304 isConnected, isStarted, hasQuit := g.connected[i], g.started[i], g.quit[i] 305 id := i + 1 306 addPossible := func(kind string) { 307 possible = append(possible, event{kind: kind, id: id}) 308 } 309 if !isConnected { 310 addPossible("connect") 311 continue 312 } 313 if isStarted { 314 if g.activeId == -1 || id == g.activeId { 315 // If there's no active worker, then we allow 316 // any worker to run an operation, but 317 // once a worker has successfully run an 318 // operation, it will be an error if any 319 // other worker runs an operation before 320 // the first worker has stopped. 321 addPossible("operation") 322 } 323 // It's always ok for a started worker to stop. 324 addPossible("stop") 325 } else { 326 // connect followed by connect is possible for a worker 327 // that's not master. 328 addPossible("connect") 329 330 // We allow any number of workers to start - it's 331 // ok as long as none of the extra workers actually 332 // manage to complete an operation successfully. 333 addPossible("start") 334 335 if !hasQuit { 336 addPossible("quit") 337 } 338 } 339 } 340 return possible 341 } 342 343 func mkEvent(s string) event { 344 var e event 345 if n, _ := fmt.Sscanf(s, "%s %d", &e.kind, &e.id); n != 2 { 346 panic("invalid event " + s) 347 } 348 return e 349 } 350 351 func mkEvents(ss ...string) []event { 352 events := make([]event, len(ss)) 353 for i, s := range ss { 354 events[i] = mkEvent(s) 355 } 356 return events 357 } 358 359 type event struct { 360 kind string 361 id int 362 info interface{} 363 } 364 365 func (e event) String() string { 366 if e.info != nil { 367 return fmt.Sprintf("%s %d %v", e.kind, e.id, e.info) 368 } else { 369 return fmt.Sprintf("%s %d", e.kind, e.id) 370 } 371 } 372 373 func oneOf(possible ...string) string { 374 return strings.Join(possible, "|") 375 } 376 377 func expectNotification(c *gc.C, notifyCh <-chan event, possible []event) event { 378 select { 379 case e := <-notifyCh: 380 c.Logf("received notification %q", e) 381 for _, p := range possible { 382 if e.kind == p.kind && e.id == p.id { 383 return e 384 } 385 } 386 c.Fatalf("event %q does not match any of %q", e, possible) 387 return e 388 case <-time.After(testing.LongWait): 389 c.Fatalf("timed out waiting for %q", possible) 390 } 391 panic("unreachable") 392 } 393 394 func changeVotes(c *gc.C, insts []*testing.MgoInstance, voteId int) { 395 c.Logf("changing voting id to %v", voteId) 396 397 addrs := make([]string, len(insts)) 398 for i, inst := range insts { 399 addrs[i] = inst.Addr() 400 } 401 dialInfo := testing.MgoDialInfo(addrs...) 402 403 session, err := mgo.DialWithInfo(dialInfo) 404 c.Assert(err, gc.IsNil) 405 defer session.Close() 406 407 members, err := replicaset.CurrentMembers(session) 408 c.Assert(err, gc.IsNil) 409 c.Assert(members, gc.HasLen, len(insts)) 410 for i := range members { 411 member := &members[i] 412 if member.Id == voteId { 413 member.Priority = nil 414 } else { 415 member.Priority = newFloat64(0.1) 416 } 417 } 418 c.Logf("new member set: %#v", members) 419 err = replicaset.Set(session, members) 420 c.Assert(err, gc.IsNil) 421 422 c.Logf("successfully changed replica set members") 423 } 424 425 type notifier struct { 426 id int 427 ch chan<- event 428 } 429 430 func (n *notifier) sendEvent(kind string, info interface{}) { 431 n.ch <- event{ 432 id: n.id, 433 kind: kind, 434 info: info, 435 } 436 } 437 438 func (n *notifier) workerConnected() { 439 n.sendEvent("connect", nil) 440 } 441 442 func (n *notifier) workerStarted() { 443 n.sendEvent("start", nil) 444 } 445 446 func (n *notifier) workerStopped() { 447 n.sendEvent("stop", nil) 448 } 449 450 func (n *notifier) operation() { 451 n.sendEvent("operation", nil) 452 } 453 454 func (n *notifier) agentQuit(err error) { 455 n.sendEvent("quit", err) 456 } 457 458 type mongoConn struct { 459 localHostPort string 460 session *mgo.Session 461 } 462 463 func (c *mongoConn) Ping() error { 464 return c.session.Ping() 465 } 466 467 func (c *mongoConn) IsMaster() (bool, error) { 468 hostPort, err := replicaset.MasterHostPort(c.session) 469 if err != nil { 470 logger.Errorf("replicaset.MasterHostPort returned error: %v", err) 471 return false, err 472 } 473 logger.Errorf("replicaset.MasterHostPort(%s) returned %s", c.localHostPort, hostPort) 474 logger.Errorf("-> %s IsMaster: %v", c.localHostPort, hostPort == c.localHostPort) 475 return hostPort == c.localHostPort, nil 476 } 477 478 const replicaSetName = "juju" 479 480 // startReplicaSet starts up a replica set with n mongo instances. 481 func startReplicaSet(n int) (_ []*testing.MgoInstance, err error) { 482 insts := make([]*testing.MgoInstance, 0, n) 483 root, err := newMongoInstance() 484 if err != nil { 485 return nil, err 486 } 487 insts = append(insts, root) 488 defer func() { 489 if err == nil { 490 return 491 } 492 for _, inst := range insts { 493 inst.Destroy() 494 } 495 }() 496 497 dialInfo := root.DialInfo() 498 dialInfo.Direct = true 499 dialInfo.Timeout = 60 * time.Second 500 501 session, err := root.DialDirect() 502 if err != nil { 503 return nil, fmt.Errorf("cannot dial root instance: %v", err) 504 } 505 defer session.Close() 506 507 logger.Infof("dialled root instance") 508 509 if err := replicaset.Initiate(session, root.Addr(), replicaSetName, nil); err != nil { 510 return nil, fmt.Errorf("cannot initiate replica set: %v", err) 511 } 512 var members []replicaset.Member 513 for i := 1; i < n; i++ { 514 inst, err := newMongoInstance() 515 if err != nil { 516 return nil, err 517 } 518 insts = append(insts, inst) 519 members = append(members, replicaset.Member{ 520 Address: inst.Addr(), 521 Priority: newFloat64(0.1), 522 Id: i + 1, 523 }) 524 } 525 attempt := utils.AttemptStrategy{ 526 Total: 60 * time.Second, 527 Delay: 1 * time.Second, 528 } 529 for a := attempt.Start(); a.Next(); { 530 err := replicaset.Add(session, members...) 531 if err == nil { 532 break 533 } 534 logger.Errorf("cannot add members: %v", err) 535 if !a.HasNext() { 536 return nil, fmt.Errorf("timed out trying to add members") 537 } 538 logger.Errorf("retrying") 539 } 540 return insts, err 541 } 542 543 func newMongoInstance() (*testing.MgoInstance, error) { 544 inst := &testing.MgoInstance{Params: []string{"--replSet", replicaSetName}} 545 if err := inst.Start(true); err != nil { 546 return nil, fmt.Errorf("cannot start mongo server: %s", err.Error()) 547 } 548 return inst, nil 549 } 550 551 func newFloat64(f float64) *float64 { 552 return &f 553 } 554 555 // connectionIsFatal returns a function suitable for passing 556 // as the isFatal argument to worker.NewRunner, 557 // that diagnoses an error as fatal if the connection 558 // has failed or if the error is otherwise fatal. 559 // Copied from jujud. 560 func connectionIsFatal(conn singular.Conn) func(err error) bool { 561 return func(err error) bool { 562 if err := conn.Ping(); err != nil { 563 logger.Infof("error pinging %T: %v", conn, err) 564 return true 565 } 566 logger.Infof("error %q is not fatal", err) 567 return false 568 } 569 }