github.com/wallyworld/juju@v0.0.0-20161013125918-6cf1bc9d917a/worker/singular/mongo_test.go (about) 1 // Copyright 2014 Canonical Ltd. 2 // Licensed under the AGPLv3, see LICENCE file for details. 3 4 package singular_test 5 6 import ( 7 "flag" 8 "fmt" 9 "strings" 10 "time" 11 12 "github.com/juju/loggo" 13 "github.com/juju/replicaset" 14 gitjujutesting "github.com/juju/testing" 15 jc "github.com/juju/testing/checkers" 16 "github.com/juju/utils" 17 gc "gopkg.in/check.v1" 18 "gopkg.in/mgo.v2" 19 20 "github.com/juju/juju/testing" 21 coretesting "github.com/juju/juju/testing" 22 "github.com/juju/juju/worker" 23 "github.com/juju/juju/worker/singular" 24 ) 25 26 var logger = loggo.GetLogger("juju.singular-test") 27 28 type mongoSuite struct { 29 testing.BaseSuite 30 } 31 32 var _ = gc.Suite(&mongoSuite{}) 33 34 var enableUnreliableTests = flag.Bool("juju.unreliabletests", false, "enable unreliable and slow tests") 35 36 func (*mongoSuite) SetUpSuite(c *gc.C) { 37 if !*enableUnreliableTests { 38 c.Skip("skipping unreliable tests") 39 } 40 } 41 42 // start replica set with three mongods 43 // start singular worker on each one. 44 // change worker priorities so the master changes. 45 // check that 46 // a) there is never more than one running at a time 47 // b) the running worker changes when the master changes. 48 49 func (*mongoSuite) TestMongoMastership(c *gc.C) { 50 insts, err := startReplicaSet(3) 51 c.Assert(err, jc.ErrorIsNil) 52 for _, inst := range insts { 53 defer inst.Destroy() 54 } 55 notifyCh := make(chan event, 100) 56 globalState := newGlobalAgentState(len(insts), notifyCh) 57 58 agents := startAgents(c, notifyCh, insts) 59 60 assertAgentsConnect(c, globalState) 61 62 // Wait for one of the agents to start. 63 for globalState.activeId == -1 { 64 globalState.waitEvent(c) 65 } 66 c.Logf("agent %d started; waiting for servers to sync", globalState.activeId) 67 time.Sleep(1 * time.Minute) 68 69 // Try to choose a different agent than the primary to 70 // make master (note we can't just do (activeId+1)%len(insts) 71 // because ids start at 1 not 0) 72 nextId := ((globalState.activeId+1)-1)%len(insts) + 1 73 74 c.Logf("giving agent %d priority to become master", nextId) 75 changeVotes(c, insts, nextId) 76 77 // Wait for the first agent to stop and another agent 78 // to start. Note that because of mongo's vagaries, we 79 // cannot be sure which agent will actually start, even 80 // though we've set our priorities to hope that a 81 // particular mongo instance (nextId) becomes master. 82 oldId := globalState.activeId 83 oldHasStopped := false 84 for { 85 if oldHasStopped && globalState.activeId != -1 { 86 break 87 } 88 got := globalState.waitEvent(c) 89 if got.kind == "stop" && got.id == oldId { 90 oldHasStopped = true 91 } 92 } 93 94 // Kill all the agents and wait for them to quit. 95 for _, a := range agents { 96 if a.Runner == nil { 97 panic("runner is nil") 98 } 99 a.Kill() 100 } 101 102 assertAgentsQuit(c, globalState) 103 } 104 105 func startAgents(c *gc.C, notifyCh chan<- event, insts []*gitjujutesting.MgoInstance) []*testAgent { 106 agents := make([]*testAgent, len(insts)) 107 for i, inst := range insts { 108 a := &testAgent{ 109 // Note: we use ids starting from 1 to match 110 // the replica set ids. 111 notify: ¬ifier{ 112 id: i + 1, 113 ch: notifyCh, 114 }, 115 Runner: newRunner(), 116 hostPort: inst.Addr(), 117 } 118 go func() { 119 err := a.run() 120 a.notify.agentQuit(err) 121 }() 122 agents[i] = a 123 } 124 return agents 125 } 126 127 // assertAgentsConnect waits for all the agents to connect. 128 func assertAgentsConnect(c *gc.C, globalState *globalAgentState) { 129 allConnected := func() bool { 130 for _, connected := range globalState.connected { 131 if !connected { 132 return false 133 } 134 } 135 return true 136 } 137 for !allConnected() { 138 globalState.waitEvent(c) 139 } 140 } 141 142 func assertAgentsQuit(c *gc.C, globalState *globalAgentState) { 143 allQuit := func() bool { 144 for _, quit := range globalState.quit { 145 if !quit { 146 return false 147 } 148 } 149 return true 150 } 151 for !allQuit() { 152 globalState.waitEvent(c) 153 } 154 } 155 156 type testAgent struct { 157 notify *notifier 158 worker.Runner 159 hostPort string 160 } 161 162 func (a *testAgent) run() error { 163 a.Runner.StartWorker(fmt.Sprint("mongo-", a.notify.id), a.mongoWorker) 164 return a.Runner.Wait() 165 } 166 167 func (a *testAgent) mongoWorker() (worker.Worker, error) { 168 dialInfo := gitjujutesting.MgoDialInfo(coretesting.Certs, a.hostPort) 169 session, err := mgo.DialWithInfo(dialInfo) 170 if err != nil { 171 return nil, err 172 } 173 mc := &mongoConn{ 174 localHostPort: a.hostPort, 175 session: session, 176 } 177 178 fn := func(err0, err1 error) bool { return true } 179 runner := worker.NewRunner(connectionIsFatal(mc), fn, worker.RestartDelay) 180 singularRunner, err := singular.New(runner, mc) 181 if err != nil { 182 return nil, fmt.Errorf("cannot start singular runner: %v", err) 183 } 184 a.notify.workerConnected() 185 singularRunner.StartWorker(fmt.Sprint("worker-", a.notify.id), func() (worker.Worker, error) { 186 return worker.NewSimpleWorker(func(stop <-chan struct{}) error { 187 return a.worker(session, stop) 188 }), nil 189 }) 190 return runner, nil 191 } 192 193 func (a *testAgent) worker(session *mgo.Session, stop <-chan struct{}) error { 194 a.notify.workerStarted() 195 defer a.notify.workerStopped() 196 coll := session.DB("foo").C("bar") 197 for { 198 select { 199 case <-stop: 200 return nil 201 case <-time.After(250 * time.Millisecond): 202 } 203 if err := coll.Insert(struct{}{}); err != nil { 204 return fmt.Errorf("insert error: %v", err) 205 } 206 a.notify.operation() 207 } 208 } 209 210 // globalAgentState keeps track of the global state 211 // of all the running "agents". The state is 212 // updated by the waitEvent method. 213 // The slices (connected, started and quit) hold an entry for each 214 // agent - the entry for the agent with id x is held at index x-1. 215 type globalAgentState struct { 216 numAgents int 217 notifyCh <-chan event 218 219 // connected reports which agents have ever connected. 220 connected []bool 221 222 // started reports which agents have started. 223 started []bool 224 225 // quit reports which agents have quit. 226 quit []bool 227 228 // activeId holds the id of the agent that is 229 // currently performing operations. 230 activeId int 231 } 232 233 // newGlobalAgentState returns a globalAgentState instance that keeps track 234 // of the given number of agents which all send events on notifyCh. 235 func newGlobalAgentState(numAgents int, notifyCh <-chan event) *globalAgentState { 236 return &globalAgentState{ 237 notifyCh: notifyCh, 238 numAgents: numAgents, 239 connected: make([]bool, numAgents), 240 241 started: make([]bool, numAgents), 242 243 quit: make([]bool, numAgents), 244 activeId: -1, 245 } 246 } 247 248 func (g *globalAgentState) String() string { 249 return fmt.Sprintf("{active %d; connected %s; started %s; quit %s}", 250 g.activeId, 251 boolsToStr(g.connected), 252 boolsToStr(g.started), 253 boolsToStr(g.quit), 254 ) 255 } 256 257 func boolsToStr(b []bool) string { 258 d := make([]byte, len(b)) 259 for i, ok := range b { 260 if ok { 261 d[i] = '1' 262 } else { 263 d[i] = '0' 264 } 265 } 266 return string(d) 267 } 268 269 // waitEvent waits for any event to happen and updates g 270 // accordingly. It ensures that expected invariants are 271 // maintained - if an invariant is violated, a fatal error 272 // will be generated using c. 273 func (g *globalAgentState) waitEvent(c *gc.C) event { 274 c.Logf("awaiting event; current state %s", g) 275 276 possible := g.possibleEvents() 277 c.Logf("possible: %q", possible) 278 279 got := expectNotification(c, g.notifyCh, possible) 280 index := got.id - 1 281 switch got.kind { 282 case "connect": 283 g.connected[index] = true 284 case "start": 285 g.started[index] = true 286 case "operation": 287 if g.activeId != -1 && g.activeId != got.id { 288 c.Fatalf("mixed operations from different agents") 289 } 290 g.activeId = got.id 291 case "stop": 292 g.activeId = -1 293 g.started[index] = false 294 case "quit": 295 g.quit[index] = true 296 c.Assert(got.info, gc.IsNil) 297 default: 298 c.Fatalf("unexpected event %q", got) 299 } 300 return got 301 } 302 303 func (g *globalAgentState) possibleEvents() []event { 304 var possible []event 305 for i := 0; i < g.numAgents; i++ { 306 isConnected, isStarted, hasQuit := g.connected[i], g.started[i], g.quit[i] 307 id := i + 1 308 addPossible := func(kind string) { 309 possible = append(possible, event{kind: kind, id: id}) 310 } 311 if !isConnected { 312 addPossible("connect") 313 continue 314 } 315 if isStarted { 316 if g.activeId == -1 || id == g.activeId { 317 // If there's no active worker, then we allow 318 // any worker to run an operation, but 319 // once a worker has successfully run an 320 // operation, it will be an error if any 321 // other worker runs an operation before 322 // the first worker has stopped. 323 addPossible("operation") 324 } 325 // It's always ok for a started worker to stop. 326 addPossible("stop") 327 } else { 328 // connect followed by connect is possible for a worker 329 // that's not master. 330 addPossible("connect") 331 332 // We allow any number of workers to start - it's 333 // ok as long as none of the extra workers actually 334 // manage to complete an operation successfully. 335 addPossible("start") 336 337 if !hasQuit { 338 addPossible("quit") 339 } 340 } 341 } 342 return possible 343 } 344 345 func mkEvent(s string) event { 346 var e event 347 if n, _ := fmt.Sscanf(s, "%s %d", &e.kind, &e.id); n != 2 { 348 panic("invalid event " + s) 349 } 350 return e 351 } 352 353 func mkEvents(ss ...string) []event { 354 events := make([]event, len(ss)) 355 for i, s := range ss { 356 events[i] = mkEvent(s) 357 } 358 return events 359 } 360 361 type event struct { 362 kind string 363 id int 364 info interface{} 365 } 366 367 func (e event) String() string { 368 if e.info != nil { 369 return fmt.Sprintf("%s %d %v", e.kind, e.id, e.info) 370 } else { 371 return fmt.Sprintf("%s %d", e.kind, e.id) 372 } 373 } 374 375 func oneOf(possible ...string) string { 376 return strings.Join(possible, "|") 377 } 378 379 func expectNotification(c *gc.C, notifyCh <-chan event, possible []event) event { 380 select { 381 case e := <-notifyCh: 382 c.Logf("received notification %q", e) 383 for _, p := range possible { 384 if e.kind == p.kind && e.id == p.id { 385 return e 386 } 387 } 388 c.Fatalf("event %q does not match any of %q", e, possible) 389 return e 390 case <-time.After(testing.LongWait): 391 c.Fatalf("timed out waiting for %q", possible) 392 } 393 panic("unreachable") 394 } 395 396 func changeVotes(c *gc.C, insts []*gitjujutesting.MgoInstance, voteId int) { 397 c.Logf("changing voting id to %v", voteId) 398 399 addrs := make([]string, len(insts)) 400 for i, inst := range insts { 401 addrs[i] = inst.Addr() 402 } 403 dialInfo := gitjujutesting.MgoDialInfo(coretesting.Certs, addrs...) 404 405 session, err := mgo.DialWithInfo(dialInfo) 406 c.Assert(err, jc.ErrorIsNil) 407 defer session.Close() 408 409 members, err := replicaset.CurrentMembers(session) 410 c.Assert(err, jc.ErrorIsNil) 411 c.Assert(members, gc.HasLen, len(insts)) 412 for i := range members { 413 member := &members[i] 414 if member.Id == voteId { 415 member.Priority = nil 416 } else { 417 member.Priority = newFloat64(0.1) 418 } 419 } 420 c.Logf("new member set: %#v", members) 421 err = replicaset.Set(session, members) 422 c.Assert(err, jc.ErrorIsNil) 423 424 c.Logf("successfully changed replica set members") 425 } 426 427 type notifier struct { 428 id int 429 ch chan<- event 430 } 431 432 func (n *notifier) sendEvent(kind string, info interface{}) { 433 n.ch <- event{ 434 id: n.id, 435 kind: kind, 436 info: info, 437 } 438 } 439 440 func (n *notifier) workerConnected() { 441 n.sendEvent("connect", nil) 442 } 443 444 func (n *notifier) workerStarted() { 445 n.sendEvent("start", nil) 446 } 447 448 func (n *notifier) workerStopped() { 449 n.sendEvent("stop", nil) 450 } 451 452 func (n *notifier) operation() { 453 n.sendEvent("operation", nil) 454 } 455 456 func (n *notifier) agentQuit(err error) { 457 n.sendEvent("quit", err) 458 } 459 460 type mongoConn struct { 461 localHostPort string 462 session *mgo.Session 463 } 464 465 func (c *mongoConn) Ping() error { 466 return c.session.Ping() 467 } 468 469 func (c *mongoConn) IsMaster() (bool, error) { 470 hostPort, err := replicaset.MasterHostPort(c.session) 471 if err != nil { 472 logger.Errorf("replicaset.MasterHostPort returned error: %v", err) 473 return false, err 474 } 475 logger.Errorf("replicaset.MasterHostPort(%s) returned %s", c.localHostPort, hostPort) 476 logger.Errorf("-> %s IsMaster: %v", c.localHostPort, hostPort == c.localHostPort) 477 return hostPort == c.localHostPort, nil 478 } 479 480 const replicaSetName = "juju" 481 482 // startReplicaSet starts up a replica set with n mongo instances. 483 func startReplicaSet(n int) (_ []*gitjujutesting.MgoInstance, err error) { 484 insts := make([]*gitjujutesting.MgoInstance, 0, n) 485 root, err := newMongoInstance() 486 if err != nil { 487 return nil, err 488 } 489 insts = append(insts, root) 490 defer func() { 491 if err == nil { 492 return 493 } 494 for _, inst := range insts { 495 inst.Destroy() 496 } 497 }() 498 499 dialInfo := root.DialInfo() 500 dialInfo.Direct = true 501 dialInfo.Timeout = 60 * time.Second 502 503 session, err := root.DialDirect() 504 if err != nil { 505 return nil, fmt.Errorf("cannot dial root instance: %v", err) 506 } 507 defer session.Close() 508 509 logger.Infof("dialled root instance") 510 511 if err := replicaset.Initiate(session, root.Addr(), replicaSetName, nil); err != nil { 512 return nil, fmt.Errorf("cannot initiate replica set: %v", err) 513 } 514 var members []replicaset.Member 515 for i := 1; i < n; i++ { 516 inst, err := newMongoInstance() 517 if err != nil { 518 return nil, err 519 } 520 insts = append(insts, inst) 521 members = append(members, replicaset.Member{ 522 Address: inst.Addr(), 523 Priority: newFloat64(0.1), 524 Id: i + 1, 525 }) 526 } 527 // TODO(katco): 2016-08-09: lp:1611427 528 attempt := utils.AttemptStrategy{ 529 Total: 60 * time.Second, 530 Delay: 1 * time.Second, 531 } 532 for a := attempt.Start(); a.Next(); { 533 err := replicaset.Add(session, members...) 534 if err == nil { 535 break 536 } 537 logger.Errorf("cannot add members: %v", err) 538 if !a.HasNext() { 539 return nil, fmt.Errorf("timed out trying to add members") 540 } 541 logger.Errorf("retrying") 542 } 543 return insts, err 544 } 545 546 func newMongoInstance() (*gitjujutesting.MgoInstance, error) { 547 inst := &gitjujutesting.MgoInstance{Params: []string{"--replSet", replicaSetName}} 548 if err := inst.Start(testing.Certs); err != nil { 549 return nil, fmt.Errorf("cannot start mongo server: %s", err.Error()) 550 } 551 return inst, nil 552 } 553 554 func newFloat64(f float64) *float64 { 555 return &f 556 } 557 558 // connectionIsFatal returns a function suitable for passing 559 // as the isFatal argument to worker.NewRunner, 560 // that diagnoses an error as fatal if the connection 561 // has failed or if the error is otherwise fatal. 562 // Copied from jujud. 563 func connectionIsFatal(conn singular.Conn) func(err error) bool { 564 return func(err error) bool { 565 if err := conn.Ping(); err != nil { 566 logger.Infof("error pinging %T: %v", conn, err) 567 return true 568 } 569 logger.Infof("error %q is not fatal", err) 570 return false 571 } 572 }